Add log output for slurm (#1657)
* add log output for slurm
* change log levels
* formatting

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
This commit is contained in:
parent
8d564b5e38
commit
f9c9e39ab8
|
@ -930,10 +930,12 @@ class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
|
|||
if 'MASTER_ADDR' not in os.environ:
|
||||
log.warning("MASTER_ADDR environment variable is not defined. Set as localhost")
|
||||
os.environ['MASTER_ADDR'] = '127.0.0.1'
|
||||
log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
|
||||
|
||||
if 'MASTER_PORT' not in os.environ:
|
||||
log.warning("MASTER_PORT environment variable is not defined. Set as 12910")
|
||||
os.environ['MASTER_PORT'] = '12910'
|
||||
log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")
|
||||
|
||||
if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size:
|
||||
log.warning("WORLD_SIZE environment variable is not equal to the computed "
|
||||
|
|
|
@ -277,6 +277,10 @@ class TrainerDDPMixin(ABC):
|
|||
except Exception as e:
|
||||
pass
|
||||
|
||||
# notify the user that slurm is managing tasks
|
||||
if self.is_slurm_managing_tasks:
|
||||
log.info('Multi-processing is handled by Slurm.')
|
||||
|
||||
def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
|
||||
if data_parallel_device_ids is None:
|
||||
return
|
||||
|
@ -293,7 +297,7 @@ class TrainerDDPMixin(ABC):
|
|||
gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str
|
||||
|
||||
log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
|
||||
log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
|
||||
|
||||
def ddp_train(self, process_idx, model):
|
||||
"""
|
||||
|
|
|
@ -215,7 +215,7 @@ class TrainerIOMixin(ABC):
|
|||
if result == 0:
|
||||
log.info(f'requeued exp {job_id}')
|
||||
else:
|
||||
log.info('requeue failed...')
|
||||
log.warning('requeue failed...')
|
||||
|
||||
# close experiment to avoid issues
|
||||
self.logger.close()
|
||||
|
|
Loading…
Reference in New Issue