Add log output for slurm (#1657)

* add log output for slurm

* change log levels

* formatting

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Authored by Jacob Zhong on 2020-04-30 07:58:03 -04:00, committed by GitHub
parent 8d564b5e38
commit f9c9e39ab8
3 changed files with 8 additions and 2 deletions
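
The commit adds debug-level log lines for the resolved MASTER_ADDR, MASTER_PORT and CUDA_VISIBLE_DEVICES values, an info message when SLURM is managing the processes, and raises the failed-requeue message from info to warning. The new debug messages only appear if the logging level is lowered; a minimal sketch, assuming Lightning's modules log through the standard logging package under the 'pytorch_lightning' logger name (the usual logging.getLogger(__name__) pattern):

import logging

# Lower the level of the package logger so the new debug-level lines
# (MASTER_ADDR, MASTER_PORT, CUDA_VISIBLE_DEVICES) are actually emitted.
# The 'pytorch_lightning' logger name is an assumption, not confirmed by this diff.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger('pytorch_lightning').setLevel(logging.DEBUG)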


@@ -930,10 +930,12 @@ class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
         if 'MASTER_ADDR' not in os.environ:
             log.warning("MASTER_ADDR environment variable is not defined. Set as localhost")
             os.environ['MASTER_ADDR'] = '127.0.0.1'
+        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
 
         if 'MASTER_PORT' not in os.environ:
             log.warning("MASTER_PORT environment variable is not defined. Set as 12910")
             os.environ['MASTER_PORT'] = '12910'
+        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")
 
         if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size:
             log.warning("WORLD_SIZE environment variable is not equal to the computed "


@@ -277,6 +277,10 @@ class TrainerDDPMixin(ABC):
         except Exception as e:
             pass
 
+        # notify user that slurm is managing tasks
+        if self.is_slurm_managing_tasks:
+            log.info('Multi-processing is handled by Slurm.')
+
     def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         if data_parallel_device_ids is None:
             return
@@ -293,7 +297,7 @@ class TrainerDDPMixin(ABC):
         gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
         os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str
 
-        log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
+        log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
 
     def ddp_train(self, process_idx, model):
         """


@@ -215,7 +215,7 @@ class TrainerIOMixin(ABC):
             if result == 0:
                 log.info(f'requeued exp {job_id}')
             else:
-                log.info('requeue failed...')
+                log.warning('requeue failed...')
 
             # close experiment to avoid issues
             self.logger.close()
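
For context, the requeue messages in the last hunk belong to SLURM auto-resubmission: when SLURM signals the job before its walltime limit, the trainer saves a checkpoint and requeues the job with scontrol. A minimal, hypothetical sketch of that pattern (not the Trainer's exact code; the signal choice and sbatch flag are assumptions):

import os
import signal
from subprocess import call

def sig_handler(signum, frame):
    # Hypothetical SLURM auto-requeue handler: resubmit the current job
    # allocation via `scontrol requeue` and report the outcome.
    job_id = os.environ['SLURM_JOB_ID']
    result = call(f'scontrol requeue {job_id}', shell=True)
    if result == 0:
        print(f'requeued exp {job_id}')
    else:
        print('requeue failed...')

# Assumes the job was submitted with something like `#SBATCH --signal=SIGUSR1@90`,
# so SLURM delivers SIGUSR1 shortly before the time limit.
signal.signal(signal.SIGUSR1, sig_handler)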