Add log output for slurm (#1657)

* add log output for slurm

* change log levels

* formatting

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Authored by Jacob Zhong on 2020-04-30 07:58:03 -04:00, committed by GitHub
parent 8d564b5e38
commit f9c9e39ab8
3 changed files with 8 additions and 2 deletions
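
The commit adds debug-level log lines for the resolved MASTER_ADDR, MASTER_PORT and CUDA_VISIBLE_DEVICES values, an info message when SLURM is managing the processes, and raises the failed-requeue message from info to warning. The new debug messages only appear if the logging level is lowered; a minimal sketch, assuming Lightning's modules log through the standard logging package under the 'pytorch_lightning' logger name (the usual logging.getLogger(__name__) pattern):

import logging

# Lower the level of the package logger so the new debug-level lines
# (MASTER_ADDR, MASTER_PORT, CUDA_VISIBLE_DEVICES) are actually emitted.
# The 'pytorch_lightning' logger name is an assumption, not confirmed by this diff.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger('pytorch_lightning').setLevel(logging.DEBUG)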


@@ -930,10 +930,12 @@ class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
         if 'MASTER_ADDR' not in os.environ:
             log.warning("MASTER_ADDR environment variable is not defined. Set as localhost")
             os.environ['MASTER_ADDR'] = '127.0.0.1'
+        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
 
         if 'MASTER_PORT' not in os.environ:
             log.warning("MASTER_PORT environment variable is not defined. Set as 12910")
             os.environ['MASTER_PORT'] = '12910'
+        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")
 
         if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size:
             log.warning("WORLD_SIZE environment variable is not equal to the computed "


@@ -277,6 +277,10 @@ class TrainerDDPMixin(ABC):
         except Exception as e:
             pass
 
+        # notify user that slurm is managing tasks
+        if self.is_slurm_managing_tasks:
+            log.info('Multi-processing is handled by Slurm.')
+
     def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         if data_parallel_device_ids is None:
             return
@@ -293,7 +297,7 @@ class TrainerDDPMixin(ABC):
         gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
         os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str
 
-        log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
+        log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
 
     def ddp_train(self, process_idx, model):
         """


@@ -215,7 +215,7 @@ class TrainerIOMixin(ABC):
             if result == 0:
                 log.info(f'requeued exp {job_id}')
             else:
-                log.info('requeue failed...')
+                log.warning('requeue failed...')
 
             # close experiment to avoid issues
             self.logger.close()
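
For context, the requeue messages in the last hunk belong to SLURM auto-resubmission: when SLURM signals the job before its walltime limit, the trainer saves a checkpoint and requeues the job with scontrol. A minimal, hypothetical sketch of that pattern (not the Trainer's exact code; the signal choice and sbatch flag are assumptions):

import os
import signal
from subprocess import call

def sig_handler(signum, frame):
    # Hypothetical SLURM auto-requeue handler: resubmit the current job
    # allocation via `scontrol requeue` and report the outcome.
    job_id = os.environ['SLURM_JOB_ID']
    result = call(f'scontrol requeue {job_id}', shell=True)
    if result == 0:
        print(f'requeued exp {job_id}')
    else:
        print('requeue failed...')

# Assumes the job was submitted with something like `#SBATCH --signal=SIGUSR1@90`,
# so SLURM delivers SIGUSR1 shortly before the time limit.
signal.signal(signal.SIGUSR1, sig_handler)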