added slurm no process warning

parent 53a1a6d462
commit ad44d9168b

@@ -374,9 +374,23 @@ class Trainer(TrainerIO):
         if self.use_ddp:
             # must copy only the meta of the exp so it survives pickle/unpickle when going to new process
             self.experiment = self.experiment.get_meta_copy()
-            task = int(os.environ['SLURM_LOCALID'])
-            self.ddp_train(task, model)
-            # mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
+            # whenever we have the correct number of tasks, we let slurm manage processes
+            # otherwise we launch the required number of processes
+            nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
+            nb_requested_gpus = len(self.data_parallel_device_ids)
+            is_slurm_managing_tasks = nb_slurm_tasks == nb_requested_gpus
+
+            if is_slurm_managing_tasks:
+                task = int(os.environ['SLURM_LOCALID'])
+                self.ddp_train(task, model)
+            else:
+                msg = f"""
+                You requested {nb_requested_gpus} GPUs but launched {nb_slurm_tasks} slurm tasks.
+                We will launch {nb_requested_gpus} processes for you.
+                We recommend you let slurm manage the processes by setting: --ntasks-per-node={nb_requested_gpus}
+                """
+                warnings.warn(msg)
+                mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
 
         # 1 gpu or dp option triggers training using DP module
         # easier to avoid NCCL issues
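For context, the branch this commit introduces can be read as a standalone sketch. Everything below is illustrative, not the Trainer code: the `train(rank, model)` function, the `launch` helper, and the `requested_gpus` list are assumptions; only the `SLURM_NTASKS`/`SLURM_LOCALID` environment variables, the task-count comparison, and the `torch.multiprocessing.spawn` fallback mirror the diff.

import os
import warnings

import torch.multiprocessing as mp


def train(rank, model):
    # hypothetical per-process entry point, standing in for Trainer.ddp_train
    print(f'process {rank} training {model}')


def launch(model, requested_gpus):
    # outside slurm SLURM_NTASKS is unset; default to 0 so the mismatch branch runs
    nb_slurm_tasks = int(os.environ.get('SLURM_NTASKS', '0'))
    nb_requested_gpus = len(requested_gpus)

    if nb_slurm_tasks == nb_requested_gpus:
        # slurm already launched one task per GPU; each task trains its local rank
        task = int(os.environ['SLURM_LOCALID'])
        train(task, model)
    else:
        # task count mismatch: warn, then spawn the processes ourselves.
        # mp.spawn invokes train(rank, model) once per rank in [0, nprocs)
        warnings.warn(
            f'You requested {nb_requested_gpus} GPUs but launched '
            f'{nb_slurm_tasks} slurm tasks. We will launch '
            f'{nb_requested_gpus} processes for you. We recommend you let '
            f'slurm manage the processes by setting: '
            f'--ntasks-per-node={nb_requested_gpus}'
        )
        mp.spawn(train, nprocs=nb_requested_gpus, args=(model,))


if __name__ == '__main__':
    launch(model='toy-model', requested_gpus=[0, 1])

As the warning suggests, the slurm-managed path is reached by making the task count match the GPU count in the submission script, e.g. pairing `#SBATCH --gres=gpu:2` with `#SBATCH --ntasks-per-node=2` for a single-node two-GPU job, so that `SLURM_NTASKS` equals the number of requested GPUs and each task handles one rank.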