From f46a7bae775d4a71275db4f525dc1cf8e5c375c0 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 25 May 2020 15:59:32 -0400
Subject: [PATCH] updated docs (#1941)

---
 pytorch_lightning/trainer/trainer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 25ecd54359..eefbfe1a0d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -842,7 +842,10 @@ class Trainer(
         # route to appropriate start method
         # when using multi-node or DDP within a node start each module in a separate process
         if self.use_ddp2:
-            task = int(os.environ['SLURM_LOCALID'])
+            if self.is_slurm_managing_tasks:
+                task = int(os.environ['SLURM_LOCALID'])
+            elif 'WORLD_SIZE' in os.environ and 'GROUP_RANK' in os.environ:
+                task = int(os.environ['LOCAL_RANK'])
             self.ddp_train(task, model)
         elif self.use_ddp:
             if self.is_slurm_managing_tasks: