From 7e37f68a5b6aa58ec4cdf4abe7d2e3295e418cf8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 12 Jul 2019 16:16:46 -0400 Subject: [PATCH] fixed nccl init --- pytorch_lightning/models/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index b716e6592a..446a9e6e36 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -385,6 +385,8 @@ class Trainer(TrainerIO): port = 12910 os.environ['MASTER_PORT'] = f'{port}' + sleep(self.proc_rank * 2) + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] os.environ['MASTER_ADDR'] = root_node dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)