From 8b0cda84e70e573c16eb69eae5cae60a910e209c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Jul 2019 10:13:52 -0400 Subject: [PATCH] added fallback local init --- pytorch_lightning/models/trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index c7bbc1a678..8fc4d5ee66 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -378,10 +378,14 @@ class Trainer(TrainerIO): port = 12910 os.environ['MASTER_PORT'] = f'{port}' - sleep(self.proc_rank*0.5) + try: + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] + except Exception as e: + root_node = '127.0.0.2' - root_node = os.environ['SLURM_NODELIST'].split(' ')[0] os.environ['MASTER_ADDR'] = root_node + + sleep(self.proc_rank*0.5) dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) def __run_pretrain_routine(self, model):