From 960937ebe96ca6155d861b8949046bdcd78d3f22 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 12 Jul 2019 16:08:23 -0400 Subject: [PATCH] fixed nccl init --- pytorch_lightning/models/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index d206f0931a..b716e6592a 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -380,13 +380,13 @@ class Trainer(TrainerIO): :return: """ try: - port = os.environ['MASTER_PORT'] + os.environ['MASTER_PORT'] except Exception as e: port = 12910 + os.environ['MASTER_PORT'] = f'{port}' root_node = os.environ['SLURM_NODELIST'].split(' ')[0] os.environ['MASTER_ADDR'] = root_node - os.environ['MASTER_PORT'] = f'{port}' dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) def __run_pretrain_routine(self, model):