diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index f42357c264..086084e643 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -352,7 +352,10 @@ class Trainer(TrainerIO): print('SLURM ROOT NODE: ', root_node) print('-'*100) - dist.init_process_group("nccl", init_method=f'env://{root_node}:12007', rank=self.proc_rank, + os.environ['MASTER_ADDR'] = root_node + os.environ['MASTER_PORT'] = '12006' + + dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) # self.__init_tcp_connection(ip_file_dir)