From 5e033fd97a628b00dff143fe5f970be06517ebc5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 12 Jul 2019 13:11:08 -0400 Subject: [PATCH] testing env init --- pytorch_lightning/models/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index f42357c264..086084e643 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -352,7 +352,10 @@ class Trainer(TrainerIO): print('SLURM ROOT NODE: ', root_node) print('-'*100) - dist.init_process_group("nccl", init_method=f'env://{root_node}:12007', rank=self.proc_rank, + os.environ['MASTER_ADDR'] = root_node + os.environ['MASTER_PORT'] = '12006' + + dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) # self.__init_tcp_connection(ip_file_dir)