From 5812efcf24ea8ce693328b8135b63fe1f983fd28 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 12 Jul 2019 16:05:46 -0400 Subject: [PATCH] fixed nccl init --- pytorch_lightning/models/trainer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 58f6b71dcc..ab8dbdb47b 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -371,7 +371,7 @@ class Trainer(TrainerIO): # continue training routine self.__run_pretrain_routine(model) - def __init_tcp_connection(self, port=12975): + def __init_tcp_connection(self, port=12945): """ Connect all procs in the world using the env:// init Use the first node as the root address @@ -383,8 +383,7 @@ class Trainer(TrainerIO): root_node = os.environ['SLURM_NODELIST'].split(' ')[0] os.environ['MASTER_ADDR'] = root_node os.environ['MASTER_PORT'] = f'{port}' - dist.init_process_group("nccl", rank=self.proc_rank) - # dist.init_process_group("nccl") + dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) def __run_pretrain_routine(self, model): """