From 6c02afefca01ab8a92c39f971b964f3dc33b53fc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 12 Jul 2019 15:55:28 -0400 Subject: [PATCH] fixed nccl init --- pytorch_lightning/models/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index a6d8e1e8e4..3bf2c1e888 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -379,7 +379,10 @@ class Trainer(TrainerIO): :param tries: :return: """ - sleep(self.proc_rank*2) + # HACK: delay every non-zero rank by a fixed 10 s to stop NCCL from throwing an error at init; this appears to be an NCCL race condition + if self.proc_rank > 0: + sleep(10.0) + root_node = os.environ['SLURM_NODELIST'].split(' ')[0] os.environ['MASTER_ADDR'] = root_node os.environ['MASTER_PORT'] = f'{port}'