fixed nccl init

This commit is contained in:
William Falcon 2019-07-12 16:16:46 -04:00
parent 960937ebe9
commit 7e37f68a5b
1 changed file with 2 additions and 0 deletions

View File

@@ -385,6 +385,8 @@ class Trainer(TrainerIO):
port = 12910 port = 12910
os.environ['MASTER_PORT'] = f'{port}' os.environ['MASTER_PORT'] = f'{port}'
sleep(self.proc_rank * 2)
root_node = os.environ['SLURM_NODELIST'].split(' ')[0] root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
os.environ['MASTER_ADDR'] = root_node os.environ['MASTER_ADDR'] = root_node
dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size) dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)