fixed nccl init

This commit is contained in:
William Falcon 2019-07-12 16:08:23 -04:00
parent a87784b4c5
commit 960937ebe9
1 changed file with 2 additions and 2 deletions

View File

@@ -380,13 +380,13 @@ class Trainer(TrainerIO):
:return:
"""
try:
port = os.environ['MASTER_PORT']
os.environ['MASTER_PORT']
except Exception as e:
port = 12910
os.environ['MASTER_PORT'] = f'{port}'
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
os.environ['MASTER_ADDR'] = root_node
os.environ['MASTER_PORT'] = f'{port}'
dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
def __run_pretrain_routine(self, model):