Fixed NCCL init

This commit is contained in:
William Falcon 2019-07-12 16:07:57 -04:00
parent 5812efcf24
commit a87784b4c5
1 changed file with 5 additions and 1 deletion

View File

@ -371,7 +371,7 @@ class Trainer(TrainerIO):
# continue training routine
self.__run_pretrain_routine(model)
def __init_tcp_connection(self, port=12945):
def __init_tcp_connection(self):
"""
Connect all procs in the world using the env:// init
Use the first node as the root address
@ -379,6 +379,10 @@ class Trainer(TrainerIO):
:param tries:
:return:
"""
try:
port = os.environ['MASTER_PORT']
except Exception as e:
port = 12910
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
os.environ['MASTER_ADDR'] = root_node