Fixed NCCL init
This commit is contained in:
parent
5812efcf24
commit
a87784b4c5
|
@ -371,7 +371,7 @@ class Trainer(TrainerIO):
|
||||||
# continue training routine
|
# continue training routine
|
||||||
self.__run_pretrain_routine(model)
|
self.__run_pretrain_routine(model)
|
||||||
|
|
||||||
def __init_tcp_connection(self, port=12945):
|
def __init_tcp_connection(self):
|
||||||
"""
|
"""
|
||||||
Connect all procs in the world using the env:// init
|
Connect all procs in the world using the env:// init
|
||||||
Use the first node as the root address
|
Use the first node as the root address
|
||||||
|
@ -379,6 +379,10 @@ class Trainer(TrainerIO):
|
||||||
:param tries:
|
:param tries:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
port = os.environ['MASTER_PORT']
|
||||||
|
except Exception as e:
|
||||||
|
port = 12910
|
||||||
|
|
||||||
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
|
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
|
||||||
os.environ['MASTER_ADDR'] = root_node
|
os.environ['MASTER_ADDR'] = root_node
|
||||||
|
|
Loading…
Reference in New Issue