Fixed NCCL init: pass the process rank explicitly to dist.init_process_group
This commit is contained in:
parent
dc87a4fc91
commit
e82014ec6c
|
@ -383,8 +383,8 @@ class Trainer(TrainerIO):
|
||||||
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
|
root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
|
||||||
os.environ['MASTER_ADDR'] = root_node
|
os.environ['MASTER_ADDR'] = root_node
|
||||||
os.environ['MASTER_PORT'] = f'{port}'
|
os.environ['MASTER_PORT'] = f'{port}'
|
||||||
# dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
|
dist.init_process_group("nccl", rank=self.proc_rank)
|
||||||
dist.init_process_group("nccl")
|
# dist.init_process_group("nccl")
|
||||||
|
|
||||||
def __run_pretrain_routine(self, model):
|
def __run_pretrain_routine(self, model):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue