using slurm flag to find node number

This commit is contained in:
William Falcon 2019-07-08 14:07:04 -04:00
parent 94da5431cd
commit 8552a911bf
1 changed file with 4 additions and 2 deletions

View File

@ -329,7 +329,8 @@ class Trainer(TrainerIO):
world_size = self.nb_gpu_nodes * len(self.data_parallel_device_ids)
# set up server using proc 0's ip address
ip = self.__get_root_node_ip(self.proc_rank, self.nb_gpu_nodes, self.exp_save_path)
ip_tables_dir = os.path.join(self.cluster.log_path, 'ip_tables')
ip = self.__get_root_node_ip(self.proc_rank, self.nb_gpu_nodes, ip_tables_dir)
dist.init_process_group("nccl", init_method=f'tcp://{ip}:12001', rank=self.proc_rank, world_size=world_size)
print(f"GPU: {gpu_nb} - Rank: {self.proc_rank}")
@ -358,7 +359,8 @@ class Trainer(TrainerIO):
# the first gpu in the world becomes the host
# this is based on its global rank
# saves the ip to disk
ip_file = os.path.join(ip_file_dir, '.ip_meta')
ip_table_name = f'.ip_meta_' + os.environ['SLURM_JOB_ID']
ip_file = os.path.join(ip_file_dir, ip_table_name)
if world_gpu_nb == 0:
# get the proc 0 IP
root_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8')