added multi-node locked ip search

This commit is contained in:
William Falcon 2019-07-08 12:59:10 -04:00
parent c0e3cb784a
commit a83d00456b
1 changed file with 0 additions and 6 deletions

View File

@@ -366,8 +366,6 @@ class Trainer(TrainerIO):
if nb_gpu_nodes == 1:
return 0, '127.0.0.1'
# on multi-node, every node rank > 0 waits until rank 0
# saves the ip to disk
ip_file = os.path.join(ip_file_dir, '.ip_meta')
@@ -376,10 +374,6 @@ class Trainer(TrainerIO):
my_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8')
my_ip = my_ip.split(' ')[0]
test_name = f'{os.getpid()}_{my_ip}'
ip_dir = os.path.join(ip_file_dir, '.ips', test_name)
os.makedirs(ip_dir, exist_ok=True)
# save the ip to the file
# block file so only one process can access at a time
ip_dir = os.path.join(ip_file_dir, '.ips', my_ip)