diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 09c44e674c..5f6a207a4c 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -371,23 +371,18 @@ class Trainer(TrainerIO): # every process writes their process id + ip to a shared file my_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8') my_ip = my_ip.split(' ')[0] - to_write = f'{my_ip}\n' # save the ip to the file # block file so only one process can access at a time - with open(file=ip_file, mode='a') as f: - fcntl.flock(f, fcntl.LOCK_EX) - f.write(to_write) - fcntl.flock(f, fcntl.LOCK_UN) + ip_dir = os.path.join(ip_file_dir, '.ips', my_ip) + os.makedirs(ip_dir, exist_ok=True) # now everyone waits until the file has world_size entries for i in range(0, 120): sleep(1.0) - print('sleeping...') - if os.path.exists(ip_file): - lines = list(open(file=ip_file, mode='r')) - if len(lines) == world_size: - break + nb_folders = [x for x in os.listdir(os.path.join(ip_file_dir, '.ips')) if '.' in x] + if nb_folders == nb_gpu_nodes: + break # the ip_table is written at this point # now every process reads it and decides what rank they are based on their node @@ -396,7 +391,6 @@ class Trainer(TrainerIO): return my_node_rank, root_ip - def __determine_my_node_rank(self, ip_pid_lines, my_ip): # the node rank is the index of my_ip in the world-size ip table # when de-duped and sorted