From c2987d3b4071748f678df2d1d9e76b701472e710 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 8 Jul 2019 12:51:07 -0400 Subject: [PATCH] added multi-node locked ip search --- pytorch_lightning/models/trainer.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 09c44e674c..5f6a207a4c 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -371,23 +371,18 @@ class Trainer(TrainerIO): # every process writes their process id + ip to a shared file my_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8') my_ip = my_ip.split(' ')[0] - to_write = f'{my_ip}\n' # save the ip to the file # block file so only one process can access at a time - with open(file=ip_file, mode='a') as f: - fcntl.flock(f, fcntl.LOCK_EX) - f.write(to_write) - fcntl.flock(f, fcntl.LOCK_UN) + ip_dir = os.path.join(ip_file_dir, '.ips', my_ip) + os.makedirs(ip_dir, exist_ok=True) # now everyone waits until the file has world_size entries for i in range(0, 120): sleep(1.0) - print('sleeping...') - if os.path.exists(ip_file): - lines = list(open(file=ip_file, mode='r')) - if len(lines) == world_size: - break + nb_folders = [x for x in os.listdir(os.path.join(ip_file_dir, '.ips')) if '.' in x] + if nb_folders == nb_gpu_nodes: + break # the ip_table is written at this point # now every process reads it and decides what rank they are based on their node @@ -396,7 +391,6 @@ class Trainer(TrainerIO): return my_node_rank, root_ip - def __determine_my_node_rank(self, ip_pid_lines, my_ip): # the node rank is the index of my_ip in the world-size ip table # when de-duped and sorted