From 8552a911bf79ab58f1bfd50396d8304ffc8bdd06 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 8 Jul 2019 14:07:04 -0400
Subject: [PATCH] using slurm flag to find node nb

---
 pytorch_lightning/models/trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 4729985ca2..d07f52d5dc 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -329,7 +329,8 @@ class Trainer(TrainerIO):
         world_size = self.nb_gpu_nodes * len(self.data_parallel_device_ids)
 
         # set up server using proc 0's ip address
-        ip = self.__get_root_node_ip(self.proc_rank, self.nb_gpu_nodes, self.exp_save_path)
+        ip_tables_dir = os.path.join(self.cluster.log_path, 'ip_tables')
+        ip = self.__get_root_node_ip(self.proc_rank, self.nb_gpu_nodes, ip_tables_dir)
         dist.init_process_group("nccl", init_method=f'tcp://{ip}:12001', rank=self.proc_rank, world_size=world_size)
 
         print(f"GPU: {gpu_nb} - Rank: {self.proc_rank}")
@@ -358,7 +359,8 @@ class Trainer(TrainerIO):
         # the first gpu in the world becomes the host
         # this is based on its global rank
         # saves the ip to disk
-        ip_file = os.path.join(ip_file_dir, '.ip_meta')
+        ip_table_name = '.ip_meta_' + os.environ['SLURM_JOB_ID']
+        ip_file = os.path.join(ip_file_dir, ip_table_name)
         if world_gpu_nb == 0:
             # get the proc 0 IP
             root_ip = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE).stdout.decode('utf-8')
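
For reference, the pattern these two hunks implement is a file-based rendezvous: global rank 0 resolves its own IP and writes it to a per-job file (named with `SLURM_JOB_ID` so concurrent jobs sharing a log directory don't read each other's IP tables), and every other rank reads that file back before calling `dist.init_process_group`. Below is a minimal standalone sketch of that pattern, assuming a shared filesystem; the function name `get_root_node_ip`, the polling loop, and taking the first address from `hostname -I` are illustrative assumptions, not code from the patch.

```python
import os
import subprocess
import time


def get_root_node_ip(world_gpu_nb, nb_gpu_nodes, ip_file_dir):
    """Share the root node's IP via a file keyed by the SLURM job id.

    Hypothetical sketch of the rendezvous the patch sets up: rank 0
    publishes its IP; other ranks poll until it appears on disk.
    """
    # single-node runs can simply bind to localhost
    if nb_gpu_nodes == 1:
        return '127.0.0.1'

    # per-job file name, as in the patch: '.ip_meta_' + SLURM_JOB_ID
    ip_table_name = '.ip_meta_' + os.environ['SLURM_JOB_ID']
    ip_file = os.path.join(ip_file_dir, ip_table_name)
    os.makedirs(ip_file_dir, exist_ok=True)

    if world_gpu_nb == 0:
        # rank 0 resolves its own address ('hostname -I' may list
        # several; take the first) and publishes it to disk
        out = subprocess.run(['hostname', '-I'], stdout=subprocess.PIPE)
        root_ip = out.stdout.decode('utf-8').split(' ')[0].strip()
        with open(ip_file, 'w') as f:
            f.write(root_ip)
        return root_ip

    # other ranks wait until rank 0's IP shows up on the shared
    # filesystem (re-check contents in case the file is still empty)
    while True:
        if os.path.exists(ip_file):
            with open(ip_file) as f:
                root_ip = f.read().strip()
            if root_ip:
                return root_ip
        time.sleep(1)
```

In the trainer code above, the returned address then feeds `init_method=f'tcp://{ip}:12001'` when initializing the NCCL process group.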