From 476911d60cc7986e5507d26ebaccb12827fb5cee Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Thu, 18 Jun 2020 00:19:06 -0400
Subject: [PATCH] Pid port + duplicate rank_zero logging (#2231)

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

Co-authored-by: Zhaofeng Wu
---
 pytorch_lightning/trainer/distrib_data_parallel.py | 10 ++++++----
 pytorch_lightning/trainer/trainer.py               | 11 ++++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 53a47c130d..3b94c9ae3b 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -372,14 +372,16 @@ class TrainerDDPMixin(ABC):
     def __set_random_port(self):
         """
         When running DDP NOT managed by SLURM, the ports might collide
-        :return:
         """
         try:
             default_port = os.environ['MASTER_PORT']
         except Exception:
-            import random
-            default_port = random.randint(10000, 19000)
-            os.environ['MASTER_PORT'] = str(default_port)
+            # use the process id as a seed to a generator for port only
+            pid = os.getpid()
+            rng1 = np.random.RandomState(pid)
+            default_port = rng1.randint(10000, 19999, 1)[0]
+
+            os.environ['MASTER_PORT'] = str(default_port)

     def spawn_ddp_children(self, model):
         self.__set_random_port()
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d41f5ff3fd..0f76c07229 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -32,7 +32,7 @@ from pytorch_lightning.trainer.training_loop import TrainerTrainLoopMixin
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info
+from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info, rank_zero_only

 try:
     from apex import amp
@@ -322,6 +322,14 @@ class Trainer(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

+        # init the default rank if exists
+        # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
+        # this way we only show it on rank 0
+        if 'LOCAL_RANK' in os.environ:
+            rank_zero_only.rank = os.environ['LOCAL_RANK']
+        if 'SLURM_JOB_ID' in os.environ:
+            rank_zero_only.rank = os.environ['SLURM_JOB_ID']
+
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []
@@ -892,6 +900,7 @@ class Trainer(
             mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))

         elif self.distributed_backend == 'ddp_spawn':
+            self.__set_random_port()
             model.share_memory()

             # spin up peers
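
Note: the core idea of the port fix above is that the parent process derives MASTER_PORT deterministically from its own PID before spawning DDP workers, so every child inherits the same value through the environment. A minimal standalone sketch of that technique (not Lightning's API; the names below are illustrative, and it assumes numpy is installed):

    import os

    import numpy as np


    def set_random_port():
        """Pick MASTER_PORT from a PID-seeded RNG if the user did not set one."""
        if 'MASTER_PORT' not in os.environ:
            pid = os.getpid()
            rng = np.random.RandomState(pid)        # same PID -> same port
            port = rng.randint(10000, 19999, 1)[0]  # numpy randint: high bound is exclusive
            os.environ['MASTER_PORT'] = str(port)


    if __name__ == '__main__':
        set_random_port()
        print(os.getpid(), os.environ['MASTER_PORT'])

Seeding with the PID keeps the choice deterministic for a given parent process while making port collisions between concurrently launched parents unlikely; calling this before mp.spawn means all spawned workers read the same MASTER_PORT from the inherited environment.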