From 476911d60cc7986e5507d26ebaccb12827fb5cee Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Thu, 18 Jun 2020 00:19:06 -0400
Subject: [PATCH] Pid port + duplicate rank_zero logging (#2231)

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

* init the port using a seed that matches process id for ddp

Co-authored-by: Zhaofeng Wu
---
 pytorch_lightning/trainer/distrib_data_parallel.py | 10 ++++++----
 pytorch_lightning/trainer/trainer.py               | 11 ++++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 53a47c130d..3b94c9ae3b 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -372,14 +372,16 @@ class TrainerDDPMixin(ABC):
     def __set_random_port(self):
         """
         When running DDP NOT managed by SLURM, the ports might collide
-        :return:
         """
         try:
             default_port = os.environ['MASTER_PORT']
         except Exception:
-            import random
-            default_port = random.randint(10000, 19000)
-            os.environ['MASTER_PORT'] = str(default_port)
+            # use the process id as a seed to a generator for port only
+            pid = os.getpid()
+            rng1 = np.random.RandomState(pid)
+            default_port = rng1.randint(10000, 19999, 1)[0]
+
+            os.environ['MASTER_PORT'] = str(default_port)

     def spawn_ddp_children(self, model):
         self.__set_random_port()
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d41f5ff3fd..0f76c07229 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -32,7 +32,7 @@ from pytorch_lightning.trainer.training_loop import TrainerTrainLoopMixin
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info
+from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info, rank_zero_only

 try:
     from apex import amp
@@ -322,6 +322,14 @@ class Trainer(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

+        # init the default rank if exists
+        # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
+        # this way we only show it on rank 0
+        if 'LOCAL_RANK' in os.environ:
+            rank_zero_only.rank = os.environ['LOCAL_RANK']
+        if 'SLURM_JOB_ID' in os.environ:
+            rank_zero_only.rank = os.environ['SLURM_JOB_ID']
+
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []
@@ -892,6 +900,7 @@ class Trainer(
             mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))

         elif self.distributed_backend == 'ddp_spawn':
+            self.__set_random_port()
             model.share_memory()

             # spin up peers
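
Note: the core idea of the port fix above is that the parent process derives MASTER_PORT deterministically from its own PID before spawning DDP workers, so every child inherits the same value through the environment. A minimal standalone sketch of that technique (not Lightning's API; the names below are illustrative, and it assumes numpy is installed):

    import os

    import numpy as np


    def set_random_port():
        """Pick MASTER_PORT from a PID-seeded RNG if the user did not set one."""
        if 'MASTER_PORT' not in os.environ:
            pid = os.getpid()
            rng = np.random.RandomState(pid)        # same PID -> same port
            port = rng.randint(10000, 19999, 1)[0]  # numpy randint: high bound is exclusive
            os.environ['MASTER_PORT'] = str(port)


    if __name__ == '__main__':
        set_random_port()
        print(os.getpid(), os.environ['MASTER_PORT'])

Seeding with the PID keeps the choice deterministic for a given parent process while making port collisions between concurrently launched parents unlikely; calling this before mp.spawn means all spawned workers read the same MASTER_PORT from the inherited environment.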