diff --git a/CHANGELOG.md b/CHANGELOG.md
index 35ef6fb59b..7bea96a383 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -271,6 +271,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed bug where data-loading functions where not getting the correct running stage passed ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858))
 
 
+- Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267))
+
+
 ## [1.4.5] - 2021-08-31
 
 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 2396670a49..143518262b 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -375,6 +375,9 @@ class DDPPlugin(ParallelPlugin):
         return [self.root_device.index]
 
     def pre_dispatch(self):
+        # share ddp pids to all processes
+        self._share_information_to_prevent_deadlock()
+
         # move the model to the correct device
         self.model_to_device()
 
@@ -386,9 +389,6 @@ class DDPPlugin(ParallelPlugin):
         if trainer_fn == TrainerFn.FITTING:
             self.configure_ddp()
 
-        # share ddp pids to all processes
-        self._share_information_to_prevent_deadlock()
-
     def post_dispatch(self) -> None:
         self.cluster_environment.teardown()
 
@@ -489,6 +489,10 @@ class DDPPlugin(ParallelPlugin):
 
         sync_dir = self._sync_dir
 
+        if not sync_dir:
+            rank_zero_warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.")
+            return
+
         # The cluster may be configured to periodically purge the `/tmp`
         # directory, in which case `sync_dir` may not exist anymore at this
         # point. Idempotently create it to ensure its existence.