Add check for uninitialized _sync_dir in DDP Plugin to avoid errors during error handling (#9267)
commit 69cdb79e33 (parent 071ae49808)
@@ -271,6 +271,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed bug where data-loading functions were not getting the correct running stage passed ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858))
+- Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267))
 
 ## [1.4.5] - 2021-08-31
 
 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))
@@ -375,6 +375,9 @@ class DDPPlugin(ParallelPlugin):
         return [self.root_device.index]
 
     def pre_dispatch(self):
+        # share ddp pids to all processes
+        self._share_information_to_prevent_deadlock()
+
         # move the model to the correct device
         self.model_to_device()
 
@@ -386,9 +389,6 @@
         if trainer_fn == TrainerFn.FITTING:
             self.configure_ddp()
 
-        # share ddp pids to all processes
-        self._share_information_to_prevent_deadlock()
-
     def post_dispatch(self) -> None:
         self.cluster_environment.teardown()
 
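For context on the two hunks above: the deadlock-prevention step has each rank exchange its PID (and a shared sync directory) with its peers, so that a surviving rank can later reconcile a crash. The call is moved to the top of pre_dispatch so it runs before the model is placed on the device, instead of only after configure_ddp during fitting. The toy sketch below illustrates the general idea only; it is not the library's implementation, and share_information and the broadcast helper are hypothetical stand-ins:

import os
import tempfile
from typing import Callable, List, Tuple

def share_information(broadcast: Callable, global_rank: int, world_size: int) -> Tuple[List[int], str]:
    # Toy sketch (hypothetical names): every rank learns each peer's PID
    # plus a shared sync directory, so a surviving rank can later detect
    # a crashed peer and clean up after it.
    pids = [broadcast(os.getpid(), src=rank) for rank in range(world_size)]
    # rank 0 chooses the sync directory; the other ranks receive it
    sync_dir = broadcast(tempfile.mkdtemp() if global_rank == 0 else None, src=0)
    return pids, sync_dir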
@@ -489,6 +489,10 @@ class DDPPlugin(ParallelPlugin):
 
         sync_dir = self._sync_dir
 
+        if not sync_dir:
+            rank_zero_warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.")
+            return
+
         # The cluster may be configured to periodically purge the `/tmp`
         # directory, in which case `sync_dir` may not exist anymore at this
         # point. Idempotently create it to ensure its existence.
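Read as a whole, the new guard turns the reconciliation path into a warn-and-return no-op when deadlock detection was never initialized, for example when the trainer fails before _share_information_to_prevent_deadlock has run. A minimal standalone sketch of the pattern, with the function name and the directory-creation step assumed for illustration:

import os
import warnings
from typing import Optional

def reconcile(sync_dir: Optional[str]) -> None:
    # Sketch of the guarded check: `sync_dir` mirrors the plugin's
    # `_sync_dir`, which is only populated once deadlock detection has
    # been set up. Without the guard, error handling itself could raise.
    if not sync_dir:
        warnings.warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.")
        return
    # The cluster may purge `/tmp`, so idempotently (re)create the directory.
    os.makedirs(sync_dir, exist_ok=True)

In the plugin itself the warning goes through rank_zero_warn, as shown in the hunk above.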