diff --git a/CHANGELOG.md b/CHANGELOG.md index cc6c395a3c..6355b86096 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -472,6 +472,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed an issue where `HorovodStrategy.teardown()` did not complete gracefully if an exception was thrown during callback setup ([#11752](https://github.com/PyTorchLightning/pytorch-lightning/pull/11752)) + - Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099)) diff --git a/pytorch_lightning/strategies/horovod.py b/pytorch_lightning/strategies/horovod.py index 3eca681add..bf21be4c74 100644 --- a/pytorch_lightning/strategies/horovod.py +++ b/pytorch_lightning/strategies/horovod.py @@ -197,8 +197,10 @@ class HorovodStrategy(ParallelStrategy): def teardown(self) -> None: super().teardown() - self._exit_stack.__exit__(None, None, None) - self._exit_stack = None + # teardown may be called before `_exit_stack` is set + if self._exit_stack: + self._exit_stack.__exit__(None, None, None) + self._exit_stack = None # Make sure all workers have finished training before returning to the user self.join() if self.root_device.type == "cuda":