diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6b34107386..e0afbb8bad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -143,6 +143,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Ensure the existence of `DDPPlugin._sync_dir` in `reconciliate_processes` ([#8939](https://github.com/PyTorchLightning/pytorch-lightning/pull/8939))
+
 - Restore original loaders if replaced by entrypoint ([#8885](https://github.com/PyTorchLightning/pytorch-lightning/pull/8885))
 
 - Fixed `trainer.fit_loop.split_idx` always returning `None` ([#8601](https://github.com/PyTorchLightning/pytorch-lightning/pull/8601))
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 8348a565fa..5c1caa16c1 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -19,6 +19,7 @@ import subprocess
 import sys
 import tempfile
 import time
+from pathlib import Path
 from time import sleep
 from typing import Any, Dict, List, Optional, Union
 
@@ -441,6 +442,11 @@ class DDPPlugin(ParallelPlugin):
 
         sync_dir = self._sync_dir
 
+        # The cluster may be configured to periodically purge the `/tmp`
+        # directory, in which case `sync_dir` may not exist anymore at this
+        # point. Idempotently create it to ensure its existence.
+        Path(sync_dir).mkdir(parents=True, exist_ok=True)
+
         # save a file locally.
         torch.save(True, os.path.join(sync_dir, f"{self.global_rank}.pl"))
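
As a sanity check, here is a minimal, dependency-free sketch of the behavior the patch relies on. The `pl_sync_demo` path and the `0.pl` marker name are hypothetical stand-ins for `DDPPlugin._sync_dir` and the per-rank marker files; `torch.save(True, ...)` is replaced by `touch` only to keep the sketch standalone.

import os
import shutil
import tempfile
from pathlib import Path

# Hypothetical stand-in for `DDPPlugin._sync_dir` (a directory under /tmp).
sync_dir = os.path.join(tempfile.gettempdir(), "pl_sync_demo")

# Simulate the cluster's periodic /tmp purge removing the directory.
shutil.rmtree(sync_dir, ignore_errors=True)

# The patched call: recreates the missing directory, and with exist_ok=True
# it is a no-op rather than a FileExistsError when the directory is already
# there (e.g. recreated first by another rank).
Path(sync_dir).mkdir(parents=True, exist_ok=True)
Path(sync_dir).mkdir(parents=True, exist_ok=True)  # idempotent: second call is harmless

# The per-rank marker write in `reconciliate_processes` now has a valid
# destination again.
Path(sync_dir, "0.pl").touch()
print(os.listdir(sync_dir))  # ['0.pl']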