diff --git a/CHANGELOG.md b/CHANGELOG.md index f6a173971c..0989524395 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) ## [1.0.3] - 2020-10-20 diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index d9b8adacf2..8e1969cc93 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -52,7 +52,8 @@ class Accelerator(object): pass def teardown(self): - pass + # Ensure if necessary all processes are finished + self.barrier() def barrier(self, name: Optional[str] = None): pass diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index dda19d0431..0a6eac607d 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -101,6 +101,7 @@ class DataParallelAccelerator(Accelerator): def teardown(self): # replace the original fwd function self.trainer.model.forward = self.model_autocast_original_forward + self.barrier() def training_step(self, args): if self.trainer.amp_backend == AMPType.NATIVE: diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 3267e1d7f1..91a5400999 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -107,9 +107,6 @@ class HorovodAccelerator(Accelerator): hvd.join() return results - def teardown(self): - pass - def training_step(self, args): if self.trainer.on_gpu: batch = args[0]