From 5641b266d56d2324e1d9cb3ba12a60b47ab10558 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Sat, 24 Oct 2020 21:55:49 +0100
Subject: [PATCH] Bug/4319 ddp checkpoint (#4323)

* Broadcast best model path to ensure we sync with the main process + wait for the main process to save

* Add barrier call to ensure all processes are in sync

* Added changelog commit

* Move sync of best model path/score to model checkpoint, keep barrier to ensure all processes complete

* Ensure we broadcast as a tuple

* Add init check

* Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: ananthsub

* Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: ananthsub

* Removed model checkpoint code, added barrier to trainer to ensure we synchronize and wait for all processes to finish before completing training

* Add barrier within teardown call, removed horovod teardown to inherit from base accelerator

Co-authored-by: ananthsub
---
 CHANGELOG.md                                          | 1 +
 pytorch_lightning/accelerators/accelerator.py         | 3 ++-
 pytorch_lightning/accelerators/dp_accelerator.py      | 1 +
 pytorch_lightning/accelerators/horovod_accelerator.py | 3 ---
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6a173971c..0989524395 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
 
 ## [1.0.3] - 2020-10-20
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index d9b8adacf2..8e1969cc93 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -52,7 +52,8 @@ class Accelerator(object):
         pass
 
     def teardown(self):
-        pass
+        # Ensure, if necessary, that all processes are finished
+        self.barrier()
 
     def barrier(self, name: Optional[str] = None):
         pass

diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py
index dda19d0431..0a6eac607d 100644
--- a/pytorch_lightning/accelerators/dp_accelerator.py
+++ b/pytorch_lightning/accelerators/dp_accelerator.py
@@ -101,6 +101,7 @@ class DataParallelAccelerator(Accelerator):
     def teardown(self):
         # replace the original fwd function
         self.trainer.model.forward = self.model_autocast_original_forward
+        self.barrier()
 
     def training_step(self, args):
         if self.trainer.amp_backend == AMPType.NATIVE:

diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py
index 3267e1d7f1..91a5400999 100644
--- a/pytorch_lightning/accelerators/horovod_accelerator.py
+++ b/pytorch_lightning/accelerators/horovod_accelerator.py
@@ -107,9 +107,6 @@ class HorovodAccelerator(Accelerator):
         hvd.join()
         return results
 
-    def teardown(self):
-        pass
-
     def training_step(self, args):
         if self.trainer.on_gpu:
             batch = args[0]
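
Note on the pattern above (not part of the patch): under DDP, every process runs
teardown independently, so without a barrier a non-zero rank can exit while rank 0
is still writing the best checkpoint. Below is a minimal, self-contained sketch of
that pattern, assuming a plain torch.distributed setup. The DDPAccelerator class
and run_worker function are hypothetical names for illustration only, not
PyTorch Lightning's actual implementation.

import torch.distributed as dist

class Accelerator:
    """Base accelerator: teardown waits on a barrier by default."""

    def teardown(self):
        # Ensure, if necessary, that all processes are finished
        self.barrier()

    def barrier(self, name=None):
        # Single-process accelerators have nothing to wait for
        pass

class DDPAccelerator(Accelerator):
    """Hypothetical DDP accelerator backing the barrier with torch.distributed."""

    def barrier(self, name=None):
        # Block each rank until every rank arrives, so no worker shuts down
        # while rank 0 is still saving the best model checkpoint
        if dist.is_available() and dist.is_initialized():
            dist.barrier()

def run_worker(rank, accelerator):
    # ... training loop runs here; rank 0 saves checkpoints ...
    accelerator.teardown()  # all ranks meet here before exiting

This also explains the Horovod change: once its empty teardown override is
removed, it inherits the base class teardown and synchronizes through its own
barrier implementation.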