From 5641b266d56d2324e1d9cb3ba12a60b47ab10558 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Sat, 24 Oct 2020 21:55:49 +0100
Subject: [PATCH] Bug/4319 ddp checkpoint (#4323)

* Broadcast best model path to ensure we sync with the main process + wait for the main process to save

* Add barrier call to ensure all processes are in sync

* Added changelog commit

* Move sync of best model path/score to model checkpoint, keep barrier to ensure all processes complete

* Ensure we broadcast as a tuple

* Add init check

* Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: ananthsub

* Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: ananthsub

* Removed model checkpoint code, added barrier to trainer to ensure we synchronize and wait for all processes to finish before completing training

* Add barrier within teardown call, removed horovod teardown to inherit from base accelerator

Co-authored-by: ananthsub
---
 CHANGELOG.md                                          | 1 +
 pytorch_lightning/accelerators/accelerator.py         | 3 ++-
 pytorch_lightning/accelerators/dp_accelerator.py      | 1 +
 pytorch_lightning/accelerators/horovod_accelerator.py | 3 ---
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6a173971c..0989524395 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
 
 ## [1.0.3] - 2020-10-20
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index d9b8adacf2..8e1969cc93 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -52,7 +52,8 @@ class Accelerator(object):
         pass
 
     def teardown(self):
-        pass
+        # Ensure, if necessary, that all processes are finished
+        self.barrier()
 
     def barrier(self, name: Optional[str] = None):
         pass

diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py
index dda19d0431..0a6eac607d 100644
--- a/pytorch_lightning/accelerators/dp_accelerator.py
+++ b/pytorch_lightning/accelerators/dp_accelerator.py
@@ -101,6 +101,7 @@ class DataParallelAccelerator(Accelerator):
     def teardown(self):
         # replace the original fwd function
         self.trainer.model.forward = self.model_autocast_original_forward
+        self.barrier()
 
     def training_step(self, args):
         if self.trainer.amp_backend == AMPType.NATIVE:

diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py
index 3267e1d7f1..91a5400999 100644
--- a/pytorch_lightning/accelerators/horovod_accelerator.py
+++ b/pytorch_lightning/accelerators/horovod_accelerator.py
@@ -107,9 +107,6 @@ class HorovodAccelerator(Accelerator):
         hvd.join()
         return results
 
-    def teardown(self):
-        pass
-
     def training_step(self, args):
         if self.trainer.on_gpu:
             batch = args[0]
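
Note on the pattern above (not part of the patch): under DDP, every process runs
teardown independently, so without a barrier a non-zero rank can exit while rank 0
is still writing the best checkpoint. Below is a minimal, self-contained sketch of
that pattern, assuming a plain torch.distributed setup. The DDPAccelerator class
and run_worker function are hypothetical names for illustration only, not
PyTorch Lightning's actual implementation.

import torch.distributed as dist

class Accelerator:
    """Base accelerator: teardown waits on a barrier by default."""

    def teardown(self):
        # Ensure, if necessary, that all processes are finished
        self.barrier()

    def barrier(self, name=None):
        # Single-process accelerators have nothing to wait for
        pass

class DDPAccelerator(Accelerator):
    """Hypothetical DDP accelerator backing the barrier with torch.distributed."""

    def barrier(self, name=None):
        # Block each rank until every rank arrives, so no worker shuts down
        # while rank 0 is still saving the best model checkpoint
        if dist.is_available() and dist.is_initialized():
            dist.barrier()

def run_worker(rank, accelerator):
    # ... training loop runs here; rank 0 saves checkpoints ...
    accelerator.teardown()  # all ranks meet here before exiting

This also explains the Horovod change: once its empty teardown override is
removed, it inherits the base class teardown and synchronizes through its own
barrier implementation.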