From 7ec1e66e17e74170657ebd5823780bf8585c52c1 Mon Sep 17 00:00:00 2001
From: Rohit Gupta <rohitgr1998@gmail.com>
Date: Mon, 7 Feb 2022 17:00:29 +0530
Subject: [PATCH] reduce only loss with dp (#11594)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Aki Nitta <nitta@akihironitta.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 CHANGELOG.md                       |  3 +++
 pytorch_lightning/strategies/dp.py | 17 +++++++----------
 tests/accelerators/test_dp.py      | 26 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a182fdf50..4a1708ec78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -259,6 +259,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Avoid enforcing `shuffle=False` for eval dataloaders ([#11575](https://github.com/PyTorchLightning/pytorch-lightning/pull/11575))
 
 
+- When using DP (data-parallel), Lightning will no longer automatically reduce all tensors returned in `training_step`; it will only reduce the loss unless `training_step_end` is overridden ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))
+
+- When using DP (data-parallel), the `training_epoch_end` hook will no longer receive reduced outputs from `training_step` and will instead get the full tensor of results from all GPUs ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))
 ### Deprecated
 
 - Deprecated `Trainer.{validated,tested,predicted}_ckpt_path` and replaced with read-only property `Trainer.ckpt_path` set when checkpoints loaded via `Trainer.{fit,validate,test,predict}` ([#11696](https://github.com/PyTorchLightning/pytorch-lightning/pull/11696))
diff --git a/pytorch_lightning/strategies/dp.py b/pytorch_lightning/strategies/dp.py
index 71d0090e2c..7a2cbb30ba 100644
--- a/pytorch_lightning/strategies/dp.py
+++ b/pytorch_lightning/strategies/dp.py
@@ -137,18 +137,15 @@ class DataParallelStrategy(ParallelStrategy):
             return self.model(*args, **kwargs)
 
     def training_step_end(self, output):
-        if not is_overridden("training_step_end", self.lightning_module):
-            return self.reduce(output)
-        return output
+        if is_overridden("training_step_end", self.lightning_module):
+            return output
 
-    def validation_step_end(self, output):
-        if not is_overridden("validation_step_end", self.lightning_module):
-            return self.reduce(output)
-        return output
+        if isinstance(output, dict) and "loss" in output:
+            output["loss"] = self.reduce(output["loss"])
+
+        elif isinstance(output, torch.Tensor):
+            output = self.reduce(output)
 
-    def test_step_end(self, output):
-        if not is_overridden("test_step_end", self.lightning_module):
-            return self.reduce(output)
         return output
 
     def teardown(self) -> None:
diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py
index 7313728256..eb72e7a731 100644
--- a/tests/accelerators/test_dp.py
+++ b/tests/accelerators/test_dp.py
@@ -134,8 +134,24 @@ class ReductionTestModel(BoringModel):
 
     def training_epoch_end(self, outputs):
         assert outputs[0]["loss"].shape == torch.Size([])
-        assert outputs[0]["reduce_int"].item() == 0  # mean([0, 1]) = 0
-        assert outputs[0]["reduce_float"].item() == 0.5  # mean([0., 1.]) = 0.5
+        self._assert_extra_outputs(outputs)
+
+    def validation_epoch_end(self, outputs):
+        assert outputs[0]["x"].shape == torch.Size([2])
+        self._assert_extra_outputs(outputs)
+
+    def test_epoch_end(self, outputs):
+        assert outputs[0]["y"].shape == torch.Size([2])
+        self._assert_extra_outputs(outputs)
+
+    def _assert_extra_outputs(self, outputs):
+        out = outputs[0]["reduce_int"]
+        assert torch.eq(out, torch.tensor([0, 1], device="cuda:0")).all()
+        assert out.dtype is torch.int
+
+        out = outputs[0]["reduce_float"]
+        assert torch.eq(out, torch.tensor([0.0, 1.0], device="cuda:0")).all()
+        assert out.dtype is torch.float
 
 
 def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch):
@@ -188,11 +204,9 @@ def test_dp_training_step_dict(tmpdir):
 
     trainer = pl.Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=1,
-        limit_val_batches=1,
-        limit_test_batches=1,
+        fast_dev_run=True,
         gpus=2,
         strategy="dp",
     )
     trainer.fit(model)
+    trainer.test(model)