From e6b34ef90d5f3eac70154b305b476614a64f1981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 2 May 2020 17:01:44 +0200
Subject: [PATCH] [WIP] Reduction when batch size < num gpus (#1609)

* reduce if <= num_gpus

* add test with explanation

* chlog

* fix changelog

Co-authored-by: J. Borovec
---
 CHANGELOG.md                         |  2 ++
 pytorch_lightning/trainer/logging.py |  8 ++---
 tests/trainer/test_dataloaders.py    | 45 ++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef025c6a85..5457a6e980 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564))
 
 ### Changed
+
+- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609))
 
 ### Deprecated
 
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index c1d598dc71..978ac5df78 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -196,8 +196,8 @@ class TrainerLoggingMixin(ABC):
             elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0:
                 pass
 
-            # reduce only metrics that have the same number of gpus
-            elif output[k].size(0) == num_gpus:
-                reduced = torch.mean(output[k])
-                output[k] = reduced
+            # do not reduce metrics that have batch size > num gpus
+            elif output[k].size(0) <= num_gpus:
+                output[k] = torch.mean(output[k])
+
         return output
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 0d528474d3..b6f6262ee9 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -2,6 +2,8 @@ import platform
 
 import pytest
 import torch
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Subset
 
 import tests.base.utils as tutils
 from pytorch_lightning import Trainer
@@ -482,3 +484,46 @@ def test_dataloader_reinit_for_subclass():
     assert isinstance(result, torch.utils.data.DataLoader)
     assert isinstance(result, CustomDataLoader)
     assert hasattr(result, 'dummy_kwarg')
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
+def test_batch_size_smaller_than_num_gpus():
+    # we need at least 3 gpus for this test
+    num_gpus = 3
+    batch_size = 3
+
+    class CurrentTestModel(
+        LightTrainDataloader,
+        TestModelBase,
+    ):
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self.c_d1_bn = torch.nn.ReLU()
+
+        def train_dataloader(self):
+            dataloader = super().train_dataloader()
+            # construct a dataset with a size that is not divisible by num_gpus
+            # therefore the last batch will have a size < num_gpus
+            size = num_gpus * batch_size + (num_gpus - 1)
+            dataset = Subset(dataloader.dataset, range(size))
+            dataloader = DataLoader(
+                dataset,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+            )
+            return dataloader
+
+    hparams = tutils.get_default_hparams()
+    hparams.batch_size = batch_size
+    model = CurrentTestModel(hparams)
+
+    trainer = Trainer(
+        max_epochs=1,
+        gpus=num_gpus,
+    )
+
+    # we expect the reduction for the metrics also to happen on the last batch
+    # where we will get fewer metrics than gpus
+    result = trainer.fit(model)
+    assert 1 == result
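
---

Note (not part of the patch): a minimal standalone sketch of the reduction behavior
this patch fixes. With DataParallel, each active GPU contributes one entry along dim 0
of every gathered metric tensor; on the last, smaller batch fewer GPUs are active, so
the old `== num_gpus` check skipped the mean reduction entirely. `reduce_dp_outputs`
below is a hypothetical helper written for illustration, not a Lightning API.

    import torch

    def reduce_dp_outputs(output: dict, num_gpus: int) -> dict:
        """Average per-GPU metric entries, mirroring the patched `<=` check."""
        for k in output:
            # leave non-tensors and 0-dim scalars untouched
            if not isinstance(output[k], torch.Tensor) or output[k].dim() == 0:
                continue
            # one value per active GPU along dim 0; the last batch may
            # activate fewer GPUs than num_gpus, hence `<=` rather than `==`
            if output[k].size(0) <= num_gpus:
                output[k] = torch.mean(output[k])
        return output

    # last batch of size 2 on 3 GPUs -> only 2 replicas produced a loss
    out = {'loss': torch.tensor([0.4, 0.6]), 'step': torch.tensor(5)}
    print(reduce_dp_outputs(out, num_gpus=3)['loss'])  # tensor(0.5000)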