From e6b34ef90d5f3eac70154b305b476614a64f1981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 2 May 2020 17:01:44 +0200
Subject: [PATCH] [WIP] Reduction when batch size < num gpus (#1609)

* reduce if <= num_gpus

* add test with explanation

* chlog

* fix changelog

Co-authored-by: J. Borovec
---
 CHANGELOG.md                         |  2 ++
 pytorch_lightning/trainer/logging.py |  8 ++---
 tests/trainer/test_dataloaders.py    | 45 ++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef025c6a85..5457a6e980 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564))
 
 ### Changed
+
+- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609))
 
 ### Deprecated
 
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index c1d598dc71..978ac5df78 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -196,8 +196,8 @@ class TrainerLoggingMixin(ABC):
             elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0:
                 pass
 
-            # reduce only metrics that have the same number of gpus
-            elif output[k].size(0) == num_gpus:
-                reduced = torch.mean(output[k])
-                output[k] = reduced
+            # do not reduce metrics that have batch size > num gpus
+            elif output[k].size(0) <= num_gpus:
+                output[k] = torch.mean(output[k])
+
         return output
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 0d528474d3..b6f6262ee9 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -2,6 +2,8 @@ import platform
 
 import pytest
 import torch
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Subset
 
 import tests.base.utils as tutils
 from pytorch_lightning import Trainer
@@ -482,3 +484,46 @@ def test_dataloader_reinit_for_subclass():
     assert isinstance(result, torch.utils.data.DataLoader)
     assert isinstance(result, CustomDataLoader)
     assert hasattr(result, 'dummy_kwarg')
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
+def test_batch_size_smaller_than_num_gpus():
+    # we need at least 3 gpus for this test
+    num_gpus = 3
+    batch_size = 3
+
+    class CurrentTestModel(
+        LightTrainDataloader,
+        TestModelBase,
+    ):
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self.c_d1_bn = torch.nn.ReLU()
+
+        def train_dataloader(self):
+            dataloader = super().train_dataloader()
+            # construct a dataset with a size that is not divisible by num_gpus
+            # therefore the last batch will have a size < num_gpus
+            size = num_gpus * batch_size + (num_gpus - 1)
+            dataset = Subset(dataloader.dataset, range(size))
+            dataloader = DataLoader(
+                dataset,
+                batch_size=self.hparams.batch_size,
+                drop_last=False,
+            )
+            return dataloader
+
+    hparams = tutils.get_default_hparams()
+    hparams.batch_size = batch_size
+    model = CurrentTestModel(hparams)
+
+    trainer = Trainer(
+        max_epochs=1,
+        gpus=num_gpus,
+    )
+
+    # we expect the reduction for the metrics also to happen on the last batch
+    # where we will get fewer metrics than gpus
+    result = trainer.fit(model)
+    assert 1 == result
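
---

Note (not part of the patch): a minimal standalone sketch of the reduction behavior
this patch fixes. With DataParallel, each active GPU contributes one entry along dim 0
of every gathered metric tensor; on the last, smaller batch fewer GPUs are active, so
the old `== num_gpus` check skipped the mean reduction entirely. `reduce_dp_outputs`
below is a hypothetical helper written for illustration, not a Lightning API.

    import torch

    def reduce_dp_outputs(output: dict, num_gpus: int) -> dict:
        """Average per-GPU metric entries, mirroring the patched `<=` check."""
        for k in output:
            # leave non-tensors and 0-dim scalars untouched
            if not isinstance(output[k], torch.Tensor) or output[k].dim() == 0:
                continue
            # one value per active GPU along dim 0; the last batch may
            # activate fewer GPUs than num_gpus, hence `<=` rather than `==`
            if output[k].size(0) <= num_gpus:
                output[k] = torch.mean(output[k])
        return output

    # last batch of size 2 on 3 GPUs -> only 2 replicas produced a loss
    out = {'loss': torch.tensor([0.4, 0.6]), 'step': torch.tensor(5)}
    print(reduce_dp_outputs(out, num_gpus=3)['loss'])  # tensor(0.5000)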