lightning/tests/accelerators/test_dp.py

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

import pytorch_lightning as pl
import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.core import memory
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers import BoringModel, RandomDataset
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.runif import RunIf
from tests.helpers.simple_models import ClassificationModel


class CustomClassificationModelDP(ClassificationModel):
    """Classification model that logs its accuracy metrics from the ``*_step_end`` hooks.

    The validation and test steps only return the logits and the targets; the
    accuracy is computed and logged afterwards in ``validation_step_end`` /
    ``test_step_end``.
    NOTE(review): presumably this is so that, under DP, the metric is fed the
    outputs collected from all devices -- confirm against the DP plugin.
    """

    def _step(self, batch, batch_idx):
        """Run the forward pass on the inputs of ``batch`` and pair the logits with the targets."""
        inputs, targets = batch
        return {'logits': self(inputs), 'y': targets}

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss between the logits and the targets of ``batch``."""
        step_out = self._step(batch, batch_idx)
        return F.cross_entropy(step_out['logits'], step_out['y'])

    def validation_step(self, batch, batch_idx):
        """Return the logits/targets dict; accuracy is logged in ``validation_step_end``."""
        return self._step(batch, batch_idx)

    def test_step(self, batch, batch_idx):
        """Return the logits/targets dict; accuracy is logged in ``test_step_end``."""
        return self._step(batch, batch_idx)

    def validation_step_end(self, outputs):
        """Update the validation accuracy metric and log it as ``val_acc``."""
        accuracy = self.valid_acc(outputs['logits'], outputs['y'])
        self.log('val_acc', accuracy)

    def test_step_end(self, outputs):
        """Update the test accuracy metric and log it as ``test_acc``."""
        accuracy = self.test_acc(outputs['logits'], outputs['y'])
        self.log('test_acc', accuracy)


@RunIf(min_gpus=2)
def test_multi_gpu_early_stop_dp(tmpdir):
    """Make sure DP works with early stopping.

    Runs the shared model-test pipeline on two GPUs with ``accelerator='dp'``
    and an ``EarlyStopping`` callback that monitors ``val_acc``.
    """
    # NOTE(review): kept from the original; whether a master port is needed
    # for DP at all is not visible from this file -- confirm before removing.
    tutils.set_random_master_port()

    dm = ClassifDataModule()
    model = CustomClassificationModelDP()

    trainer_options = dict(
        default_root_dir=tmpdir,
        # 'val_acc' is the key logged by CustomClassificationModelDP.validation_step_end
        callbacks=[EarlyStopping(monitor='val_acc')],
        max_epochs=50,
        limit_train_batches=10,
        limit_val_batches=10,
        gpus=[0, 1],
        accelerator='dp',
    )

    tpipes.run_model_test(trainer_options, model, dm)


@RunIf(min_gpus=2)
def test_multi_gpu_model_dp(tmpdir):
    """Run the shared model-test pipeline with a ``BoringModel`` on two GPUs in DP mode.

    Afterwards the memory-profile helper is called once in ``'min_max'`` mode.
    """
    tutils.set_random_master_port()

    dp_trainer_kwargs = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'accelerator': 'dp',
        'progress_bar_refresh_rate': 0,
    }
    tpipes.run_model_test(dp_trainer_kwargs, BoringModel())

    # test memory helper functions
    memory.get_memory_profile('min_max')


class ReductionTestModel(BoringModel):
    """``BoringModel`` whose step outputs carry extra per-device scalars.

    Every step adds an int and a float tensor holding the index of the device
    the batch lives on; ``training_epoch_end`` then asserts on the values found
    in the first collected training output.
    """

    def train_dataloader(self):
        dataset = RandomDataset(32, 64)
        return DataLoader(dataset, batch_size=2)

    def val_dataloader(self):
        dataset = RandomDataset(32, 64)
        return DataLoader(dataset, batch_size=2)

    def test_dataloader(self):
        dataset = RandomDataset(32, 64)
        return DataLoader(dataset, batch_size=2)

    def add_outputs(self, output, device):
        """Add scalar int/float tensors holding ``device.index`` to ``output`` in place."""
        device_idx = device.index
        extras = {
            "reduce_int": torch.tensor(device_idx, dtype=torch.int, device=device),
            "reduce_float": torch.tensor(device_idx, dtype=torch.float, device=device),
        }
        output.update(extras)

    def training_step(self, batch, batch_idx):
        step_output = super().training_step(batch, batch_idx)
        self.add_outputs(step_output, batch.device)
        return step_output

    def validation_step(self, batch, batch_idx):
        step_output = super().validation_step(batch, batch_idx)
        self.add_outputs(step_output, batch.device)
        return step_output

    def test_step(self, batch, batch_idx):
        step_output = super().test_step(batch, batch_idx)
        self.add_outputs(step_output, batch.device)
        return step_output

    def training_epoch_end(self, outputs):
        """Check that the first training output holds a scalar loss and the expected reduced values."""
        first_output = outputs[0]
        assert first_output["loss"].shape == torch.Size([])
        # mean([0, 1]) = 0 for the integer tensor
        assert first_output["reduce_int"].item() == 0
        # mean([0., 1.]) = 0.5 for the float tensor
        assert first_output["reduce_float"].item() == 0.5


def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch):
    """
    Test that an exception is raised when overriding batch_transfer_hooks in DP model.

    Each of the three batch-transfer hooks is overridden in its own model, and
    fitting that model in DP mode must raise a ``MisconfigurationException``
    whose message names the overridden hook.
    """
    # Report two CUDA devices regardless of the actual hardware.
    # NOTE(review): presumably this lets ``gpus=[0, 1]`` be accepted on machines
    # without two GPUs (the test has no ``RunIf`` guard) -- confirm.
    monkeypatch.setattr("torch.cuda.device_count", lambda: 2)

    class TransferToDeviceModel(BoringModel):

        def transfer_batch_to_device(self, batch, device):
            return batch.to(device)

    class BeforeTransferModel(BoringModel):

        def on_before_batch_transfer(self, batch, dataloader_idx):
            batch += 1
            return batch

    class AfterTransferModel(BoringModel):

        def on_after_batch_transfer(self, batch, dataloader_idx):
            batch += 1
            return batch

    dp_trainer_kwargs = {
        'default_root_dir': tmpdir,
        'max_steps': 7,
        'gpus': [0, 1],
        'accelerator': 'dp',
    }

    # (model class, regex the raised error message has to match)
    cases = (
        (TransferToDeviceModel, r'Overriding `transfer_batch_to_device` is not .* in DP'),
        (BeforeTransferModel, r'Overriding `on_before_batch_transfer` is not .* in DP'),
        (AfterTransferModel, r'Overriding `on_after_batch_transfer` is not .* in DP'),
    )

    for model_cls, expected_message in cases:
        # a fresh trainer and model for every hook, as each fit is expected to fail
        trainer = Trainer(**dp_trainer_kwargs)
        model = model_cls()

        with pytest.raises(MisconfigurationException, match=expected_message):
            trainer.fit(model)


@RunIf(min_gpus=2)
def test_dp_training_step_dict(tmpdir):
    """Verify that DP properly reduces dictionaries returned by the step methods.

    The ``*_step_end`` hooks are set to ``None`` on the model instance; the
    actual checks are the assertions in ``ReductionTestModel.training_epoch_end``.
    """
    model = ReductionTestModel()
    for hook_name in ('training_step_end', 'validation_step_end', 'test_step_end'):
        setattr(model, hook_name, None)

    trainer_kwargs = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 1,
        'limit_val_batches': 1,
        'limit_test_batches': 1,
        'gpus': 2,
        'accelerator': 'dp',
    }
    pl.Trainer(**trainer_kwargs).fit(model)
notices (#4118) 2020-10-13 11:18:07 +00:00			`# Copyright The PyTorch Lightning team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Disable batch transfer in DP mode (#6098) * add exceptions and test * hook * fix * clean up * clean up * regex * regex * docs * rev * comment and docs * chlog * Apply suggestions from code review Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Apply suggestions from code review Co-authored-by: chaton <thomas@grid.ai> * Monkey-patch device count * docs * pep * api_change Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: chaton <thomas@grid.ai> 2021-03-11 15:51:10 +00:00			`import pytest`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`import torch`
fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`import torch.nn.functional as F`
fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`from torch.utils.data import DataLoader`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00
Fix pre-commit isort failure on tests/backends/.py (#5430) Fix pre-commit isort failure on tests/backends/.py Remove tests.backends from skipped module in pyproject.toml 2021-01-15 00:32:41 +00:00			`import pytorch_lightning as pl`
Refactor simplify tests (#5861) * add new * restructure * yapf * move * fix 2021-02-08 10:52:02 +00:00			`import tests.helpers.pipelines as tpipes`
			`import tests.helpers.utils as tutils`
Disable batch transfer in DP mode (#6098) * add exceptions and test * hook * fix * clean up * clean up * regex * regex * docs * rev * comment and docs * chlog * Apply suggestions from code review Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Apply suggestions from code review Co-authored-by: chaton <thomas@grid.ai> * Monkey-patch device count * docs * pep * api_change Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: chaton <thomas@grid.ai> 2021-03-11 15:51:10 +00:00			`from pytorch_lightning import Trainer`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`from pytorch_lightning.callbacks import EarlyStopping`
			`from pytorch_lightning.core import memory`
Disable batch transfer in DP mode (#6098) * add exceptions and test * hook * fix * clean up * clean up * regex * regex * docs * rev * comment and docs * chlog * Apply suggestions from code review Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Apply suggestions from code review Co-authored-by: chaton <thomas@grid.ai> * Monkey-patch device count * docs * pep * api_change Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: chaton <thomas@grid.ai> 2021-03-11 15:51:10 +00:00			`from pytorch_lightning.utilities.exceptions import MisconfigurationException`
fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`from tests.helpers import BoringModel, RandomDataset`
fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`from tests.helpers.datamodules import ClassifDataModule`
Refactor: skipif for Windows 2/n (#6268) * win * isort * flake8 2021-03-02 09:36:01 +00:00			`from tests.helpers.runif import RunIf`
fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`from tests.helpers.simple_models import ClassificationModel`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00

fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`class CustomClassificationModelDP(ClassificationModel):`

			`def _step(self, batch, batch_idx):`
			`x, y = batch`
			`logits = self(x)`
			`return {'logits': logits, 'y': y}`

			`def training_step(self, batch, batch_idx):`
			`out = self._step(batch, batch_idx)`
			`loss = F.cross_entropy(out['logits'], out['y'])`
			`return loss`

			`def validation_step(self, batch, batch_idx):`
			`return self._step(batch, batch_idx)`

			`def test_step(self, batch, batch_idx):`
			`return self._step(batch, batch_idx)`

			`def validation_step_end(self, outputs):`
			`self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))`

			`def test_step_end(self, outputs):`
			`self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))`


Refactor: skipif for multi - gpus 1/n (#6266) * ngpus * gpu * isort * pt * flake8 2021-03-02 08:03:32 +00:00			`@RunIf(min_gpus=2)`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`def test_multi_gpu_early_stop_dp(tmpdir):`
			`"""Make sure DDP works. with early stopping"""`
			`tutils.set_random_master_port()`

fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`dm = ClassifDataModule()`
			`model = CustomClassificationModelDP()`

Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`trainer_options = dict(`
			`default_root_dir=tmpdir,`
fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`callbacks=[EarlyStopping(monitor='val_acc')],`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`max_epochs=50,`
			`limit_train_batches=10,`
			`limit_val_batches=10,`
			`gpus=[0, 1],`
drop usage of deprecated distributed_backend (#5009) Co-authored-by: chaton <thomas@grid.ai> Co-authored-by: Roger Shieh <sh.rog@protonmail.ch> 2020-12-09 08:18:23 +00:00			`accelerator='dp',`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`)`

fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`tpipes.run_model_test(trainer_options, model, dm)`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00

Refactor: skipif for multi - gpus 1/n (#6266) * ngpus * gpu * isort * pt * flake8 2021-03-02 08:03:32 +00:00			`@RunIf(min_gpus=2)`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`def test_multi_gpu_model_dp(tmpdir):`
			`tutils.set_random_master_port()`

			`trainer_options = dict(`
			`default_root_dir=tmpdir,`
			`max_epochs=1,`
			`limit_train_batches=10,`
			`limit_val_batches=10,`
			`gpus=[0, 1],`
drop usage of deprecated distributed_backend (#5009) Co-authored-by: chaton <thomas@grid.ai> Co-authored-by: Roger Shieh <sh.rog@protonmail.ch> 2020-12-09 08:18:23 +00:00			`accelerator='dp',`
			`progress_bar_refresh_rate=0,`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00			`)`

fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 <rohitgr1998@gmail.com> 2021-02-23 22:08:46 +00:00			`model = BoringModel()`
Covv1 (#4072) * temporary drop metrics tests while speeding them up * cov * cov * docs 2020-10-11 14:21:53 +00:00
			`tpipes.run_model_test(trainer_options, model)`

			`# test memory helper functions`
			`memory.get_memory_profile('min_max')`


fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`class ReductionTestModel(BoringModel):`

			`def train_dataloader(self):`
			`return DataLoader(RandomDataset(32, 64), batch_size=2)`

			`def val_dataloader(self):`
			`return DataLoader(RandomDataset(32, 64), batch_size=2)`

			`def test_dataloader(self):`
			`return DataLoader(RandomDataset(32, 64), batch_size=2)`

			`def add_outputs(self, output, device):`
			`output.update({`
			`"reduce_int": torch.tensor(device.index, dtype=torch.int, device=device),`
			`"reduce_float": torch.tensor(device.index, dtype=torch.float, device=device),`
			`})`

			`def training_step(self, batch, batch_idx):`
			`output = super().training_step(batch, batch_idx)`
			`self.add_outputs(output, batch.device)`
			`return output`

			`def validation_step(self, batch, batch_idx):`
			`output = super().validation_step(batch, batch_idx)`
			`self.add_outputs(output, batch.device)`
			`return output`

			`def test_step(self, batch, batch_idx):`
			`output = super().test_step(batch, batch_idx)`
			`self.add_outputs(output, batch.device)`
			`return output`

			`def training_epoch_end(self, outputs):`
			`assert outputs[0]["loss"].shape == torch.Size([])`
			`assert outputs[0]["reduce_int"].item() == 0 # mean([0, 1]) = 0`
			`assert outputs[0]["reduce_float"].item() == 0.5 # mean([0., 1.]) = 0.5`


Disable batch transfer in DP mode (#6098) * add exceptions and test * hook * fix * clean up * clean up * regex * regex * docs * rev * comment and docs * chlog * Apply suggestions from code review Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Apply suggestions from code review Co-authored-by: chaton <thomas@grid.ai> * Monkey-patch device count * docs * pep * api_change Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: chaton <thomas@grid.ai> 2021-03-11 15:51:10 +00:00			`def test_dp_raise_exception_with_batch_transfer_hooks(tmpdir, monkeypatch):`
			`"""`
			`Test that an exception is raised when overriding batch_transfer_hooks in DP model.`
			`"""`
			`monkeypatch.setattr("torch.cuda.device_count", lambda: 2)`

			`class CustomModel(BoringModel):`

			`def transfer_batch_to_device(self, batch, device):`
			`batch = batch.to(device)`
			`return batch`

			`trainer_options = dict(`
			`default_root_dir=tmpdir,`
			`max_steps=7,`
			`gpus=[0, 1],`
			`accelerator='dp',`
			`)`

			`trainer = Trainer(**trainer_options)`
			`model = CustomModel()`

			with pytest.raises(MisconfigurationException, match=r'Overriding `transfer_batch_to_device` is not .* in DP'):
			`trainer.fit(model)`

			`class CustomModel(BoringModel):`

			`def on_before_batch_transfer(self, batch, dataloader_idx):`
			`batch += 1`
			`return batch`

			`trainer = Trainer(**trainer_options)`
			`model = CustomModel()`

			with pytest.raises(MisconfigurationException, match=r'Overriding `on_before_batch_transfer` is not .* in DP'):
			`trainer.fit(model)`

			`class CustomModel(BoringModel):`

			`def on_after_batch_transfer(self, batch, dataloader_idx):`
			`batch += 1`
			`return batch`

			`trainer = Trainer(**trainer_options)`
			`model = CustomModel()`

			with pytest.raises(MisconfigurationException, match=r'Overriding `on_after_batch_transfer` is not .* in DP'):
			`trainer.fit(model)`


[bugfix] Perform reduction for dict in training_step and DP (#6324) * fix * update * update * add changelog * Update CHANGELOG.md Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update tests/accelerators/test_dp.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update changelog Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2021-03-04 23:10:52 +00:00			`@RunIf(min_gpus=2)`
			`def test_dp_training_step_dict(tmpdir):`
fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`""" This test verifies that dp properly reduces dictionaries """`
			`model = ReductionTestModel()`
[bugfix] Perform reduction for dict in training_step and DP (#6324) * fix * update * update * add changelog * Update CHANGELOG.md Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update tests/accelerators/test_dp.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update changelog Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2021-03-04 23:10:52 +00:00			`model.training_step_end = None`
fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`model.validation_step_end = None`
			`model.test_step_end = None`

[bugfix] Perform reduction for dict in training_step and DP (#6324) * fix * update * update * add changelog * Update CHANGELOG.md Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update tests/accelerators/test_dp.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update changelog Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2021-03-04 23:10:52 +00:00			`trainer = pl.Trainer(`
			`default_root_dir=tmpdir,`
			`max_epochs=1,`
fix dp reduction test (#6404) * fix * update * fix * move the class outside 2021-03-08 18:11:20 +00:00			`limit_train_batches=1,`
			`limit_val_batches=1,`
			`limit_test_batches=1,`
[bugfix] Perform reduction for dict in training_step and DP (#6324) * fix * update * update * add changelog * Update CHANGELOG.md Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update tests/accelerators/test_dp.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update changelog Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> 2021-03-04 23:10:52 +00:00			`gpus=2,`
			`accelerator='dp',`
			`)`
			`trainer.fit(model)`