# lightning/tests/accelerators/test_ipu.py
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Optional
from unittest import mock

import pytest
import torch
import torch.nn.functional as F
from torch.utils.data import DistributedSampler

from pytorch_lightning import Callback, seed_everything, Trainer
from pytorch_lightning.accelerators import IPUAccelerator
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins import IPUPrecisionPlugin
from pytorch_lightning.strategies.ipu import IPUStrategy
from pytorch_lightning.trainer.states import RunningStage, TrainerFn
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _IPU_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers.boring_model import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.runif import RunIf
from tests.helpers.simple_models import ClassificationModel

if _IPU_AVAILABLE:
    import poptorch
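
# NOTE: `poptorch` is Graphcore's PyTorch interface for the IPU; the tests below rely on
# `poptorch.Options` and `poptorch.DataLoader`, which are only importable on machines with
# the Poplar SDK installed, hence the guarded import above.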


class IPUModel(BoringModel):
    def training_step(self, batch, batch_idx):
        output = self(batch)
        loss = self.loss(batch, output)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self(batch)
        loss = self.loss(batch, output)
        return loss

    def test_step(self, batch, batch_idx):
        output = self(batch)
        loss = self.loss(batch, output)
        return loss

    def training_epoch_end(self, outputs) -> None:
        pass

    def validation_epoch_end(self, outputs) -> None:
        pass

    def test_epoch_end(self, outputs) -> None:
        pass


class IPUClassificationModel(ClassificationModel):
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = self.accuracy(logits, y)
        return acc

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = self.accuracy(logits, y)
        return acc

    def accuracy(self, logits, y):
        # todo (sean): currently IPU poptorch doesn't implicitly convert bools to tensors,
        # hence we use an explicit calculation for accuracy here. Once fixed in poptorch
        # we can use the accuracy metric.
        acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
        return acc

    def validation_epoch_end(self, outputs) -> None:
        self.log("val_acc", torch.stack(outputs).mean())

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())


@pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine")
@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True)
def test_fail_if_no_ipus(mock_ipu_acc_avail, tmpdir):
    with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"):
        Trainer(default_root_dir=tmpdir, ipus=1)

    with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"):
        Trainer(default_root_dir=tmpdir, ipus=1, accelerator="ipu")


@RunIf(ipu=True)
def test_accelerator_selected(tmpdir):
    assert IPUAccelerator.is_available()
    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1)
    assert isinstance(trainer.accelerator, IPUAccelerator)


@RunIf(ipu=True)
def test_warning_if_ipus_not_used():
    with pytest.warns(UserWarning, match="IPU available but not used. Set `accelerator` and `devices`"):
        Trainer(accelerator="cpu")


@RunIf(ipu=True)
def test_no_warning_plugin(tmpdir):
    with pytest.warns(None) as record:
        Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=IPUStrategy(training_opts=poptorch.Options()))
    assert len(record) == 0


@RunIf(ipu=True)
@pytest.mark.parametrize("devices", [1, 4])
def test_all_stages(tmpdir, devices):
    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ipu", devices=devices)
    trainer.fit(model)
    trainer.validate(model)
    trainer.test(model)
    trainer.predict(model)


@RunIf(ipu=True)
@pytest.mark.parametrize("devices", [1, 4])
def test_inference_only(tmpdir, devices):
    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="ipu", devices=devices)
    trainer.validate(model)
    trainer.test(model)
    trainer.predict(model)


@RunIf(ipu=True)
def test_optimization(tmpdir):
    seed_everything(42)

    dm = ClassifDataModule(length=1024)
    model = IPUClassificationModel()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="ipu", devices=2)

    # fit model
    trainer.fit(model, dm)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert dm.trainer is not None

    # validate
    result = trainer.validate(datamodule=dm)
    assert dm.trainer is not None
    assert result[0]["val_acc"] > 0.7

    # test
    result = trainer.test(model, datamodule=dm)
    assert dm.trainer is not None
    test_result = result[0]["test_acc"]
    assert test_result > 0.6

    # test saved model
    model_path = os.path.join(tmpdir, "model.pt")
    trainer.save_checkpoint(model_path)

    model = IPUClassificationModel.load_from_checkpoint(model_path)

    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=2)
    result = trainer.test(model, datamodule=dm)
    saved_result = result[0]["test_acc"]
    assert saved_result == test_result


@RunIf(ipu=True)
def test_mixed_precision(tmpdir):
    class TestCallback(Callback):
        def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None:
            assert trainer.strategy.model.precision == 16
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(
        default_root_dir=tmpdir, fast_dev_run=True, accelerator="ipu", devices=1, precision=16, callbacks=TestCallback()
    )
    assert isinstance(trainer.strategy.precision_plugin, IPUPrecisionPlugin)
    assert trainer.strategy.precision_plugin.precision == 16
    with pytest.raises(SystemExit):
        trainer.fit(model)


@RunIf(ipu=True)
def test_pure_half_precision(tmpdir):
    class TestCallback(Callback):
        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            assert trainer.strategy.model.precision == 16
            for param in trainer.strategy.model.parameters():
                assert param.dtype == torch.float16
            raise SystemExit

    model = IPUModel()
    model = model.half()
    trainer = Trainer(
        default_root_dir=tmpdir, fast_dev_run=True, accelerator="ipu", devices=1, precision=16, callbacks=TestCallback()
    )

    assert isinstance(trainer.strategy, IPUStrategy)
    assert isinstance(trainer.strategy.precision_plugin, IPUPrecisionPlugin)
    assert trainer.strategy.precision_plugin.precision == 16

    with pytest.raises(SystemExit):
        trainer.fit(model)


@RunIf(ipu=True)
def test_device_iterations_ipu_plugin(tmpdir):
    class TestCallback(Callback):
        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            assert trainer.strategy.device_iterations == 2
            # assert device iterations has been set correctly within the poptorch options
            poptorch_model = trainer.strategy.poptorch_models[RunningStage.TRAINING]
            assert poptorch_model._options.toDict()["device_iterations"] == 2
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        accelerator="ipu",
        devices=1,
        strategy=IPUStrategy(device_iterations=2),
        callbacks=TestCallback(),
    )
    assert isinstance(trainer.strategy, IPUStrategy)
    with pytest.raises(SystemExit):
        trainer.fit(model)
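
# NOTE: `device_iterations` maps to poptorch's `Options.deviceIterations`, i.e. roughly how many
# batches the IPU processes per host invocation; the test above checks that the value is
# propagated into the compiled model's options.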


@RunIf(ipu=True)
def test_accumulated_batches(tmpdir):
    class TestCallback(Callback):
        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            # ensure the accumulation_scheduler is overridden to accumulate every batch
            # since the IPU handles accumulation itself
            assert trainer.accumulation_scheduler.scheduling == {0: 1}
            # assert the poptorch options have been set correctly
            poptorch_model = trainer.strategy.poptorch_models[RunningStage.TRAINING]
            assert poptorch_model._options.Training.toDict()["gradient_accumulation"] == 2
            raise SystemExit

    model = IPUModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        accelerator="ipu",
        devices=1,
        accumulate_grad_batches=2,
        callbacks=TestCallback(),
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)


@RunIf(ipu=True)
def test_stages_correct(tmpdir):
    """Ensure all stages are traced correctly by asserting the output for each stage."""

    class StageModel(IPUModel):
        def training_step(self, batch, batch_idx):
            loss = super().training_step(batch, batch_idx)
            # tracing requires a loss value that depends on the model.
            # force it to be a constant value but make sure we still use the loss.
            return (loss - loss) + torch.tensor(1)

        def validation_step(self, batch, batch_idx):
            loss = super().validation_step(batch, batch_idx)
            return (loss - loss) + torch.tensor(2)

        def test_step(self, batch, batch_idx):
            loss = super().validation_step(batch, batch_idx)
            return (loss - loss) + torch.tensor(3)

        def predict_step(self, batch, batch_idx, dataloader_idx=0):
            output = super().predict_step(batch, batch_idx)
            return (output - output) + torch.tensor(4)

    class TestCallback(Callback):
        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
            assert outputs["loss"].item() == 1

        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            assert outputs.item() == 2

        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            assert outputs.item() == 3

        def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            assert torch.all(outputs == 4).item()

    model = StageModel()
    trainer = Trainer(
        default_root_dir=tmpdir, fast_dev_run=True, accelerator="ipu", devices=1, callbacks=TestCallback()
    )
    trainer.fit(model)
    trainer.test(model)
    trainer.validate(model)
    trainer.predict(model, model.test_dataloader())


@RunIf(ipu=True)
def test_different_accumulate_grad_batches_fails(tmpdir):
    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, accumulate_grad_batches={1: 2})
    with pytest.raises(
        MisconfigurationException, match="IPUs currently does not support different `accumulate_grad_batches`"
    ):
        trainer.fit(model)


@RunIf(ipu=True)
def test_clip_gradients_fails(tmpdir):
    model = IPUModel()
    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, gradient_clip_val=10)
    with pytest.raises(MisconfigurationException, match="IPUs currently do not support clipping gradients."):
        trainer.fit(model)


@RunIf(ipu=True)
def test_autoreport(tmpdir):
    """Ensure autoreport dumps to a file."""
    model = IPUModel()
    autoreport_path = os.path.join(tmpdir, "report/")
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="ipu",
        devices=1,
        fast_dev_run=True,
        strategy=IPUStrategy(autoreport=True, autoreport_dir=autoreport_path),
    )
    trainer.fit(model)
    assert os.path.exists(autoreport_path)
    assert os.path.isfile(autoreport_path + "training/profile.pop")
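
# NOTE: with `autoreport=True` the strategy asks the Poplar engine to write profiling output
# (e.g. `training/profile.pop`) into `autoreport_dir`, which is what the assertions above check.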


@RunIf(ipu=True)
def test_manual_poptorch_dataloader(tmpdir):
    model_options = poptorch.Options()

    class IPUTestModel(IPUModel):
        def train_dataloader(self):
            dataloader = super().train_dataloader()
            # save to instance to compare the reference later
            self.poptorch_dataloader = poptorch.DataLoader(model_options, dataloader.dataset, drop_last=True)
            return self.poptorch_dataloader

    model = IPUTestModel()
    other_options = poptorch.Options()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        accelerator="ipu",
        devices=2,
        strategy=IPUStrategy(training_opts=other_options),
    )
    trainer.fit(model)

    assert isinstance(trainer.strategy, IPUStrategy)
    assert trainer.strategy.training_opts is other_options
    dataloader = trainer.train_dataloader.loaders
    assert dataloader is model.poptorch_dataloader  # exact object, was not recreated
    # dataloader uses the options in the model, not the strategy
    assert dataloader.options is model_options
    assert dataloader.options is not other_options
    assert dataloader.drop_last  # was kept


@RunIf(ipu=True)
def test_manual_poptorch_opts(tmpdir):
    """Ensure if the user passes manual poptorch Options, we run with the correct object."""
    model = IPUModel()
    inference_opts = poptorch.Options()
    training_opts = poptorch.Options()

    trainer = Trainer(
        default_root_dir=tmpdir,
        ipus=2,
        fast_dev_run=True,
        strategy=IPUStrategy(inference_opts=inference_opts, training_opts=training_opts),
    )
    trainer.fit(model)

    assert isinstance(trainer.strategy, IPUStrategy)
    assert trainer.strategy.training_opts == training_opts
    assert trainer.strategy.inference_opts == inference_opts

    dataloader = trainer.train_dataloader.loaders
    assert isinstance(dataloader, poptorch.DataLoader)
    assert dataloader.options == training_opts
    assert trainer.num_devices > 1  # testing this only makes sense in a distributed setting
    assert not isinstance(dataloader.sampler, DistributedSampler)
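
# NOTE: the final assertion above reflects that Lightning does not inject a `DistributedSampler`
# for IPUs; distributing data across replicas is left to `poptorch.DataLoader` and the options
# passed to it.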


@RunIf(ipu=True)
def test_manual_poptorch_opts_custom(tmpdir):
    """Ensure if the user passes manual poptorch Options with custom parameters set, we respect them in our
    poptorch options and the dataloaders."""
    model = IPUModel()
    training_opts = poptorch.Options()
    training_opts.deviceIterations(8)
    training_opts.replicationFactor(2)
    training_opts.Training.gradientAccumulation(2)

    inference_opts = poptorch.Options()
    inference_opts.deviceIterations(16)
    inference_opts.replicationFactor(1)
    inference_opts.Training.gradientAccumulation(1)

    class TestCallback(Callback):
        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
            # ensure dataloaders were correctly set up during training.
            plugin = trainer.strategy
            assert isinstance(plugin, IPUStrategy)
            assert plugin.training_opts.replication_factor == 2
            assert plugin.inference_opts.replication_factor == 1

            val_dataloader = trainer.val_dataloaders[0]
            train_dataloader = trainer.train_dataloader
            assert isinstance(train_dataloader, CombinedLoader)
            train_dataloader = train_dataloader.loaders
            assert isinstance(val_dataloader, poptorch.DataLoader)
            assert isinstance(train_dataloader, poptorch.DataLoader)
            assert train_dataloader.options.replication_factor == 2
            assert val_dataloader.options.replication_factor == 1

    plugin = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts)
    # ensure we default to the training options replication factor
    assert plugin.replication_factor == 2

    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin, callbacks=TestCallback())
    trainer.fit(model)

    plugin = trainer.strategy
    assert isinstance(plugin, IPUStrategy)

    training_opts = plugin.training_opts
    assert training_opts.device_iterations == 8
    assert training_opts.replication_factor == 2
    assert training_opts.Training.gradient_accumulation == 2

    inference_opts = plugin.inference_opts
    assert inference_opts.device_iterations == 16
    assert inference_opts.replication_factor == 1
    assert inference_opts.Training.gradient_accumulation == 1
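

# For reference only (not a test): a minimal sketch of how custom poptorch options are typically
# passed to the Trainer, as exercised above. `_example_custom_opts_trainer` is a hypothetical
# helper added purely for illustration.
def _example_custom_opts_trainer(tmpdir):
    opts = poptorch.Options()
    opts.deviceIterations(4)  # run 4 batches on the IPU per host invocation
    opts.Training.gradientAccumulation(2)  # accumulate gradients over 2 batches on-device
    return Trainer(
        default_root_dir=tmpdir,
        accelerator="ipu",
        devices=1,
        strategy=IPUStrategy(training_opts=opts),
    )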


@RunIf(ipu=True)
def test_replication_factor(tmpdir):
    """Ensure the replication factor is taken from the requested device count, and from manually passed poptorch
    Options when provided."""
    plugin = IPUStrategy()
    trainer = Trainer(accelerator="ipu", devices=2, default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin)
    assert isinstance(trainer.accelerator, IPUAccelerator)
    assert trainer.num_devices == 2
    assert trainer.strategy.replication_factor == 2

    model = BoringModel()
    training_opts = poptorch.Options()
    inference_opts = poptorch.Options()
    training_opts.replicationFactor(8)
    inference_opts.replicationFactor(7)
    plugin = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts)

    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, strategy=plugin)
    trainer.optimizers = model.configure_optimizers()[0]
    plugin.model = model
    model.trainer = trainer
    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.setup(trainer)

    trainer.state.stage = RunningStage.TRAINING
    assert trainer.strategy.replication_factor == 8
    trainer.state.stage = RunningStage.VALIDATING
    assert trainer.strategy.replication_factor == 7

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.strategy.setup(trainer)
        assert trainer.strategy.replication_factor == 7


@RunIf(ipu=True)
def test_default_opts(tmpdir):
    """Ensure default opts are set correctly in the IPUStrategy."""
    model = IPUModel()

    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, fast_dev_run=True)
    trainer.fit(model)
    assert isinstance(trainer.strategy, IPUStrategy)
    inference_opts = trainer.strategy.inference_opts
    training_opts = trainer.strategy.training_opts
    for opts in (inference_opts, training_opts):
        assert isinstance(opts, poptorch.Options)
        assert opts.Training.gradient_accumulation == 1
        assert opts.device_iterations == 1
        assert opts.replication_factor == 1


@RunIf(ipu=True)
def test_multi_optimizers_fails(tmpdir):
    """Ensure if there are multiple optimizers, we throw an exception."""

    class TestModel(IPUModel):
        def configure_optimizers(self):
            return [torch.optim.Adam(self.parameters()), torch.optim.Adam(self.parameters())]

    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1)
    with pytest.raises(MisconfigurationException, match="IPUs currently only support one optimizer."):
        trainer.fit(model)


@RunIf(ipu=True)
def test_precision_plugin(tmpdir):
    """Ensure precision plugin value is set correctly."""
    plugin = IPUPrecisionPlugin(precision=16)
    assert plugin.precision == 16


@RunIf(ipu=True)
def test_accelerator_ipu():
    trainer = Trainer(accelerator="ipu", ipus=1)
    assert isinstance(trainer.accelerator, IPUAccelerator)

    trainer = Trainer(accelerator="ipu")
    assert isinstance(trainer.accelerator, IPUAccelerator)

    trainer = Trainer(accelerator="auto", ipus=8)
    assert isinstance(trainer.accelerator, IPUAccelerator)


@RunIf(ipu=True)
def test_accelerator_ipu_with_devices():
    trainer = Trainer(accelerator="ipu", devices=8)
    assert isinstance(trainer.strategy, IPUStrategy)
    assert isinstance(trainer.accelerator, IPUAccelerator)
    assert trainer.num_devices == 8


@RunIf(ipu=True)
def test_accelerator_auto_with_devices_ipu():
    trainer = Trainer(accelerator="auto", devices=8)
    assert isinstance(trainer.accelerator, IPUAccelerator)
    assert trainer.num_devices == 8


@RunIf(ipu=True)
def test_accelerator_ipu_with_ipus_priority():
    """Test that the `ipus` flag takes priority over `devices`."""
    ipus = 8
    with pytest.warns(UserWarning, match="The flag `devices=1` will be ignored,"):
        trainer = Trainer(accelerator="ipu", devices=1, ipus=ipus)

    assert isinstance(trainer.accelerator, IPUAccelerator)
    assert trainer.num_devices == ipus


@RunIf(ipu=True)
def test_set_devices_if_none_ipu():
    trainer = Trainer(accelerator="ipu", ipus=8)
    assert trainer.num_devices == 8


@RunIf(ipu=True)
def test_strategy_choice_ipu_plugin(tmpdir):
    trainer = Trainer(strategy=IPUStrategy(), accelerator="ipu", devices=8)
    assert isinstance(trainer.strategy, IPUStrategy)


@RunIf(ipu=True)
def test_device_type_when_training_plugin_ipu_passed(tmpdir):
    trainer = Trainer(strategy=IPUStrategy(), accelerator="ipu", devices=8)
    assert isinstance(trainer.strategy, IPUStrategy)
    assert isinstance(trainer.accelerator, IPUAccelerator)


@RunIf(ipu=True)
def test_poptorch_models_at_different_stages(tmpdir):
    plugin = IPUStrategy()
    trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, accelerator="ipu", devices=8)
    model = BoringModel()
    model.trainer = trainer
    plugin.model = model

    trainer.optimizers = model.configure_optimizers()[0]
    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.setup(trainer)
    assert list(trainer.strategy.poptorch_models) == [RunningStage.TRAINING, RunningStage.VALIDATING]

    for fn, stage in (
        (TrainerFn.VALIDATING, RunningStage.VALIDATING),
        (TrainerFn.TESTING, RunningStage.TESTING),
        (TrainerFn.PREDICTING, RunningStage.PREDICTING),
    ):
        trainer.state.fn = fn
        trainer.state.stage = stage
        trainer.strategy.setup(trainer)
        assert list(trainer.strategy.poptorch_models) == [stage]
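
# NOTE: as asserted above, the IPU strategy builds a separate poptorch model per running stage
# (training and validation while fitting, otherwise only the active stage), since training and
# inference use different poptorch wrappers and options.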


@RunIf(ipu=True)
def test_devices_auto_choice_ipu():
    trainer = Trainer(accelerator="auto", devices="auto")
    assert trainer.num_devices == 4
    assert isinstance(trainer.accelerator, IPUAccelerator)