lightning/tests/accelerators/test_hpu.py

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest
import torch

from pytorch_lightning import Callback, seed_everything, Trainer
from pytorch_lightning.accelerators import HPUAccelerator
from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy
from pytorch_lightning.utilities import _HPU_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers.boring_model import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.runif import RunIf
from tests.helpers.simple_models import ClassificationModel


@RunIf(hpu=True)
def test_availability():
    """``HPUAccelerator.is_available()`` must be truthy when this test is allowed to run."""
    hpu_is_available = HPUAccelerator.is_available()
    assert hpu_is_available


@pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine")
def test_fail_if_no_hpus():
    """Building an HPU trainer when ``_HPU_AVAILABLE`` is falsy must raise ``MisconfigurationException``."""
    expect_misconfiguration = pytest.raises(
        MisconfigurationException, match="HPUAccelerator can not run on your system"
    )
    with expect_misconfiguration:
        Trainer(accelerator="hpu", devices=1)


@RunIf(hpu=True)
def test_accelerator_selected():
    """A trainer built with ``accelerator="hpu"`` must hold an ``HPUAccelerator``."""
    selected_accelerator = Trainer(accelerator="hpu").accelerator
    assert isinstance(selected_accelerator, HPUAccelerator)


@RunIf(hpu=True)
def test_all_stages(tmpdir, hpus):
    """Run fit, validate, test and predict with a ``BoringModel`` on HPU using ``precision=16``.

    Args:
        tmpdir: directory used as the trainer's ``default_root_dir``.
        hpus: value forwarded to the trainer's ``devices`` argument.
    """
    boring_model = BoringModel()

    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "fast_dev_run": True,
        "accelerator": "hpu",
        "devices": hpus,
        "precision": 16,
    }
    hpu_trainer = Trainer(**trainer_kwargs)

    # Each stage entry point is invoked once, in this order, with the same model.
    for run_stage in (hpu_trainer.fit, hpu_trainer.validate, hpu_trainer.test, hpu_trainer.predict):
        run_stage(boring_model)


@RunIf(hpu=True)
def test_optimization(tmpdir):
    """Fit a ``ClassificationModel`` for one epoch on one HPU and check its metrics.

    After fitting, the validation and test accuracies are compared against fixed thresholds. The model is then
    saved to a checkpoint, reloaded, and re-tested with a fresh trainer; the reloaded model's test accuracy must
    equal the one measured before saving.
    """
    seed_everything(42)

    datamodule = ClassifDataModule(length=1024)
    classifier = ClassificationModel()

    fit_trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="hpu", devices=1)

    # training must finish and leave the datamodule attached to a trainer
    fit_trainer.fit(classifier, datamodule)
    assert fit_trainer.state.finished, f"Training failed with {fit_trainer.state}"
    assert datamodule.trainer is not None

    # validation accuracy threshold
    val_results = fit_trainer.validate(datamodule=datamodule)
    assert datamodule.trainer is not None
    assert val_results[0]["val_acc"] > 0.7

    # test accuracy threshold
    test_results = fit_trainer.test(classifier, datamodule=datamodule)
    assert datamodule.trainer is not None
    accuracy_before_save = test_results[0]["test_acc"]
    assert accuracy_before_save > 0.6

    # checkpoint round trip: save, reload, and test again with a new trainer
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    fit_trainer.save_checkpoint(checkpoint_path)

    reloaded_classifier = ClassificationModel.load_from_checkpoint(checkpoint_path)

    eval_trainer = Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1)

    reloaded_results = eval_trainer.test(reloaded_classifier, datamodule=datamodule)
    accuracy_after_load = reloaded_results[0]["test_acc"]
    assert accuracy_after_load == accuracy_before_save


@RunIf(hpu=True)
def test_stages_correct(tmpdir):
    """Ensure every stage is traced correctly by asserting a known output value for each stage."""

    def _force_constant(tensor, value):
        # tracing requires a value that depends on the model, so the model output is
        # kept in the expression (``tensor - tensor``) while the result is forced to ``value``.
        return (tensor - tensor) + torch.tensor(value)

    class StageModel(BoringModel):
        """``BoringModel`` whose step outputs are forced to 1 (train), 2 (val), 3 (test) and 4 (predict)."""

        def training_step(self, batch, batch_idx):
            step_output = super().training_step(batch, batch_idx)
            return {"loss": _force_constant(step_output.get("loss"), 1)}

        def validation_step(self, batch, batch_idx):
            step_output = super().validation_step(batch, batch_idx)
            return {"x": _force_constant(step_output.get("x"), 2)}

        def test_step(self, batch, batch_idx):
            step_output = super().test_step(batch, batch_idx)
            return {"y": _force_constant(step_output.get("y"), 3)}

        def predict_step(self, batch, batch_idx, dataloader_idx=None):
            prediction = super().predict_step(batch, batch_idx)
            return _force_constant(prediction, 4)

    class TestCallback(Callback):
        """Checks, at the end of every batch, that the stage produced its forced constant."""

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:
            train_loss = outputs["loss"].item()
            assert train_loss == 1

        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            val_value = outputs["x"].item()
            assert val_value == 2

        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            test_value = outputs["y"].item()
            assert test_value == 3

        def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:
            every_element_is_four = torch.all(outputs == 4).item()
            assert every_element_is_four

    stage_model = StageModel()
    stage_trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        accelerator="hpu",
        devices=1,
        callbacks=TestCallback(),
    )
    # Stage order is fit, test, validate, predict.
    for run_stage in (stage_trainer.fit, stage_trainer.test, stage_trainer.validate, stage_trainer.predict):
        run_stage(stage_model)


@RunIf(hpu=True)
def test_accelerator_hpu():
    """Check the accelerator type and ``num_devices`` for three trainer configurations.

    Each entry pairs the trainer keyword arguments with the ``num_devices`` value the test expects.
    """
    configurations = (
        ({"accelerator": "hpu", "devices": 1}, 1),
        ({"accelerator": "hpu"}, 8),
        ({"accelerator": "auto", "devices": 8}, 8),
    )

    for trainer_kwargs, expected_num_devices in configurations:
        configured_trainer = Trainer(**trainer_kwargs)
        assert isinstance(configured_trainer.accelerator, HPUAccelerator)
        assert configured_trainer.num_devices == expected_num_devices


@RunIf(hpu=True)
def test_accelerator_hpu_with_single_device():
    """With ``devices=1`` the trainer must use ``SingleHPUStrategy`` together with ``HPUAccelerator``."""
    single_device_trainer = Trainer(accelerator="hpu", devices=1)

    chosen_strategy = single_device_trainer.strategy
    assert isinstance(chosen_strategy, SingleHPUStrategy)

    chosen_accelerator = single_device_trainer.accelerator
    assert isinstance(chosen_accelerator, HPUAccelerator)


@RunIf(hpu=True)
def test_accelerator_hpu_with_multiple_devices():
    """With ``devices=8`` the trainer must use ``HPUParallelStrategy`` together with ``HPUAccelerator``."""
    multi_device_trainer = Trainer(accelerator="hpu", devices=8)

    chosen_strategy = multi_device_trainer.strategy
    assert isinstance(chosen_strategy, HPUParallelStrategy)

    chosen_accelerator = multi_device_trainer.accelerator
    assert isinstance(chosen_accelerator, HPUAccelerator)


@RunIf(hpu=True)
def test_accelerator_auto_with_devices_hpu():
    """``accelerator="auto"`` with ``devices=8`` must result in an ``HPUParallelStrategy``."""
    chosen_strategy = Trainer(accelerator="auto", devices=8).strategy
    assert isinstance(chosen_strategy, HPUParallelStrategy)


@RunIf(hpu=True)
def test_strategy_choice_hpu_plugin():
    """``SingleHPUStrategy`` is used both when passed explicitly and when only ``devices=1`` is given."""
    # explicit strategy instance
    explicit_strategy = SingleHPUStrategy(device=torch.device("hpu"))
    explicit_trainer = Trainer(strategy=explicit_strategy, accelerator="hpu", devices=1)
    assert isinstance(explicit_trainer.strategy, SingleHPUStrategy)

    # strategy left for the trainer to choose
    implicit_trainer = Trainer(accelerator="hpu", devices=1)
    assert isinstance(implicit_trainer.strategy, SingleHPUStrategy)


@RunIf(hpu=True)
def test_strategy_choice_hpu_parallel_plugin():
    """``HPUParallelStrategy`` is used both when passed explicitly and when only ``devices=8`` is given."""
    # explicit strategy instance built over eight references to the "hpu" device
    hpu_devices = [torch.device("hpu")] * 8
    explicit_strategy = HPUParallelStrategy(parallel_devices=hpu_devices)
    explicit_trainer = Trainer(strategy=explicit_strategy, accelerator="hpu", devices=8)
    assert isinstance(explicit_trainer.strategy, HPUParallelStrategy)

    # strategy left for the trainer to choose
    implicit_trainer = Trainer(accelerator="hpu", devices=8)
    assert isinstance(implicit_trainer.strategy, HPUParallelStrategy)


@RunIf(hpu=True)
def test_devices_auto_choice_hpu():
    """With both ``accelerator`` and ``devices`` set to ``"auto"`` the test expects ``num_devices`` to be 8."""
    resolved_num_devices = Trainer(accelerator="auto", devices="auto").num_devices
    assert resolved_num_devices == 8


@RunIf(hpu=True)
@pytest.mark.parametrize("hpus", [1])
def test_inference_only(tmpdir, hpus):
    """Run validate, test and predict on HPU without calling ``fit`` first.

    Args:
        tmpdir: directory used as the trainer's ``default_root_dir``.
        hpus: value forwarded to the trainer's ``devices`` argument (parametrized to ``1``).
    """
    boring_model = BoringModel()

    inference_trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=hpus)
    for run_stage in (inference_trainer.validate, inference_trainer.test, inference_trainer.predict):
        run_stage(boring_model)


def test_hpu_auto_device_count():
    """``HPUAccelerator.auto_device_count()`` is expected to report 8."""
    reported_count = HPUAccelerator.auto_device_count()
    assert reported_count == 8


@RunIf(hpu=True)
def test_hpu_unsupported_device_type():
    """Passing a list as ``devices`` to an HPU trainer must raise ``MisconfigurationException``."""
    # ``match`` is applied as a regular expression, so the trailing period is escaped
    # to require a literal "." instead of matching any character.
    expected_message = r"`devices` for `HPUAccelerator` must be int, string or None\."
    with pytest.raises(MisconfigurationException, match=expected_message):
        Trainer(accelerator="hpu", devices=[1])
Add support for Habana accelerator (HPU) (#11808) Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> Co-authored-by: Aki Nitta <nitta@akihironitta.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: four4fish <88516121+four4fish@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kaushik B <kaushikbokka@gmail.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: jjenniferdai <89552168+jjenniferdai@users.noreply.github.com> Co-authored-by: Kushashwa Ravi Shrimali <kushashwaravishrimali@gmail.com> Co-authored-by: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.comk-Pro.local> 2022-03-25 10:24:52 +00:00			`# Copyright The PyTorch Lightning team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import os`

			`import pytest`
			`import torch`

			`from pytorch_lightning import Callback, seed_everything, Trainer`
			`from pytorch_lightning.accelerators import HPUAccelerator`
			`from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy`
			`from pytorch_lightning.strategies.single_hpu import SingleHPUStrategy`
			`from pytorch_lightning.utilities import _HPU_AVAILABLE`
			`from pytorch_lightning.utilities.exceptions import MisconfigurationException`
			`from tests.helpers.boring_model import BoringModel`
			`from tests.helpers.datamodules import ClassifDataModule`
			`from tests.helpers.runif import RunIf`
			`from tests.helpers.simple_models import ClassificationModel`


			`@RunIf(hpu=True)`
			`def test_availability():`
			`assert HPUAccelerator.is_available()`


			`@pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine")`
			`def test_fail_if_no_hpus():`
			`with pytest.raises(MisconfigurationException, match="HPUAccelerator can not run on your system"):`
			`Trainer(accelerator="hpu", devices=1)`


			`@RunIf(hpu=True)`
			`def test_accelerator_selected():`
			`trainer = Trainer(accelerator="hpu")`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`


			`@RunIf(hpu=True)`
			`def test_all_stages(tmpdir, hpus):`
			`"""Tests all the model stages using BoringModel on HPU."""`
			`model = BoringModel()`

			`trainer = Trainer(`
			`default_root_dir=tmpdir,`
			`fast_dev_run=True,`
			`accelerator="hpu",`
			`devices=hpus,`
			`precision=16,`
			`)`
			`trainer.fit(model)`
			`trainer.validate(model)`
			`trainer.test(model)`
			`trainer.predict(model)`


			`@RunIf(hpu=True)`
			`def test_optimization(tmpdir):`
			`seed_everything(42)`

			`dm = ClassifDataModule(length=1024)`
			`model = ClassificationModel()`

			`trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="hpu", devices=1)`

			`# fit model`
			`trainer.fit(model, dm)`
			`assert trainer.state.finished, f"Training failed with {trainer.state}"`
			`assert dm.trainer is not None`

			`# validate`
			`result = trainer.validate(datamodule=dm)`
			`assert dm.trainer is not None`
			`assert result[0]["val_acc"] > 0.7`

			`# test`
			`result = trainer.test(model, datamodule=dm)`
			`assert dm.trainer is not None`
			`test_result = result[0]["test_acc"]`
			`assert test_result > 0.6`

			`# test saved model`
			`model_path = os.path.join(tmpdir, "model.pt")`
			`trainer.save_checkpoint(model_path)`

			`model = ClassificationModel.load_from_checkpoint(model_path)`

			`trainer = Trainer(default_root_dir=tmpdir, accelerator="hpu", devices=1)`

			`result = trainer.test(model, datamodule=dm)`
			`saved_result = result[0]["test_acc"]`
			`assert saved_result == test_result`


			`@RunIf(hpu=True)`
			`def test_stages_correct(tmpdir):`
			`"""Ensure all stages correctly are traced correctly by asserting the output for each stage."""`

			`class StageModel(BoringModel):`
			`def training_step(self, batch, batch_idx):`
			`loss = super().training_step(batch, batch_idx)`
			`loss = loss.get("loss")`
			`# tracing requires a loss value that depends on the model.`
			`# force it to be a value but ensure we use the loss.`
			`loss = (loss - loss) + torch.tensor(1)`
			`return {"loss": loss}`

			`def validation_step(self, batch, batch_idx):`
			`loss = super().validation_step(batch, batch_idx)`
			`x = loss.get("x")`
			`x = (x - x) + torch.tensor(2)`
			`return {"x": x}`

			`def test_step(self, batch, batch_idx):`
			`loss = super().test_step(batch, batch_idx)`
			`y = loss.get("y")`
			`y = (y - y) + torch.tensor(3)`
			`return {"y": y}`

			`def predict_step(self, batch, batch_idx, dataloader_idx=None):`
			`output = super().predict_step(batch, batch_idx)`
			`return (output - output) + torch.tensor(4)`

			`class TestCallback(Callback):`
			`def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:`
			`assert outputs["loss"].item() == 1`

			`def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:`
			`assert outputs["x"].item() == 2`

			`def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:`
			`assert outputs["y"].item() == 3`

			`def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:`
			`assert torch.all(outputs == 4).item()`

			`model = StageModel()`
			`trainer = Trainer(`
			`default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=1, callbacks=TestCallback()`
			`)`
			`trainer.fit(model)`
			`trainer.test(model)`
			`trainer.validate(model)`
			`trainer.predict(model)`


			`@RunIf(hpu=True)`
			`def test_accelerator_hpu():`

			`trainer = Trainer(accelerator="hpu", devices=1)`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`
			`assert trainer.num_devices == 1`

			`trainer = Trainer(accelerator="hpu")`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`
			`assert trainer.num_devices == 8`

			`trainer = Trainer(accelerator="auto", devices=8)`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`
			`assert trainer.num_devices == 8`


			`@RunIf(hpu=True)`
			`def test_accelerator_hpu_with_single_device():`

			`trainer = Trainer(accelerator="hpu", devices=1)`

			`assert isinstance(trainer.strategy, SingleHPUStrategy)`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`


			`@RunIf(hpu=True)`
			`def test_accelerator_hpu_with_multiple_devices():`

			`trainer = Trainer(accelerator="hpu", devices=8)`

			`assert isinstance(trainer.strategy, HPUParallelStrategy)`
			`assert isinstance(trainer.accelerator, HPUAccelerator)`


			`@RunIf(hpu=True)`
			`def test_accelerator_auto_with_devices_hpu():`

			`trainer = Trainer(accelerator="auto", devices=8)`

			`assert isinstance(trainer.strategy, HPUParallelStrategy)`


			`@RunIf(hpu=True)`
			`def test_strategy_choice_hpu_plugin():`
			`trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1)`
			`assert isinstance(trainer.strategy, SingleHPUStrategy)`

			`trainer = Trainer(accelerator="hpu", devices=1)`
			`assert isinstance(trainer.strategy, SingleHPUStrategy)`


			`@RunIf(hpu=True)`
			`def test_strategy_choice_hpu_parallel_plugin():`
			`trainer = Trainer(`
			`strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * 8), accelerator="hpu", devices=8`
			`)`
			`assert isinstance(trainer.strategy, HPUParallelStrategy)`

			`trainer = Trainer(accelerator="hpu", devices=8)`
			`assert isinstance(trainer.strategy, HPUParallelStrategy)`


			`@RunIf(hpu=True)`
			`def test_devices_auto_choice_hpu():`
			`trainer = Trainer(accelerator="auto", devices="auto")`
			`assert trainer.num_devices == 8`


			`@RunIf(hpu=True)`
			`@pytest.mark.parametrize("hpus", [1])`
			`def test_inference_only(tmpdir, hpus):`
			`model = BoringModel()`

			`trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="hpu", devices=hpus)`
			`trainer.validate(model)`
			`trainer.test(model)`
			`trainer.predict(model)`


			`def test_hpu_auto_device_count():`
			`assert HPUAccelerator.auto_device_count() == 8`


			`@RunIf(hpu=True)`
			`def test_hpu_unsupported_device_type():`
			with pytest.raises(MisconfigurationException, match="`devices` for `HPUAccelerator` must be int, string or None."):
			`Trainer(accelerator="hpu", devices=[1])`