[IPU] Allow poptorch.Options to override Trainer (#8233)
* Add test for poptorch Options
* Hacks to get manual plugin support
* Revert changes
* Fix tests + ensure logic follows suit
* Update pytorch_lightning/plugins/training_type/ipu.py (Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Cleaner
* Cleaner

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 5cef9772a4
commit 6d558961e3
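In user terms, this change makes a manually supplied poptorch.Options object the source of truth: the IPUPlugin now infers the replication factor from the options (preferring the training options), the AcceleratorConnector recognises an IPUPlugin passed via `plugins=...` even without the `ipus` flag, and the old warn-and-override validation is removed. Below is a minimal usage sketch based on the tests in this diff; the replication factors and device iteration counts are placeholder values, and an IPU environment with poptorch installed is assumed.

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

# Describe the IPU execution explicitly instead of using Trainer(ipus=...).
training_opts = poptorch.Options()
training_opts.replicationFactor(2)      # placeholder: replicate training over 2 IPUs
training_opts.deviceIterations(8)

inference_opts = poptorch.Options()
inference_opts.replicationFactor(1)     # placeholder: run val/test/predict on 1 IPU
inference_opts.deviceIterations(16)

plugin = IPUPlugin(training_opts=training_opts, inference_opts=inference_opts)

# The plugin is not yet attached to a Trainer, so it reports the replication
# factor taken from the training options (see the new `replication_factor` property).
assert plugin.replication_factor == 2

# No `ipus` flag needed: the connector detects the IPUPlugin and derives the
# device count from it (see the new `num_ipus` property).
trainer = Trainer(plugins=plugin)
assert trainer.ipus == 2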
@@ -26,7 +26,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEn
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.trainer.supporters import CombinedLoader
-from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ class IPUPlugin(ParallelPlugin):
         self._handle_gradient_accumulation_steps()
 
     @property
-    def replication_factor(self):
+    def replication_factor(self) -> int:
+        if not self.lightning_module:
+            # The plugin has been passed in by the user and has not been connected to the Trainer.
+            # Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
+            # In this scenario we prioritize the training options.
+            if self._training_opts:
+                return self._training_opts.replication_factor
+            if self._inference_opts:
+                return self._inference_opts.replication_factor
         return len(self.parallel_devices)
 
-    def _create_opts(self, training: bool):
+    def _create_opts(self, training: bool) -> 'poptorch.Options':
         opts = poptorch.Options()
         opts.deviceIterations(self.device_iterations)
         opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@ class IPUPlugin(ParallelPlugin):
     def training_opts(self) -> 'poptorch.Options':
         if self._training_opts is None:
             self._training_opts = self._create_opts(training=True)
-        self._validate_opts(self._training_opts, training=True)
         return self._training_opts
 
     @property
     def inference_opts(self) -> 'poptorch.Options':
         if self._inference_opts is None:
             self._inference_opts = self._create_opts(training=False)
-        self._validate_opts(self._inference_opts, training=False)
         return self._inference_opts
 
-    def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
-        if opts is not None:
-            if opts.replication_factor != self.replication_factor:
-                rank_zero_warn(
-                    f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
-                    f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
-                    f"Setting to {self.replication_factor} in the poptorch.Options."
-                )
-                opts.set(replication_factor=self.replication_factor)
-            if training:
-                accumulate_grad_batches = self.accumulate_grad_batches
-                if opts.Training.gradient_accumulation != accumulate_grad_batches:
-                    rank_zero_warn(
-                        f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
-                        f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
-                        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-                        f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
-                    )
-                    opts.Training.set(gradient_accumulation=accumulate_grad_batches)
-            elif opts.Training.gradient_accumulation != 1:
-                rank_zero_warn(
-                    "Inference poptorch.Options should set gradientAccumulation to 1. "
-                    "Setting gradientAccumulation to 1 for inference options."
-                )
-                opts.Training.set(gradient_accumulation=1)
-
     @property
     def lightning_module(self) -> Optional['pl.LightningModule']:
         return self.model.module if isinstance(self.model, LightningIPUModule) else self.model
 
     def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=True)
 
     def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
-    def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+    def _process_dataloader(
+        self,
+        dataloader: Union[Iterable, DataLoader],
+        is_training: bool,
+    ) -> Union[Iterable, DataLoader]:
         if isinstance(dataloader, CombinedLoader):
             dataloader.loaders = apply_to_collection(
-                dataloader.loaders,
-                DataLoader,
-                self.process_dataloader,
+                dataloader.loaders, DataLoader, self._process_dataloader, is_training
             )
             return dataloader
         if isinstance(dataloader, list):
-            dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
+            dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
             return dataloader
         if not isinstance(dataloader, poptorch.DataLoader):
-            is_training = self.lightning_module.trainer.training
             opts = self.training_opts if is_training else self.inference_opts
             dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
         return dataloader
@@ -259,7 +259,7 @@ class AcceleratorConnector(object):
 
     @property
     def on_ipu(self) -> bool:
-        return self.ipus is not None
+        return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)
 
     @property
     def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@ class AcceleratorConnector(object):
             return 0
         return len(gpus)
 
+    @property
+    def num_ipus(self) -> int:
+        if isinstance(self.ipus, int):
+            return self.ipus
+        if isinstance(self._training_type_plugin, IPUPlugin):
+            return self._training_type_plugin.replication_factor
+        return 0
+
     @property
     def parallel_devices(self) -> List[Union[torch.device, int]]:
         if self.on_gpu:
@@ -337,8 +345,7 @@ class AcceleratorConnector(object):
             if isinstance(self.tpu_cores, int):
                 devices = list(range(self.tpu_cores))
         elif self.on_ipu:
-            if isinstance(self.ipus, int):
-                devices = list(range(self.ipus))
+            devices = list(range(self.num_ipus))
         else:
             devices = [torch.device("cpu")] * self.num_processes
         return devices
@@ -137,7 +137,7 @@ class TrainerProperties(ABC):
 
     @property
     def ipus(self) -> int:
-        return self.accelerator_connector.ipus
+        return self.accelerator_connector.num_ipus
 
     @property
     def num_gpus(self) -> int:
@@ -23,7 +23,7 @@ from weakref import proxy
 import torch
 
 import pytorch_lightning as pl
-from pytorch_lightning.accelerators import Accelerator
+from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.core.memory import ModelSummary
@@ -1209,7 +1209,7 @@ class Trainer(
                 " `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
             )
 
-        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
+        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
             rank_zero_warn(
                 "IPU available but not used. Set the `ipus` flag in your trainer"
                 " `Trainer(ipus=8)` or script `--ipus=8`."
@@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
 from pytorch_lightning.trainer.states import RunningStage
+from pytorch_lightning.trainer.supporters import CombinedLoader
 from pytorch_lightning.utilities import _IPU_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
     assert isinstance(trainer.accelerator, IPUAccelerator)
 
 
+@RunIf(ipu=True)
+def test_warning_if_ipus_not_used(tmpdir):
+    with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
+        Trainer(default_root_dir=tmpdir)
+
+
+@RunIf(ipu=True)
+def test_no_warning_plugin(tmpdir):
+    with pytest.warns(None) as record:
+        Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
+    assert len(record) == 0
+
+
 @RunIf(ipu=True)
 @pytest.mark.parametrize('ipus', [1, 4])
 def test_all_stages(tmpdir, ipus):
@@ -363,141 +377,73 @@ def test_manual_poptorch_opts(tmpdir):
     assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts
 
 
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_ipu_count(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and the number of ipus do not match, we warn and we set it for the user.
-    """
-
-    manual_ipus = 1
-    expected_ipus = 2
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.replicationFactor(manual_ipus)
-
-    training_opts = poptorch.Options()
-    training_opts.replicationFactor(manual_ipus)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=expected_ipus,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
-        f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
-        f"Setting to {expected_ipus} in the poptorch.Options."
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
-    assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation is set greater than 1 for inference, we warn and set to 1.
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(4)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(1)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match="Inference poptorch.Options should set gradientAccumulation to 1. "
-        "Setting gradientAccumulation to 1 for inference options.",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_train_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation differs to accumulate_grad_batches, we
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(1)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(2)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        accumulate_grad_batches=1,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Training poptorch.Options set gradientAccumulation to {2}. "
-        f"This is different to accumulate_grad_batches which was set to {1}. "
-        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-        f"Setting poptorch.Options gradientAccumulation to {1}",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
 @RunIf(ipu=True)
 def test_manual_poptorch_opts_custom(tmpdir):
     """
     Ensure if the user passes manual poptorch Options with custom parameters set,
-    we respect them in our poptorch options.
+    we respect them in our poptorch options and the dataloaders.
     """
 
     model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.deviceIterations(16)
-    inference_opts.replicationFactor(2)
-    inference_opts.Training.gradientAccumulation(1)
-
     training_opts = poptorch.Options()
     training_opts.deviceIterations(8)
     training_opts.replicationFactor(2)
     training_opts.Training.gradientAccumulation(2)
 
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=2,
-        fast_dev_run=True,
-        accumulate_grad_batches=2,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
+    inference_opts = poptorch.Options()
+    inference_opts.deviceIterations(16)
+    inference_opts.replicationFactor(1)
+    inference_opts.Training.gradientAccumulation(1)
+
+    class TestCallback(Callback):
+
+        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            # ensure dataloaders were correctly set up during training.
+            plugin = trainer.accelerator.training_type_plugin
+            assert isinstance(plugin, IPUPlugin)
+            assert plugin.training_opts.replication_factor == 2
+            assert plugin.inference_opts.replication_factor == 1
+
+            val_dataloader = trainer.val_dataloaders[0]
+            train_dataloader = trainer.train_dataloader
+            assert isinstance(train_dataloader, CombinedLoader)
+            train_dataloader = train_dataloader.loaders
+            assert isinstance(val_dataloader, poptorch.DataLoader)
+            assert isinstance(train_dataloader, poptorch.DataLoader)
+            assert train_dataloader.options.replication_factor == 2
+            assert val_dataloader.options.replication_factor == 1
+
+    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
+    # ensure we default to the training options replication factor
+    assert plugin.replication_factor == 2
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
     trainer.fit(model)
 
     plugin = trainer.accelerator.training_type_plugin
     assert isinstance(plugin, IPUPlugin)
-    inference_opts = plugin.inference_opts
-    training_opts = plugin.training_opts
-    assert inference_opts.device_iterations == 16
-    assert inference_opts.replication_factor == 2
-    assert inference_opts.Training.gradient_accumulation == 1
 
+    training_opts = plugin.training_opts
     assert training_opts.device_iterations == 8
     assert training_opts.replication_factor == 2
     assert training_opts.Training.gradient_accumulation == 2
 
+    inference_opts = plugin.inference_opts
+    assert inference_opts.device_iterations == 16
+    assert inference_opts.replication_factor == 1
+    assert inference_opts.Training.gradient_accumulation == 1
+
+
+@RunIf(ipu=True)
+def test_replication_factor(tmpdir):
+    """
+    Ensure if the user passes manual poptorch Options with custom parameters set,
+    we set them correctly in the dataloaders.
+    """
+
+    plugin = IPUPlugin()
+    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
+    assert trainer.ipus == 2
 
 
 @RunIf(ipu=True)
 def test_default_opts(tmpdir):