diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py
index 9de4e81447..f498a4cd04 100644
--- a/pytorch_lightning/plugins/training_type/ipu.py
+++ b/pytorch_lightning/plugins/training_type/ipu.py
@@ -26,7 +26,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEn
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.trainer.supporters import CombinedLoader
-from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ class IPUPlugin(ParallelPlugin):
         self._handle_gradient_accumulation_steps()
 
     @property
-    def replication_factor(self):
+    def replication_factor(self) -> int:
+        if not self.lightning_module:
+            # The plugin has been passed in by the user and has not been connected to the Trainer.
+            # Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
+            # In this scenario we prioritize the training options.
+            if self._training_opts:
+                return self._training_opts.replication_factor
+            if self._inference_opts:
+                return self._inference_opts.replication_factor
         return len(self.parallel_devices)
 
-    def _create_opts(self, training: bool):
+    def _create_opts(self, training: bool) -> 'poptorch.Options':
         opts = poptorch.Options()
         opts.deviceIterations(self.device_iterations)
         opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@ class IPUPlugin(ParallelPlugin):
     def training_opts(self) -> 'poptorch.Options':
         if self._training_opts is None:
             self._training_opts = self._create_opts(training=True)
-        self._validate_opts(self._training_opts, training=True)
         return self._training_opts
 
     @property
     def inference_opts(self) -> 'poptorch.Options':
         if self._inference_opts is None:
             self._inference_opts = self._create_opts(training=False)
-        self._validate_opts(self._inference_opts, training=False)
         return self._inference_opts
 
-    def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
-        if opts is not None:
-            if opts.replication_factor != self.replication_factor:
-                rank_zero_warn(
-                    f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
-                    f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
-                    f"Setting to {self.replication_factor} in the poptorch.Options."
-                )
-                opts.set(replication_factor=self.replication_factor)
-            if training:
-                accumulate_grad_batches = self.accumulate_grad_batches
-                if opts.Training.gradient_accumulation != accumulate_grad_batches:
-                    rank_zero_warn(
-                        f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
-                        f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
-                        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-                        f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
-                    )
-                    opts.Training.set(gradient_accumulation=accumulate_grad_batches)
-            elif opts.Training.gradient_accumulation != 1:
-                rank_zero_warn(
-                    "Inference poptorch.Options should set gradientAccumulation to 1. "
-                    "Setting gradientAccumulation to 1 for inference options."
-                )
-                opts.Training.set(gradient_accumulation=1)
-
     @property
     def lightning_module(self) -> Optional['pl.LightningModule']:
         return self.model.module if isinstance(self.model, LightningIPUModule) else self.model
 
     def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=True)
 
     def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
-    def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+    def _process_dataloader(
+        self,
+        dataloader: Union[Iterable, DataLoader],
+        is_training: bool,
+    ) -> Union[Iterable, DataLoader]:
         if isinstance(dataloader, CombinedLoader):
             dataloader.loaders = apply_to_collection(
-                dataloader.loaders,
-                DataLoader,
-                self.process_dataloader,
+                dataloader.loaders, DataLoader, self._process_dataloader, is_training
             )
             return dataloader
         if isinstance(dataloader, list):
-            dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
+            dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
             return dataloader
         if not isinstance(dataloader, poptorch.DataLoader):
-            is_training = self.lightning_module.trainer.training
             opts = self.training_opts if is_training else self.inference_opts
             dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
         return dataloader
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index f283c38d4d..ea63494dfb 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -259,7 +259,7 @@ class AcceleratorConnector(object):
 
     @property
     def on_ipu(self) -> bool:
-        return self.ipus is not None
+        return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)
 
     @property
     def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@
             return 0
         return len(gpus)
 
+    @property
+    def num_ipus(self) -> int:
+        if isinstance(self.ipus, int):
+            return self.ipus
+        if isinstance(self._training_type_plugin, IPUPlugin):
+            return self._training_type_plugin.replication_factor
+        return 0
+
     @property
     def parallel_devices(self) -> List[Union[torch.device, int]]:
         if self.on_gpu:
@@ -337,8 +345,7 @@
             if isinstance(self.tpu_cores, int):
                 devices = list(range(self.tpu_cores))
         elif self.on_ipu:
-            if isinstance(self.ipus, int):
-                devices = list(range(self.ipus))
+            devices = list(range(self.num_ipus))
         else:
             devices = [torch.device("cpu")] * self.num_processes
         return devices
diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py
index a9cf6e3bfb..f57110368e 100644
--- a/pytorch_lightning/trainer/properties.py
+++ b/pytorch_lightning/trainer/properties.py
@@ -137,7 +137,7 @@ class TrainerProperties(ABC):
 
     @property
     def ipus(self) -> int:
-        return self.accelerator_connector.ipus
+        return self.accelerator_connector.num_ipus
 
     @property
     def num_gpus(self) -> int:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 38eef32312..b984608c87 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -23,7 +23,7 @@ from weakref import proxy
 import torch
 
 import pytorch_lightning as pl
-from pytorch_lightning.accelerators import Accelerator
+from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.core.memory import ModelSummary
@@ -1209,7 +1209,7 @@ class Trainer(
                 " `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
             )
 
-        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
+        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
             rank_zero_warn(
                 "IPU available but not used. Set the `ipus` flag in your trainer"
                 " `Trainer(ipus=8)` or script `--ipus=8`."
diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py
index 363648c9f6..78176b76f5 100644
--- a/tests/accelerators/test_ipu.py
+++ b/tests/accelerators/test_ipu.py
@@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
 from pytorch_lightning.trainer.states import RunningStage
+from pytorch_lightning.trainer.supporters import CombinedLoader
 from pytorch_lightning.utilities import _IPU_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
     assert isinstance(trainer.accelerator, IPUAccelerator)
 
 
+@RunIf(ipu=True)
+def test_warning_if_ipus_not_used(tmpdir):
+    with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
+        Trainer(default_root_dir=tmpdir)
+
+
+@RunIf(ipu=True)
+def test_no_warning_plugin(tmpdir):
+    with pytest.warns(None) as record:
+        Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
+    assert len(record) == 0
+
+
 @RunIf(ipu=True)
 @pytest.mark.parametrize('ipus', [1, 4])
 def test_all_stages(tmpdir, ipus):
@@ -363,141 +377,73 @@ def test_manual_poptorch_opts(tmpdir):
     assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts
 
 
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_ipu_count(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and the number of ipus do not match, we warn and we set it for the user.
-    """
-
-    manual_ipus = 1
-    expected_ipus = 2
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.replicationFactor(manual_ipus)
-
-    training_opts = poptorch.Options()
-    training_opts.replicationFactor(manual_ipus)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=expected_ipus,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
-        f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
-        f"Setting to {expected_ipus} in the poptorch.Options."
-    ):
-        trainer.fit(model)
-        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-        assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
-        assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation is set greater than 1 for inference, we warn and set to 1.
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(4)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(1)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match="Inference poptorch.Options should set gradientAccumulation to 1. "
-        "Setting gradientAccumulation to 1 for inference options.",
-    ):
-        trainer.fit(model)
-        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-        assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_train_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation differs to accumulate_grad_batches, we
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(1)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(2)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        accumulate_grad_batches=1,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Training poptorch.Options set gradientAccumulation to {2}. "
-        f"This is different to accumulate_grad_batches which was set to {1}. "
-        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-        f"Setting poptorch.Options gradientAccumulation to {1}",
-    ):
-        trainer.fit(model)
-        assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-        assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
 @RunIf(ipu=True)
 def test_manual_poptorch_opts_custom(tmpdir):
     """
     Ensure if the user passes manual poptorch Options with custom parameters set,
-    we respect them in our poptorch options.
+    we respect them in our poptorch options and the dataloaders.
     """
 
     model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.deviceIterations(16)
-    inference_opts.replicationFactor(2)
-    inference_opts.Training.gradientAccumulation(1)
-
     training_opts = poptorch.Options()
     training_opts.deviceIterations(8)
     training_opts.replicationFactor(2)
     training_opts.Training.gradientAccumulation(2)
 
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=2,
-        fast_dev_run=True,
-        accumulate_grad_batches=2,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
+    inference_opts = poptorch.Options()
+    inference_opts.deviceIterations(16)
+    inference_opts.replicationFactor(1)
+    inference_opts.Training.gradientAccumulation(1)
+
+    class TestCallback(Callback):
+
+        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            # ensure dataloaders were correctly set up during training.
+            plugin = trainer.accelerator.training_type_plugin
+            assert isinstance(plugin, IPUPlugin)
+            assert plugin.training_opts.replication_factor == 2
+            assert plugin.inference_opts.replication_factor == 1
+
+            val_dataloader = trainer.val_dataloaders[0]
+            train_dataloader = trainer.train_dataloader
+            assert isinstance(train_dataloader, CombinedLoader)
+            train_dataloader = train_dataloader.loaders
+            assert isinstance(val_dataloader, poptorch.DataLoader)
+            assert isinstance(train_dataloader, poptorch.DataLoader)
+            assert train_dataloader.options.replication_factor == 2
+            assert val_dataloader.options.replication_factor == 1
+
+    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
+    # ensure we default to the training options replication factor
+    assert plugin.replication_factor == 2
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
     trainer.fit(model)
+
     plugin = trainer.accelerator.training_type_plugin
     assert isinstance(plugin, IPUPlugin)
-    inference_opts = plugin.inference_opts
-    training_opts = plugin.training_opts
-    assert inference_opts.device_iterations == 16
-    assert inference_opts.replication_factor == 2
-    assert inference_opts.Training.gradient_accumulation == 1
 
+    training_opts = plugin.training_opts
     assert training_opts.device_iterations == 8
     assert training_opts.replication_factor == 2
     assert training_opts.Training.gradient_accumulation == 2
 
+    inference_opts = plugin.inference_opts
+    assert inference_opts.device_iterations == 16
+    assert inference_opts.replication_factor == 1
+    assert inference_opts.Training.gradient_accumulation == 1
+
+
+@RunIf(ipu=True)
+def test_replication_factor(tmpdir):
+    """
+    Ensure if the user passes manual poptorch Options with custom parameters set,
+    we set them correctly in the dataloaders.
+    """
+
+    plugin = IPUPlugin()
+    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
+    assert trainer.ipus == 2
+
 
 @RunIf(ipu=True)
 def test_default_opts(tmpdir):