[IPU] Allow poptorch.Options to override Trainer (#8233)

* Add test for poptorch Options

* Hacks to get manual plugin support

* Revert changes

* Fix tests + ensure logic follows suit

* Update pytorch_lightning/plugins/training_type/ipu.py

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Cleaner

* Cleaner

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Sean Naren, 2021-07-05 14:42:00 +01:00 (committed by GitHub)
parent 5cef9772a4
commit 6d558961e3
5 changed files with 96 additions and 162 deletions
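
Below is a minimal usage sketch, not part of the diff, of what this change enables: custom poptorch.Options can be passed straight to IPUPlugin and are respected as-is, without setting the Trainer's ipus flag or accumulate_grad_batches (an IPU-enabled environment with poptorch installed is assumed).

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

training_opts = poptorch.Options()
training_opts.replicationFactor(2)                # train on 2 IPUs
training_opts.Training.gradientAccumulation(2)

inference_opts = poptorch.Options()
inference_opts.replicationFactor(1)               # validate/test on a single IPU

plugin = IPUPlugin(training_opts=training_opts, inference_opts=inference_opts)
trainer = Trainer(fast_dev_run=True, plugins=plugin)  # no `ipus=...` flag needed

# The options are kept as provided rather than being overwritten by Trainer flags.
assert plugin.training_opts.Training.gradient_accumulation == 2
assert plugin.inference_opts.replication_factor == 1
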

pytorch_lightning/plugins/training_type/ipu.py

@@ -26,7 +26,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ class IPUPlugin(ParallelPlugin):
self._handle_gradient_accumulation_steps()
@property
def replication_factor(self):
def replication_factor(self) -> int:
if not self.lightning_module:
# The plugin has been passed in by the user and has not been connected to the Trainer.
# Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
# In this scenario we prioritize the training options.
if self._training_opts:
return self._training_opts.replication_factor
if self._inference_opts:
return self._inference_opts.replication_factor
return len(self.parallel_devices)
def _create_opts(self, training: bool):
def _create_opts(self, training: bool) -> 'poptorch.Options':
opts = poptorch.Options()
opts.deviceIterations(self.device_iterations)
opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@
def training_opts(self) -> 'poptorch.Options':
if self._training_opts is None:
self._training_opts = self._create_opts(training=True)
self._validate_opts(self._training_opts, training=True)
return self._training_opts
@property
def inference_opts(self) -> 'poptorch.Options':
if self._inference_opts is None:
self._inference_opts = self._create_opts(training=False)
self._validate_opts(self._inference_opts, training=False)
return self._inference_opts
def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
if opts is not None:
if opts.replication_factor != self.replication_factor:
rank_zero_warn(
f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
f"Setting to {self.replication_factor} in the poptorch.Options."
)
opts.set(replication_factor=self.replication_factor)
if training:
accumulate_grad_batches = self.accumulate_grad_batches
if opts.Training.gradient_accumulation != accumulate_grad_batches:
rank_zero_warn(
f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
)
opts.Training.set(gradient_accumulation=accumulate_grad_batches)
elif opts.Training.gradient_accumulation != 1:
rank_zero_warn(
"Inference poptorch.Options should set gradientAccumulation to 1. "
"Setting gradientAccumulation to 1 for inference options."
)
opts.Training.set(gradient_accumulation=1)
@property
def lightning_module(self) -> Optional['pl.LightningModule']:
return self.model.module if isinstance(self.model, LightningIPUModule) else self.model
def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=True)
def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)
def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)
def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
return self.process_dataloader(dataloader)
return self._process_dataloader(dataloader, is_training=False)
def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
def _process_dataloader(
self,
dataloader: Union[Iterable, DataLoader],
is_training: bool,
) -> Union[Iterable, DataLoader]:
if isinstance(dataloader, CombinedLoader):
dataloader.loaders = apply_to_collection(
dataloader.loaders,
DataLoader,
self.process_dataloader,
dataloader.loaders, DataLoader, self._process_dataloader, is_training
)
return dataloader
if isinstance(dataloader, list):
dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
return dataloader
if not isinstance(dataloader, poptorch.DataLoader):
is_training = self.lightning_module.trainer.training
opts = self.training_opts if is_training else self.inference_opts
dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
return dataloader
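
For clarity, a hedged sketch (not in the commit) of the new replication_factor fallback above: when the plugin has not yet been connected to a Trainer, the user-supplied training options take priority, then the inference options, then the number of parallel devices.

import poptorch
from pytorch_lightning.plugins import IPUPlugin

training_opts = poptorch.Options()
training_opts.replicationFactor(2)
inference_opts = poptorch.Options()
inference_opts.replicationFactor(1)

plugin = IPUPlugin(training_opts=training_opts, inference_opts=inference_opts)
# Not attached to a Trainer yet, so the training options win.
assert plugin.replication_factor == 2
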

pytorch_lightning/trainer/connectors/accelerator_connector.py

@@ -259,7 +259,7 @@ class AcceleratorConnector(object):
@property
def on_ipu(self) -> bool:
return self.ipus is not None
return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)
@property
def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@ class AcceleratorConnector(object):
return 0
return len(gpus)
@property
def num_ipus(self) -> int:
if isinstance(self.ipus, int):
return self.ipus
if isinstance(self._training_type_plugin, IPUPlugin):
return self._training_type_plugin.replication_factor
return 0
@property
def parallel_devices(self) -> List[Union[torch.device, int]]:
if self.on_gpu:
@@ -337,8 +345,7 @@
if isinstance(self.tpu_cores, int):
devices = list(range(self.tpu_cores))
elif self.on_ipu:
if isinstance(self.ipus, int):
devices = list(range(self.ipus))
devices = list(range(self.num_ipus))
else:
devices = [torch.device("cpu")] * self.num_processes
return devices
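
A short sketch, assuming an IPU-enabled environment, of the resolution order the new num_ipus property introduces: an explicit ipus flag wins, otherwise the count falls back to an IPUPlugin's replication factor, otherwise zero. Trainer.ipus (changed in the next file) exposes this value.

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

opts = poptorch.Options()
opts.replicationFactor(4)

# No explicit `ipus` flag: the count is inferred from the plugin's training options.
trainer = Trainer(plugins=IPUPlugin(training_opts=opts))
assert trainer.ipus == 4
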

pytorch_lightning/trainer/properties.py

@@ -137,7 +137,7 @@ class TrainerProperties(ABC):
@property
def ipus(self) -> int:
return self.accelerator_connector.ipus
return self.accelerator_connector.num_ipus
@property
def num_gpus(self) -> int:

pytorch_lightning/trainer/trainer.py

@@ -23,7 +23,7 @@ from weakref import proxy
import torch
import pytorch_lightning as pl
from pytorch_lightning.accelerators import Accelerator
from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.core.datamodule import LightningDataModule
from pytorch_lightning.core.memory import ModelSummary
@@ -1209,7 +1209,7 @@ class Trainer(
" `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
)
if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
rank_zero_warn(
"IPU available but not used. Set the `ipus` flag in your trainer"
" `Trainer(ipus=8)` or script `--ipus=8`."

tests/accelerators/test_ipu.py

@@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.supporters import CombinedLoader
from pytorch_lightning.utilities import _IPU_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
assert isinstance(trainer.accelerator, IPUAccelerator)
@RunIf(ipu=True)
def test_warning_if_ipus_not_used(tmpdir):
with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
Trainer(default_root_dir=tmpdir)
@RunIf(ipu=True)
def test_no_warning_plugin(tmpdir):
with pytest.warns(None) as record:
Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
assert len(record) == 0
@RunIf(ipu=True)
@pytest.mark.parametrize('ipus', [1, 4])
def test_all_stages(tmpdir, ipus):
@@ -363,141 +377,73 @@ def test_manual_poptorch_opts(tmpdir):
assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts
@RunIf(ipu=True)
def test_manual_poptorch_opts_ipu_count(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and the number of ipus do not match, we warn and we set it for the user.
"""
manual_ipus = 1
expected_ipus = 2
model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.replicationFactor(manual_ipus)
training_opts = poptorch.Options()
training_opts.replicationFactor(manual_ipus)
trainer = Trainer(
default_root_dir=tmpdir,
ipus=expected_ipus,
fast_dev_run=True,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
f"Setting to {expected_ipus} in the poptorch.Options."
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
@RunIf(ipu=True)
def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and grad accumulation is set greater than 1 for inference, we warn and set to 1.
"""
model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.Training.gradientAccumulation(4)
training_opts = poptorch.Options()
training_opts.Training.gradientAccumulation(1)
trainer = Trainer(
default_root_dir=tmpdir,
ipus=1,
fast_dev_run=True,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match="Inference poptorch.Options should set gradientAccumulation to 1. "
"Setting gradientAccumulation to 1 for inference options.",
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
@RunIf(ipu=True)
def test_manual_poptorch_opts_train_grad_accum(tmpdir):
"""
Ensure if the user passes manual poptorch Options
and grad accumulation differs to accumulate_grad_batches, we
"""
model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.Training.gradientAccumulation(1)
training_opts = poptorch.Options()
training_opts.Training.gradientAccumulation(2)
trainer = Trainer(
default_root_dir=tmpdir,
ipus=1,
fast_dev_run=True,
accumulate_grad_batches=1,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
with pytest.warns(
UserWarning,
match=f"Training poptorch.Options set gradientAccumulation to {2}. "
f"This is different to accumulate_grad_batches which was set to {1}. "
f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
f"Setting poptorch.Options gradientAccumulation to {1}",
):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
@RunIf(ipu=True)
def test_manual_poptorch_opts_custom(tmpdir):
"""
Ensure if the user passes manual poptorch Options with custom parameters set,
we respect them in our poptorch options.
we respect them in our poptorch options and the dataloaders.
"""
model = IPUModel()
inference_opts = poptorch.Options()
inference_opts.deviceIterations(16)
inference_opts.replicationFactor(2)
inference_opts.Training.gradientAccumulation(1)
training_opts = poptorch.Options()
training_opts.deviceIterations(8)
training_opts.replicationFactor(2)
training_opts.Training.gradientAccumulation(2)
trainer = Trainer(
default_root_dir=tmpdir,
ipus=2,
fast_dev_run=True,
accumulate_grad_batches=2,
plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
)
inference_opts = poptorch.Options()
inference_opts.deviceIterations(16)
inference_opts.replicationFactor(1)
inference_opts.Training.gradientAccumulation(1)
class TestCallback(Callback):
def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
# ensure dataloaders were correctly set up during training.
plugin = trainer.accelerator.training_type_plugin
assert isinstance(plugin, IPUPlugin)
assert plugin.training_opts.replication_factor == 2
assert plugin.inference_opts.replication_factor == 1
val_dataloader = trainer.val_dataloaders[0]
train_dataloader = trainer.train_dataloader
assert isinstance(train_dataloader, CombinedLoader)
train_dataloader = train_dataloader.loaders
assert isinstance(val_dataloader, poptorch.DataLoader)
assert isinstance(train_dataloader, poptorch.DataLoader)
assert train_dataloader.options.replication_factor == 2
assert val_dataloader.options.replication_factor == 1
plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
# ensure we default to the training options replication factor
assert plugin.replication_factor == 2
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
trainer.fit(model)
plugin = trainer.accelerator.training_type_plugin
assert isinstance(plugin, IPUPlugin)
inference_opts = plugin.inference_opts
training_opts = plugin.training_opts
assert inference_opts.device_iterations == 16
assert inference_opts.replication_factor == 2
assert inference_opts.Training.gradient_accumulation == 1
training_opts = plugin.training_opts
assert training_opts.device_iterations == 8
assert training_opts.replication_factor == 2
assert training_opts.Training.gradient_accumulation == 2
inference_opts = plugin.inference_opts
assert inference_opts.device_iterations == 16
assert inference_opts.replication_factor == 1
assert inference_opts.Training.gradient_accumulation == 1
@RunIf(ipu=True)
def test_replication_factor(tmpdir):
"""
Ensure if the user passes manual poptorch Options with custom parameters set,
we set them correctly in the dataloaders.
"""
plugin = IPUPlugin()
trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
assert trainer.ipus == 2
@RunIf(ipu=True)
def test_default_opts(tmpdir):