[IPU] Allow poptorch.Options to override Trainer (#8233)
* Add test for poptorch Options
* Hacks to get manual plugin support
* Revert changes
* Fix tests + ensure logic follows suit
* Update pytorch_lightning/plugins/training_type/ipu.py (Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Cleaner
* Cleaner

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 5cef9772a4
commit 6d558961e3
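In user terms, this change makes a manually supplied poptorch.Options object the source of truth: the IPUPlugin now infers the replication factor from the options (preferring the training options), the AcceleratorConnector recognises an IPUPlugin passed via `plugins=...` even without the `ipus` flag, and the old warn-and-override validation is removed. Below is a minimal usage sketch based on the tests in this diff; the replication factors and device iteration counts are placeholder values, and an IPU environment with poptorch installed is assumed.

import poptorch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import IPUPlugin

# Describe the IPU execution explicitly instead of using Trainer(ipus=...).
training_opts = poptorch.Options()
training_opts.replicationFactor(2)      # placeholder: replicate training over 2 IPUs
training_opts.deviceIterations(8)

inference_opts = poptorch.Options()
inference_opts.replicationFactor(1)     # placeholder: run val/test/predict on 1 IPU
inference_opts.deviceIterations(16)

plugin = IPUPlugin(training_opts=training_opts, inference_opts=inference_opts)

# The plugin is not yet attached to a Trainer, so it reports the replication
# factor taken from the training options (see the new `replication_factor` property).
assert plugin.replication_factor == 2

# No `ipus` flag needed: the connector detects the IPUPlugin and derives the
# device count from it (see the new `num_ipus` property).
trainer = Trainer(plugins=plugin)
assert trainer.ipus == 2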
@@ -26,7 +26,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEn
 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
 from pytorch_lightning.trainer.states import RunningStage
 from pytorch_lightning.trainer.supporters import CombinedLoader
-from pytorch_lightning.utilities import _POPTORCH_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import _POPTORCH_AVAILABLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -129,10 +129,18 @@ class IPUPlugin(ParallelPlugin):
         self._handle_gradient_accumulation_steps()
 
     @property
-    def replication_factor(self):
+    def replication_factor(self) -> int:
+        if not self.lightning_module:
+            # The plugin has been passed in by the user and has not been connected to the Trainer.
+            # Check if the user has passed in custom poptorch.Options to infer number of IPUs being used.
+            # In this scenario we prioritize the training options.
+            if self._training_opts:
+                return self._training_opts.replication_factor
+            if self._inference_opts:
+                return self._inference_opts.replication_factor
         return len(self.parallel_devices)
 
-    def _create_opts(self, training: bool):
+    def _create_opts(self, training: bool) -> 'poptorch.Options':
         opts = poptorch.Options()
         opts.deviceIterations(self.device_iterations)
         opts.replicationFactor(self.replication_factor)
@@ -147,71 +155,44 @@ class IPUPlugin(ParallelPlugin):
     def training_opts(self) -> 'poptorch.Options':
         if self._training_opts is None:
             self._training_opts = self._create_opts(training=True)
-        self._validate_opts(self._training_opts, training=True)
         return self._training_opts
 
     @property
     def inference_opts(self) -> 'poptorch.Options':
         if self._inference_opts is None:
             self._inference_opts = self._create_opts(training=False)
-        self._validate_opts(self._inference_opts, training=False)
         return self._inference_opts
 
-    def _validate_opts(self, opts: 'poptorch.Options', training: bool) -> None:
-        if opts is not None:
-            if opts.replication_factor != self.replication_factor:
-                rank_zero_warn(
-                    f"Manual poptorch.Options set replicationFactor to {opts.replication_factor} "
-                    f"which differs to the ipus={self.replication_factor} flag passed to the Trainer. "
-                    f"Setting to {self.replication_factor} in the poptorch.Options."
-                )
-                opts.set(replication_factor=self.replication_factor)
-            if training:
-                accumulate_grad_batches = self.accumulate_grad_batches
-                if opts.Training.gradient_accumulation != accumulate_grad_batches:
-                    rank_zero_warn(
-                        f"Training poptorch.Options set gradientAccumulation to {opts.Training.gradient_accumulation}. "
-                        f"This is different to accumulate_grad_batches which was set to {accumulate_grad_batches}. "
-                        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-                        f"Setting poptorch.Options gradientAccumulation to {accumulate_grad_batches}"
-                    )
-                    opts.Training.set(gradient_accumulation=accumulate_grad_batches)
-            elif opts.Training.gradient_accumulation != 1:
-                rank_zero_warn(
-                    "Inference poptorch.Options should set gradientAccumulation to 1. "
-                    "Setting gradientAccumulation to 1 for inference options."
-                )
-                opts.Training.set(gradient_accumulation=1)
-
     @property
     def lightning_module(self) -> Optional['pl.LightningModule']:
         return self.model.module if isinstance(self.model, LightningIPUModule) else self.model
 
     def on_reset_train_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=True)
 
     def on_reset_val_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_test_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
     def on_reset_predict_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
-        return self.process_dataloader(dataloader)
+        return self._process_dataloader(dataloader, is_training=False)
 
-    def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]:
+    def _process_dataloader(
+        self,
+        dataloader: Union[Iterable, DataLoader],
+        is_training: bool,
+    ) -> Union[Iterable, DataLoader]:
         if isinstance(dataloader, CombinedLoader):
             dataloader.loaders = apply_to_collection(
-                dataloader.loaders,
-                DataLoader,
-                self.process_dataloader,
+                dataloader.loaders, DataLoader, self._process_dataloader, is_training
             )
             return dataloader
         if isinstance(dataloader, list):
-            dataloader = apply_to_collection(dataloader, DataLoader, self.process_dataloader)
+            dataloader = apply_to_collection(dataloader, DataLoader, self._process_dataloader, is_training)
             return dataloader
         if not isinstance(dataloader, poptorch.DataLoader):
-            is_training = self.lightning_module.trainer.training
             opts = self.training_opts if is_training else self.inference_opts
             dataloader = self._convert_to_poptorch_loader(dataloader=dataloader, opts=opts)
         return dataloader
@@ -259,7 +259,7 @@ class AcceleratorConnector(object):
 
     @property
     def on_ipu(self) -> bool:
-        return self.ipus is not None
+        return self.ipus is not None or isinstance(self._training_type_plugin, IPUPlugin)
 
     @property
     def tpu_id(self) -> Optional[int]:
@@ -327,6 +327,14 @@ class AcceleratorConnector(object):
             return 0
         return len(gpus)
 
+    @property
+    def num_ipus(self) -> int:
+        if isinstance(self.ipus, int):
+            return self.ipus
+        if isinstance(self._training_type_plugin, IPUPlugin):
+            return self._training_type_plugin.replication_factor
+        return 0
+
     @property
     def parallel_devices(self) -> List[Union[torch.device, int]]:
         if self.on_gpu:
@@ -337,8 +345,7 @@ class AcceleratorConnector(object):
             if isinstance(self.tpu_cores, int):
                 devices = list(range(self.tpu_cores))
         elif self.on_ipu:
-            if isinstance(self.ipus, int):
-                devices = list(range(self.ipus))
+            devices = list(range(self.num_ipus))
         else:
             devices = [torch.device("cpu")] * self.num_processes
         return devices
@@ -137,7 +137,7 @@ class TrainerProperties(ABC):
 
     @property
     def ipus(self) -> int:
-        return self.accelerator_connector.ipus
+        return self.accelerator_connector.num_ipus
 
     @property
     def num_gpus(self) -> int:
@@ -23,7 +23,7 @@ from weakref import proxy
 import torch
 
 import pytorch_lightning as pl
-from pytorch_lightning.accelerators import Accelerator
+from pytorch_lightning.accelerators import Accelerator, IPUAccelerator
 from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.core.memory import ModelSummary
@@ -1209,7 +1209,7 @@ class Trainer(
                 " `Trainer(tpu_cores=8)` or script `--tpu_cores=8`."
             )
 
-        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU:
+        if _IPU_AVAILABLE and self._device_type != DeviceType.IPU and not isinstance(self.accelerator, IPUAccelerator):
             rank_zero_warn(
                 "IPU available but not used. Set the `ipus` flag in your trainer"
                 " `Trainer(ipus=8)` or script `--ipus=8`."
@@ -23,6 +23,7 @@ from pytorch_lightning.accelerators import IPUAccelerator
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins import IPUPlugin, IPUPrecisionPlugin
 from pytorch_lightning.trainer.states import RunningStage
+from pytorch_lightning.trainer.supporters import CombinedLoader
 from pytorch_lightning.utilities import _IPU_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
@@ -112,6 +113,19 @@ def test_accelerator_selected(tmpdir):
     assert isinstance(trainer.accelerator, IPUAccelerator)
 
 
+@RunIf(ipu=True)
+def test_warning_if_ipus_not_used(tmpdir):
+    with pytest.warns(UserWarning, match="IPU available but not used. Set the `ipus` flag in your trainer"):
+        Trainer(default_root_dir=tmpdir)
+
+
+@RunIf(ipu=True)
+def test_no_warning_plugin(tmpdir):
+    with pytest.warns(None) as record:
+        Trainer(default_root_dir=tmpdir, plugins=IPUPlugin(training_opts=poptorch.Options()))
+    assert len(record) == 0
+
+
 @RunIf(ipu=True)
 @pytest.mark.parametrize('ipus', [1, 4])
 def test_all_stages(tmpdir, ipus):
@@ -363,141 +377,73 @@ def test_manual_poptorch_opts(tmpdir):
     assert trainer.accelerator.training_type_plugin.inference_opts == inference_opts
 
 
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_ipu_count(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and the number of ipus do not match, we warn and we set it for the user.
-    """
-
-    manual_ipus = 1
-    expected_ipus = 2
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.replicationFactor(manual_ipus)
-
-    training_opts = poptorch.Options()
-    training_opts.replicationFactor(manual_ipus)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=expected_ipus,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Manual poptorch.Options set replicationFactor to {manual_ipus} "
-        f"which differs to the ipus={expected_ipus} flag passed to the Trainer. "
-        f"Setting to {expected_ipus} in the poptorch.Options."
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.training_opts.replication_factor == 2
-    assert trainer.accelerator.training_type_plugin.inference_opts.replication_factor == 2
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_inference_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation is set greater than 1 for inference, we warn and set to 1.
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(4)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(1)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match="Inference poptorch.Options should set gradientAccumulation to 1. "
-        "Setting gradientAccumulation to 1 for inference options.",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
-@RunIf(ipu=True)
-def test_manual_poptorch_opts_train_grad_accum(tmpdir):
-    """
-    Ensure if the user passes manual poptorch Options
-    and grad accumulation differs to accumulate_grad_batches, we
-    """
-
-    model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.Training.gradientAccumulation(1)
-
-    training_opts = poptorch.Options()
-    training_opts.Training.gradientAccumulation(2)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=1,
-        fast_dev_run=True,
-        accumulate_grad_batches=1,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
-    with pytest.warns(
-        UserWarning,
-        match=f"Training poptorch.Options set gradientAccumulation to {2}. "
-        f"This is different to accumulate_grad_batches which was set to {1}. "
-        f"To change gradientAccumulation, please set accumulate_grad_batches in the Trainer. "
-        f"Setting poptorch.Options gradientAccumulation to {1}",
-    ):
-        trainer.fit(model)
-    assert isinstance(trainer.accelerator.training_type_plugin, IPUPlugin)
-    assert trainer.accelerator.training_type_plugin.inference_opts.Training.gradient_accumulation == 1
-
-
 @RunIf(ipu=True)
 def test_manual_poptorch_opts_custom(tmpdir):
     """
     Ensure if the user passes manual poptorch Options with custom parameters set,
-    we respect them in our poptorch options.
+    we respect them in our poptorch options and the dataloaders.
     """
 
     model = IPUModel()
-    inference_opts = poptorch.Options()
-    inference_opts.deviceIterations(16)
-    inference_opts.replicationFactor(2)
-    inference_opts.Training.gradientAccumulation(1)
-
     training_opts = poptorch.Options()
     training_opts.deviceIterations(8)
     training_opts.replicationFactor(2)
     training_opts.Training.gradientAccumulation(2)
 
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        ipus=2,
-        fast_dev_run=True,
-        accumulate_grad_batches=2,
-        plugins=IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
-    )
+    inference_opts = poptorch.Options()
+    inference_opts.deviceIterations(16)
+    inference_opts.replicationFactor(1)
+    inference_opts.Training.gradientAccumulation(1)
+
+    class TestCallback(Callback):
+
+        def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            # ensure dataloaders were correctly set up during training.
+            plugin = trainer.accelerator.training_type_plugin
+            assert isinstance(plugin, IPUPlugin)
+            assert plugin.training_opts.replication_factor == 2
+            assert plugin.inference_opts.replication_factor == 1
+
+            val_dataloader = trainer.val_dataloaders[0]
+            train_dataloader = trainer.train_dataloader
+            assert isinstance(train_dataloader, CombinedLoader)
+            train_dataloader = train_dataloader.loaders
+            assert isinstance(val_dataloader, poptorch.DataLoader)
+            assert isinstance(train_dataloader, poptorch.DataLoader)
+            assert train_dataloader.options.replication_factor == 2
+            assert val_dataloader.options.replication_factor == 1
+
+    plugin = IPUPlugin(inference_opts=inference_opts, training_opts=training_opts)
+    # ensure we default to the training options replication factor
+    assert plugin.replication_factor == 2
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin, callbacks=TestCallback())
     trainer.fit(model)
 
     plugin = trainer.accelerator.training_type_plugin
     assert isinstance(plugin, IPUPlugin)
-    inference_opts = plugin.inference_opts
-    training_opts = plugin.training_opts
-    assert inference_opts.device_iterations == 16
-    assert inference_opts.replication_factor == 2
-    assert inference_opts.Training.gradient_accumulation == 1
 
+    training_opts = plugin.training_opts
     assert training_opts.device_iterations == 8
     assert training_opts.replication_factor == 2
     assert training_opts.Training.gradient_accumulation == 2
 
+    inference_opts = plugin.inference_opts
+    assert inference_opts.device_iterations == 16
+    assert inference_opts.replication_factor == 1
+    assert inference_opts.Training.gradient_accumulation == 1
+
+
+@RunIf(ipu=True)
+def test_replication_factor(tmpdir):
+    """
+    Ensure if the user passes manual poptorch Options with custom parameters set,
+    we set them correctly in the dataloaders.
+    """
+
+    plugin = IPUPlugin()
+    trainer = Trainer(ipus=2, default_root_dir=tmpdir, fast_dev_run=True, plugins=plugin)
+    assert trainer.ipus == 2
 
 
 @RunIf(ipu=True)
 def test_default_opts(tmpdir):