@staticmethod
def _copy_rst(rst_in: str, rst_out: str, as_orphan: bool = False) -> None:
    """Copy an RST page to a new location, optionally marking it as an orphan.

    Args:
        rst_in: path of the existing RST page to read; it is left untouched.
        rst_out: path where the (possibly modified) copy is written.
        as_orphan: if set, ``:orphan:`` followed by a blank line is prepended to the
            copied page, unless the text ``:orphan:`` already appears anywhere in it.

    """
    with open(rst_in, encoding="utf-8") as fopen:
        page = fopen.read()
    if as_orphan and ":orphan:" not in page:
        page = ":orphan:\n\n" + page
    # write to the destination - writing back to `rst_in` would never create the copy
    # and would modify the source page in place
    with open(rst_out, "w", encoding="utf-8") as fopen:
        fopen.write(page)
_gpu_prepare: - ######################################## Hardware agnostic training (preparation) ######################################## diff --git a/docs/source-pytorch/accelerators/ipu_advanced.rst b/docs/source-pytorch/accelerators/ipu_advanced.rst deleted file mode 100644 index 98c1de5825..0000000000 --- a/docs/source-pytorch/accelerators/ipu_advanced.rst +++ /dev/null @@ -1,144 +0,0 @@ -:orphan: - -.. _ipu_advanced: - -Accelerator: IPU training -========================= -**Audience:** Users looking to customize IPU training for massive models. - -.. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature. - ----- - -Advanced IPU options --------------------- - -IPUs provide further optimizations to speed up training. By using the ``IPUStrategy`` we can set the ``device_iterations``, which controls the number of iterations run directly on the IPU devices before returning to the host. Increasing the number of on-device iterations will improve throughput, as there is less device to host communication required. - -.. note:: - - When using model parallelism, it is a hard requirement to increase the number of device iterations to ensure we fully saturate the devices via micro-batching. see :ref:`ipu-model-parallelism` for more information. - -.. code-block:: python - - import lightning.pytorch as pl - from lightning_graphcore import IPUStrategy - - model = MyLightningModule() - trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=32)) - trainer.fit(model) - -Note that by default we return the last device iteration loss. You can override this by passing in your own ``poptorch.Options`` and setting the AnchorMode as described in the `PopTorch documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.Options.anchorMode>`__. - -.. 
code-block:: python - - import poptorch - import lightning.pytorch as pl - from lightning_graphcore import IPUStrategy - - model = MyLightningModule() - inference_opts = poptorch.Options() - inference_opts.deviceIterations(32) - - training_opts = poptorch.Options() - training_opts.anchorMode(poptorch.AnchorMode.All) - training_opts.deviceIterations(32) - - trainer = Trainer( - accelerator="ipu", devices=8, strategy=IPUStrategy(inference_opts=inference_opts, training_opts=training_opts) - ) - trainer.fit(model) - -You can also override all options by passing the ``poptorch.Options`` to the plugin. See `PopTorch options documentation <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/batching.html>`__ for more information. - ----- - -.. _ipu-model-parallelism: - -Model parallelism ------------------ - -Due to the IPU architecture, larger models should be parallelized across IPUs by design. Currently PopTorch provides the capabilities via annotations as described in `parallel execution strategies <https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/overview.html#execution-strategies>`__. - -Below is an example using the block annotation in a LightningModule. - -.. note:: - - Currently, when using model parallelism we do not infer the number of IPUs required for you. This is done via the annotations themselves. If you specify 4 different IDs when defining Blocks, this means your model will be split onto 4 different IPUs. - - This is also mutually exclusive with the Trainer flag. In other words, if your model is split onto 2 IPUs and you set ``Trainer(accelerator="ipu", devices=4)`` this will require 8 IPUs in total: data parallelism will be used to replicate the two-IPU model 4 times. - - When pipelining the model you must also increase the `device_iterations` to ensure full data saturation of the devices data, i.e whilst one device in the model pipeline processes a batch of data, the other device can start on the next batch. 
For example if the model is split onto 4 IPUs, we require `device_iterations` to be at-least 4. - - -.. code-block:: python - - import lightning.pytorch as pl - import poptorch - - - class MyLightningModule(pl.LightningModule): - def __init__(self): - super().__init__() - # This will place layer1, layer2+layer3, layer4, softmax on different IPUs at runtime. - # BeginBlock will start a new id for all layers within this block - self.layer1 = poptorch.BeginBlock(torch.nn.Linear(5, 10), ipu_id=0) - - # This layer starts a new block, - # adding subsequent layers to this current block at runtime - # till the next block has been declared - self.layer2 = poptorch.BeginBlock(torch.nn.Linear(10, 5), ipu_id=1) - self.layer3 = torch.nn.Linear(5, 5) - - # Create new blocks - self.layer4 = poptorch.BeginBlock(torch.nn.Linear(5, 5), ipu_id=2) - self.softmax = poptorch.BeginBlock(torch.nn.Softmax(dim=1), ipu_id=3) - - ... - - - model = MyLightningModule() - trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=20)) - trainer.fit(model) - - -You can also use the block context manager within the forward function, or any of the step functions. - -.. code-block:: python - - import lightning.pytorch as pl - import poptorch - - - class MyLightningModule(pl.LightningModule): - def __init__(self): - super().__init__() - self.layer1 = torch.nn.Linear(5, 10) - self.layer2 = torch.nn.Linear(10, 5) - self.layer3 = torch.nn.Linear(5, 5) - self.layer4 = torch.nn.Linear(5, 5) - - self.act = torch.nn.ReLU() - self.softmax = torch.nn.Softmax(dim=1) - - def forward(self, x): - with poptorch.Block(ipu_id=0): - x = self.act(self.layer1(x)) - - with poptorch.Block(ipu_id=1): - x = self.act(self.layer2(x)) - - with poptorch.Block(ipu_id=2): - x = self.act(self.layer3(x)) - x = self.act(self.layer4(x)) - - with poptorch.Block(ipu_id=3): - x = self.softmax(x) - return x - - ... 
- - - model = MyLightningModule() - trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(device_iterations=20)) - trainer.fit(model) diff --git a/docs/source-pytorch/accelerators/ipu_basic.rst b/docs/source-pytorch/accelerators/ipu_basic.rst deleted file mode 100644 index 8381b2648c..0000000000 --- a/docs/source-pytorch/accelerators/ipu_basic.rst +++ /dev/null @@ -1,72 +0,0 @@ -:orphan: - -.. _ipu_basic: - -Accelerator: IPU training -========================= -**Audience:** Users looking to save money and run large models faster using single or multiple IPU devices. - -.. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature. - ----- - -What is an IPU? ---------------- - -The Graphcore `Intelligence Processing Unit (IPU) <https://www.graphcore.ai/products/ipu>`__, built for Artificial Intelligence and Machine Learning, consists of many individual cores, called *tiles*, allowing highly parallel computation. Due to the high bandwidth between tiles, IPUs facilitate machine learning loads where parallelization is essential. Because computation is heavily parallelized, - -IPUs operate in a different way to conventional accelerators such as CPU/GPUs. IPUs do not require large batch sizes for maximum parallelization, can provide optimizations across the compiled graph and rely on model parallelism to fully utilize tiles for larger models. - -IPUs are used to build IPU-PODs, rack-based systems of IPU-Machines for larger workloads. See the `IPU Architecture <https://www.graphcore.ai/products/ipu>`__ for more information. - -See the `Graphcore Glossary <https://docs.graphcore.ai/projects/graphcore-glossary/>`__ for the definitions of other IPU-specific terminology. - ----- - -Run on IPU ----------- - -To enable PyTorch Lightning to utilize the IPU accelerator, simply provide ``accelerator="ipu"`` parameter to the Trainer class. - -To use multiple IPUs set the devices to a number that is a power of 2 (i.e: 2, 4, 8, 16, ...) - -.. 
code-block:: python - - # run on as many IPUs as available by default - trainer = Trainer(accelerator="auto", devices="auto", strategy="auto") - # equivalent to - trainer = Trainer() - - # run on one IPU - trainer = Trainer(accelerator="ipu", devices=1) - # run on multiple IPUs - trainer = Trainer(accelerator="ipu", devices=8) - # choose the number of devices automatically - trainer = Trainer(accelerator="ipu", devices="auto") - ----- - -How to access IPUs ------------------- - -To use IPUs you must have access to a system with IPU devices. To get access see `get started <https://www.graphcore.ai/getstarted>`__. - -You must ensure that the IPU system has enabled the PopART and Poplar packages from the SDK. Instructions are in the Get Started guide for your IPU system, on the Graphcore `documents portal <https://docs.graphcore.ai/page/getting-started.html>`__. - ----- - -.. _known-limitations: - -Known limitations ------------------ - -Currently there are some known limitations that are being addressed in the near future to make the experience seamless when moving from different devices. - -Please see the `MNIST example <https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/ipu/mnist_sample.py>`__ which displays most of the limitations and how to overcome them till they are resolved. - -* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. -* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code. -* Clipping gradients is not supported. -* It is not possible to use :class:`torch.utils.data.BatchSampler` in your dataloaders if you are using multiple IPUs. 
-* IPUs handle the data transfer to the device on the host, hence the hooks :meth:`~lightning.pytorch.core.hooks.ModelHooks.transfer_batch_to_device` and - :meth:`~lightning.pytorch.core.hooks.ModelHooks.on_after_batch_transfer` do not apply here and if you have overridden any of them, an exception will be raised. diff --git a/docs/source-pytorch/accelerators/ipu_intermediate.rst b/docs/source-pytorch/accelerators/ipu_intermediate.rst deleted file mode 100644 index 251004fd17..0000000000 --- a/docs/source-pytorch/accelerators/ipu_intermediate.rst +++ /dev/null @@ -1,65 +0,0 @@ -:orphan: - -.. _ipu_intermediate: - -Accelerator: IPU training -========================= -**Audience:** IPU users looking to increase performance via mixed precision and analysis tools. - -.. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature. - ----- - -Mixed precision & 16 bit precision ----------------------------------- - -Lightning also supports training in mixed precision with IPUs. -By default, IPU training will use 32-bit precision. To enable mixed precision, -set the precision flag. - -.. note:: - Currently there is no dynamic scaling of the loss with mixed precision training. - -.. code-block:: python - - import lightning.pytorch as pl - - model = MyLightningModule() - trainer = pl.Trainer(accelerator="ipu", devices=8, precision=16) - trainer.fit(model) - -You can also use pure 16-bit training, where the weights are also in 16-bit precision. - -.. code-block:: python - - import lightning.pytorch as pl - from lightning_graphcore import IPUStrategy - - model = MyLightningModule() - model = model.half() - trainer = pl.Trainer(accelerator="ipu", devices=8, precision=16) - trainer.fit(model) - ----- - -PopVision Graph Analyser ------------------------- - -.. 
figure:: ../_static/images/accelerator/ipus/profiler.png - :alt: PopVision Graph Analyser - :width: 500 - -Lightning supports integration with the `PopVision Graph Analyser Tool <https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/>`__. This helps to look at utilization of IPU devices and provides helpful metrics during the lifecycle of your trainer. Once you have gained access, The PopVision Graph Analyser Tool can be downloaded via the `GraphCore download website <https://downloads.graphcore.ai/>`__. - -Lightning supports dumping all reports to a directory to open using the tool. - -.. code-block:: python - - import lightning.pytorch as pl - from lightning_graphcore import IPUStrategy - - model = MyLightningModule() - trainer = pl.Trainer(accelerator="ipu", devices=8, strategy=IPUStrategy(autoreport_dir="report_dir/")) - trainer.fit(model) - -This will dump all reports to ``report_dir/`` which can then be opened using the Graph Analyser Tool, see `Opening Reports <https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/opening-reports.html>`__. diff --git a/docs/source-pytorch/common/index.rst b/docs/source-pytorch/common/index.rst index 48f1946ac6..573cd48300 100644 --- a/docs/source-pytorch/common/index.rst +++ b/docs/source-pytorch/common/index.rst @@ -17,7 +17,7 @@ ../advanced/model_parallel Train on single or multiple GPUs <../accelerators/gpu> Train on single or multiple HPUs <../integrations/hpu/index> - Train on single or multiple IPUs <../accelerators/ipu> + Train on single or multiple IPUs <../integrations/ipu/index> Train on single or multiple TPUs <../accelerators/tpu> Train on MPS <../accelerators/mps> Use a pretrained model <../advanced/pretrained> @@ -171,7 +171,7 @@ How-to Guides .. 
# pull the IPU integration pages from the external Graphcore repository
assist_local.AssistantCLI.pull_docs_files(
    gh_user_repo="Lightning-AI/lightning-Graphcore",
    target_dir="docs/source-pytorch/integrations/ipu",
    checkout="tags/v0.1.0",
    as_orphan=True,  # todo: this can be dropped after new IPU release
)
# the IPU pages also need one image, which is downloaded next to this config file
URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Graphcore/v0.1.0/docs/source"
for img in ["_static/images/ipu/profiler.png"]:
    img_path = os.path.join(_PATH_HERE, img)
    # make sure the target folder exists before downloading into it
    os.makedirs(os.path.dirname(img_path), exist_ok=True)
    urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_path)
b/docs/source-pytorch/extensions/accelerator.rst index 45f4b72500..0589d9850c 100644 --- a/docs/source-pytorch/extensions/accelerator.rst +++ b/docs/source-pytorch/extensions/accelerator.rst @@ -10,7 +10,7 @@ Currently there are accelerators for: - CPU - :doc:`GPU <../accelerators/gpu>` - :doc:`TPU <../accelerators/tpu>` -- :doc:`IPU <../accelerators/ipu>` +- :doc:`IPU <../integrations/ipu/index>` - :doc:`HPU <../integrations/hpu/index>` - :doc:`MPS <../accelerators/mps>` diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index e8cde20528..2dd69859dd 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -89,7 +89,7 @@ The below table lists all relevant strategies available in Lightning with their - Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>` * - ipu_strategy - ``IPUStrategy`` - - Plugin for training on IPU devices. :doc:`Learn more. <../accelerators/ipu>` + - Plugin for training on IPU devices. :doc:`Learn more. <../integrations/ipu/index>` * - xla - :class:`~lightning.pytorch.strategies.XLAStrategy` - Strategy for training on multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` method. :doc:`Learn more. 
<../accelerators/tpu>` diff --git a/docs/source-pytorch/glossary/index.rst b/docs/source-pytorch/glossary/index.rst index e98bb050e9..319b97a69c 100644 --- a/docs/source-pytorch/glossary/index.rst +++ b/docs/source-pytorch/glossary/index.rst @@ -19,7 +19,7 @@ Half precision <../common/precision> HPU <../integrations/hpu/index> Inference <../deploy/production_intermediate> - IPU <../accelerators/ipu> + IPU <../integrations/ipu/index> Lightning CLI <../cli/lightning_cli> LightningDataModule <../data/datamodule> LightningModule <../common/lightning_module> @@ -170,7 +170,7 @@ Glossary :header: IPU :description: Graphcore Intelligence Processing Unit for faster training :col_css: col-md-12 - :button_link: ../accelerators/ipu.html + :button_link: ../integrations/ipu/index.html :height: 100 .. displayitem:: diff --git a/docs/source-pytorch/accelerators/ipu.rst b/docs/source-pytorch/integrations/ipu/index.rst similarity index 100% rename from docs/source-pytorch/accelerators/ipu.rst rename to docs/source-pytorch/integrations/ipu/index.rst diff --git a/docs/source-pytorch/levels/advanced_level_19.rst b/docs/source-pytorch/levels/advanced_level_19.rst index 28edaa9082..eba1a9bc14 100644 --- a/docs/source-pytorch/levels/advanced_level_19.rst +++ b/docs/source-pytorch/levels/advanced_level_19.rst @@ -27,7 +27,7 @@ Explore Intelligence Processing Unit (IPU) for model scaling. :header: Train models on IPUs :description: Learn the basics of single and multi-IPU training. :col_css: col-md-4 - :button_link: ../accelerators/ipu_basic.html + :button_link: ../integrations/ipu/basic.html :height: 150 :tag: basic @@ -35,7 +35,7 @@ Explore Intelligence Processing Unit (IPU) for model scaling. :header: Optimize models training on IPUs :description: Tune model performance with mixed precision and the performance analyser. :col_css: col-md-4 - :button_link: ../accelerators/ipu_intermediate.html + :button_link: ../integrations/ipu/intermediate.html :height: 150 :tag: intermediate