diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8510b70d4e..0c2c5a69c4 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -27,7 +27,6 @@ Brief description of all our automation tools used for boosting development perf - GPU: 2 x NVIDIA RTX 3090 - TPU: [Google TPU v4-8](https://cloud.google.com/tpu/docs) - - IPU: [Colossus MK1 IPU](https://www.graphcore.ai/products/ipu) - To check which versions of Python or PyTorch are used for testing in our CI, see the corresponding workflow files or checkgroup config file at [`.github/checkgroup.yml`](../checkgroup.yml). diff --git a/.gitignore b/.gitignore index 598ad521af..fe3a13e9e0 100644 --- a/.gitignore +++ b/.gitignore @@ -22,9 +22,7 @@ docs/source-pytorch/notebooks docs/source-pytorch/_static/images/course_UvA-DL docs/source-pytorch/_static/images/lightning_examples docs/source-pytorch/_static/fetched-s3-assets -docs/source-pytorch/_static/images/ipu/ docs/source-pytorch/integrations/hpu -docs/source-pytorch/integrations/ipu docs/source-fabric/*/generated diff --git a/docs/source-app/quickstart.rst b/docs/source-app/quickstart.rst index 6df9b26b7b..99872c5f8a 100644 --- a/docs/source-app/quickstart.rst +++ b/docs/source-app/quickstart.rst @@ -53,7 +53,6 @@ And that's it! GPU available: True (mps), used: False TPU available: False, using: 0 TPU cores - IPU available: False, using: 0 IPUs | Name | Type | Params | In sizes | Out sizes ------------------------------------------------------------------ diff --git a/docs/source-pytorch/advanced/speed.rst b/docs/source-pytorch/advanced/speed.rst index 3abbc73dc8..1f79393d2f 100644 --- a/docs/source-pytorch/advanced/speed.rst +++ b/docs/source-pytorch/advanced/speed.rst @@ -20,7 +20,7 @@ Training on Accelerators **Use when:** Whenever possible! -With Lightning, running on GPUs, TPUs, IPUs on multiple nodes is a simple switch of a flag. +With Lightning, running on GPUs, TPUs, HPUs on multiple nodes is a simple switch of a flag. GPU Training ============ diff --git a/docs/source-pytorch/common/index.rst b/docs/source-pytorch/common/index.rst index 573cd48300..84d4e331cf 100644 --- a/docs/source-pytorch/common/index.rst +++ b/docs/source-pytorch/common/index.rst @@ -17,7 +17,6 @@ ../advanced/model_parallel Train on single or multiple GPUs <../accelerators/gpu> Train on single or multiple HPUs <../integrations/hpu/index> - Train on single or multiple IPUs <../integrations/ipu/index> Train on single or multiple TPUs <../accelerators/tpu> Train on MPS <../accelerators/mps> Use a pretrained model <../advanced/pretrained> @@ -168,13 +167,6 @@ How-to Guides :col_css: col-md-4 :height: 180 -.. displayitem:: - :header: Train on single or multiple IPUs - :description: Train models faster with IPU accelerators - :button_link: ../integrations/ipu/index.html - :col_css: col-md-4 - :height: 180 - .. displayitem:: :header: Train on single or multiple TPUs :description: TTrain models faster with TPU accelerators diff --git a/docs/source-pytorch/common/precision_basic.rst b/docs/source-pytorch/common/precision_basic.rst index eb7fe0f9e9..1134524b51 100644 --- a/docs/source-pytorch/common/precision_basic.rst +++ b/docs/source-pytorch/common/precision_basic.rst @@ -103,31 +103,26 @@ Precision support by accelerator ******************************** .. 
list-table:: Precision with Accelerators - :widths: 20 20 20 20 20 + :widths: 20 20 20 20 :header-rows: 1 * - Precision - CPU - GPU - TPU - - IPU * - 16 Mixed - No - Yes - No - - Yes * - BFloat16 Mixed - Yes - Yes - Yes - - No * - 32 True - Yes - Yes - Yes - - Yes * - 64 True - Yes - Yes - No - - No diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 37c279e311..0ad4592754 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -175,7 +175,7 @@ Trainer flags accelerator ^^^^^^^^^^^ -Supports passing different accelerator types (``"cpu", "gpu", "tpu", "ipu", "auto"``) +Supports passing different accelerator types (``"cpu", "gpu", "tpu", "hpu", "auto"``) as well as custom accelerator instances. .. code-block:: python @@ -393,9 +393,6 @@ Number of devices to train on (``int``), which devices to train on (``list`` or # Training with TPU Accelerator using 8 tpu cores trainer = Trainer(devices="auto", accelerator="tpu") - # Training with IPU Accelerator using 4 ipus - trainer = Trainer(devices="auto", accelerator="ipu") - .. note:: If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the ``auto_device_count`` diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 4af75de9dd..7e6ed91d0c 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -133,13 +133,6 @@ Customize and extend Lightning for things like custom hardware or distributed st :button_link: integrations/hpu/index.html :height: 100 -.. displayitem:: - :header: Train on single or multiple IPUs - :description: Train models faster with IPUs. - :col_css: col-md-12 - :button_link: integrations/ipu/index.html - :height: 100 - .. displayitem:: :header: Train on single or multiple TPUs :description: Train models faster with TPUs. 
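For context on the flag combinations that remain after this removal: the ``accelerator``/``devices``/``precision`` options touched above keep working unchanged for the remaining backends. A minimal, illustrative sketch of the documented Lightning 2.x usage (not part of the changeset itself):

```python
from lightning.pytorch import Trainer

# explicit accelerator and device count
trainer = Trainer(accelerator="gpu", devices=2)

# let Lightning pick the accelerator and device count, as described for `devices="auto"`
trainer = Trainer(accelerator="auto", devices="auto")

# TPU training, matching the example kept in trainer.rst above
trainer = Trainer(accelerator="tpu", devices=8)

# bf16 mixed precision, one of the combinations listed in the precision table above
trainer = Trainer(accelerator="gpu", devices=1, precision="bf16-mixed")
```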
diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 829f1f2c9e..a7fa7ce559 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -94,18 +94,6 @@ assist_local.AssistantCLI.pull_docs_files( target_dir="docs/source-pytorch/integrations/hpu", checkout="refs/tags/1.3.0", ) -assist_local.AssistantCLI.pull_docs_files( - gh_user_repo="Lightning-AI/lightning-Graphcore", - target_dir="docs/source-pytorch/integrations/ipu", - checkout="refs/tags/v0.1.0", - as_orphan=True, # todo: this can be dropped after new IPU release -) -# the IPU also need one image -URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Graphcore/v0.1.0/docs/source" -for img in ["_static/images/ipu/profiler.png"]: - img_ = os.path.join(_PATH_HERE, "integrations", "ipu", img) - os.makedirs(os.path.dirname(img_), exist_ok=True) - urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_) # Copy strategies docs as single pages assist_local.AssistantCLI.pull_docs_files( @@ -340,7 +328,6 @@ intersphinx_mapping = { "numpy": ("https://numpy.org/doc/stable/", None), "PIL": ("https://pillow.readthedocs.io/en/stable/", None), "torchmetrics": ("https://torchmetrics.readthedocs.io/en/stable/", None), - "graphcore": ("https://docs.graphcore.ai/en/latest/", None), "lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None), "tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None), # needed for referencing App from lightning scope diff --git a/docs/source-pytorch/expertise_levels.rst b/docs/source-pytorch/expertise_levels.rst index 5988890332..7c34123f5b 100644 --- a/docs/source-pytorch/expertise_levels.rst +++ b/docs/source-pytorch/expertise_levels.rst @@ -190,34 +190,26 @@ Configure all aspects of Lightning for advanced usecases. :tag: advanced .. displayitem:: - :header: Level 18: Explore IPUs - :description: Explore Intelligence Processing Unit (IPU) for model scaling. + :header: Level 18: Explore HPUs + :description: Explore Havana Gaudi Processing Unit (HPU) for model scaling. :col_css: col-md-6 :button_link: levels/advanced_level_19.html :height: 150 :tag: advanced .. displayitem:: - :header: Level 19: Explore HPUs - :description: Explore Havana Gaudi Processing Unit (HPU) for model scaling. + :header: Level 19: Master TPUs + :description: Master TPUs and run on cloud TPUs. :col_css: col-md-6 :button_link: levels/advanced_level_20.html :height: 150 :tag: advanced .. displayitem:: - :header: Level 20: Master TPUs - :description: Master TPUs and run on cloud TPUs. - :col_css: col-md-6 - :button_link: levels/advanced_level_21.html - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Level 21: Train models with billions of parameters + :header: Level 20: Train models with billions of parameters :description: Scale GPU training to models with billions of parameters :col_css: col-md-6 - :button_link: levels/advanced_level_22.html + :button_link: levels/advanced_level_21.html :height: 150 :tag: advanced @@ -240,7 +232,7 @@ Customize and extend Lightning for things like custom hardware or distributed st .. Add callout items below this line .. displayitem:: - :header: Level 22: Extend the Lightning CLI + :header: Level 21: Extend the Lightning CLI :description: Extend the functionality of the Lightning CLI. :col_css: col-md-6 :button_link: levels/expert_level_23.html @@ -248,7 +240,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :tag: expert .. 
displayitem:: - :header: Level 23: Integrate a custom cluster + :header: Level 22: Integrate a custom cluster :description: Integrate a custom cluster into Lightning. :col_css: col-md-6 :button_link: levels/expert_level_24.html @@ -256,7 +248,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :tag: expert .. displayitem:: - :header: Level 24: Make your own profiler + :header: Level 23: Make your own profiler :description: Make your own profiler. :col_css: col-md-6 :button_link: tuning/profiler_expert.html @@ -264,10 +256,10 @@ Customize and extend Lightning for things like custom hardware or distributed st :tag: expert .. displayitem:: - :header: Level 25: Add a new accelerator or Strategy + :header: Level 24: Add a new accelerator or Strategy :description: Integrate a new accelerator or distributed strategy. :col_css: col-md-6 - :button_link: levels/expert_level_27.html + :button_link: levels/expert_level_25.html :height: 150 :tag: expert diff --git a/docs/source-pytorch/extensions/accelerator.rst b/docs/source-pytorch/extensions/accelerator.rst index 0589d9850c..93dc467b02 100644 --- a/docs/source-pytorch/extensions/accelerator.rst +++ b/docs/source-pytorch/extensions/accelerator.rst @@ -4,13 +4,12 @@ Accelerator ########### -The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, IPUs, MPS, ...). +The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, HPUs, MPS, ...). Currently there are accelerators for: - CPU - :doc:`GPU <../accelerators/gpu>` - :doc:`TPU <../accelerators/tpu>` -- :doc:`IPU <../integrations/ipu/index>` - :doc:`HPU <../integrations/hpu/index>` - :doc:`MPS <../accelerators/mps>` diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index 858a6744ee..7155cff815 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -57,9 +57,6 @@ Here are some examples: # Training with the DDP Spawn strategy on 8 TPU cores trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", devices=8) - # Training with the default IPU strategy on 8 IPUs - trainer = Trainer(accelerator="ipu", devices=8) - The below table lists all relevant strategies available in Lightning with their corresponding short-hand name: .. list-table:: Strategy Classes and Nicknames @@ -87,9 +84,6 @@ The below table lists all relevant strategies available in Lightning with their * - hpu_single - ``SingleHPUStrategy`` - Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>` - * - ipu_strategy - - ``IPUStrategy`` - - Plugin for training on IPU devices. :doc:`Learn more. <../integrations/ipu/index>` * - xla - :class:`~lightning.pytorch.strategies.XLAStrategy` - Strategy for training on multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` method. :doc:`Learn more. 
<../accelerators/tpu>` diff --git a/docs/source-pytorch/glossary/index.rst b/docs/source-pytorch/glossary/index.rst index 805510a1aa..5ca677c48e 100644 --- a/docs/source-pytorch/glossary/index.rst +++ b/docs/source-pytorch/glossary/index.rst @@ -20,7 +20,6 @@ Half precision <../common/precision> HPU <../integrations/hpu/index> Inference <../deploy/production_intermediate> - IPU <../integrations/ipu/index> Lightning CLI <../cli/lightning_cli> LightningDataModule <../data/datamodule> LightningModule <../common/lightning_module> @@ -177,13 +176,6 @@ Glossary :button_link: ../deploy/production_intermediate.html :height: 100 -.. displayitem:: - :header: IPU - :description: Graphcore Intelligence Processing Unit for faster training - :col_css: col-md-12 - :button_link: ../integrations/ipu/index.html - :height: 100 - .. displayitem:: :header: Lightning CLI :description: A Command-line Interface (CLI) to interact with Lightning code via a terminal diff --git a/docs/source-pytorch/integrations/ipu/index.rst b/docs/source-pytorch/integrations/ipu/index.rst deleted file mode 100644 index 138814fefc..0000000000 --- a/docs/source-pytorch/integrations/ipu/index.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _ipu: - -Accelerator: IPU training -========================= - -.. raw:: html - -
-
-
-.. Add callout items below this line
-
-.. displayitem::
-   :header: Prepare your code (Optional)
-   :description: Prepare your code to run on any hardware
-   :col_css: col-md-6
-   :button_link: accelerator_prepare.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Basic
-   :description: Learn the basics of single and multi-IPU training.
-   :col_css: col-md-6
-   :button_link: ipu_basic.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Intermediate
-   :description: Tune model performance with mix-precision settings and the performance analyser.
-   :col_css: col-md-6
-   :button_link: ipu_intermediate.html
-   :height: 150
-   :tag: intermediate
-
-.. displayitem::
-   :header: Advanced
-   :description: Learn advanced techniques to customize IPU training for massive models.
-   :col_css: col-md-6
-   :button_link: ipu_advanced.html
-   :height: 150
-   :tag: advanced
-
-.. raw:: html
-
-
-
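With the in-tree IPU docs deleted above, IPU support now lives entirely in the external ``lightning-graphcore`` package that the rest of this diff de-references. A hedged sketch of what continued IPU use might look like after this change; the explicit class instances stand in for the removed ``accelerator="ipu"`` string alias, and whether this wiring still works end-to-end depends on the external package, not on this PR:

```python
# hypothetical sketch; requires `pip install lightning-graphcore` and working Poplar drivers
from lightning.pytorch import Trainer
from lightning_graphcore import IPUAccelerator, IPUStrategy  # external integration package

# pass explicit instances, since Lightning no longer registers the "ipu" alias itself
trainer = Trainer(accelerator=IPUAccelerator(), strategy=IPUStrategy(), devices=4)
```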
diff --git a/docs/source-pytorch/levels/advanced.rst b/docs/source-pytorch/levels/advanced.rst index 1ea809d6fa..6e4b1e99f9 100644 --- a/docs/source-pytorch/levels/advanced.rst +++ b/docs/source-pytorch/levels/advanced.rst @@ -46,34 +46,26 @@ Configure all aspects of Lightning for advanced usecases. :tag: advanced .. displayitem:: - :header: Level 18: Explore IPUs - :description: Explore Intelligence Processing Unit (IPU) for model scaling. + :header: Level 18: Explore HPUs + :description: Explore Habana Gaudi Processing Unit (HPU) for model scaling. :col_css: col-md-6 :button_link: advanced_level_19.html :height: 150 :tag: advanced .. displayitem:: - :header: Level 19: Explore HPUs - :description: Explore Habana Gaudi Processing Unit (HPU) for model scaling. + :header: Level 19: Master TPUs + :description: Master TPUs and run on cloud TPUs. :col_css: col-md-6 :button_link: advanced_level_20.html :height: 150 :tag: advanced .. displayitem:: - :header: Level 20: Master TPUs - :description: Master TPUs and run on cloud TPUs. - :col_css: col-md-6 - :button_link: advanced_level_21.html - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Level 21: Train models with billions of parameters + :header: Level 20: Train models with billions of parameters :description: Scale GPU training to models with billions of parameters :col_css: col-md-6 - :button_link: advanced_level_22.html + :button_link: advanced_level_21.html :height: 150 :tag: advanced diff --git a/docs/source-pytorch/levels/advanced_level_19.rst b/docs/source-pytorch/levels/advanced_level_19.rst index eba1a9bc14..6ce849c12b 100644 --- a/docs/source-pytorch/levels/advanced_level_19.rst +++ b/docs/source-pytorch/levels/advanced_level_19.rst @@ -1,10 +1,10 @@ :orphan: ###################### -Level 18: Explore IPUs +Level 18: Explore HPUs ###################### -Explore Intelligence Processing Unit (IPU) for model scaling. +Explore Intel Habana Processing Unit (HPU) for model scaling. ---- @@ -16,26 +16,18 @@ Explore Intelligence Processing Unit (IPU) for model scaling. .. Add callout items below this line .. displayitem:: - :header: Prepare your code (Optional) - :description: Prepare your code to run on any hardware. - :col_css: col-md-4 - :button_link: ../accelerators/accelerator_prepare.html + :header: Train models on HPUs + :description: Learn the basics of single and multi-HPU core training. + :col_css: col-md-6 + :button_link: ../integrations/hpu/basic.html :height: 150 :tag: basic .. displayitem:: - :header: Train models on IPUs - :description: Learn the basics of single and multi-IPU training. - :col_css: col-md-4 - :button_link: ../integrations/ipu/basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Optimize models training on IPUs - :description: Tune model performance with mixed precision and the performance analyser. - :col_css: col-md-4 - :button_link: ../integrations/ipu/intermediate.html + :header: Optimize models training on HPUs + :description: Enable state-of-the-art scaling with advanced mixed-precision settings. 
+ :col_css: col-md-6 + :button_link: ../integrations/hpu/intermediate.html :height: 150 :tag: intermediate diff --git a/docs/source-pytorch/levels/advanced_level_20.rst b/docs/source-pytorch/levels/advanced_level_20.rst index 8aaa159cc6..ebde7d6ea5 100644 --- a/docs/source-pytorch/levels/advanced_level_20.rst +++ b/docs/source-pytorch/levels/advanced_level_20.rst @@ -1,10 +1,10 @@ :orphan: -###################### -Level 19: Explore HPUs -###################### +##################### +Level 19: Master TPUs +##################### -Explore Intel Habana Processing Unit (HPU) for model scaling. +Master cloud TPU training with profiling and scaling techniques. ---- @@ -16,20 +16,28 @@ Explore Intel Habana Processing Unit (HPU) for model scaling. .. Add callout items below this line .. displayitem:: - :header: Train models on HPUs - :description: Learn the basics of single and multi-HPU core training. - :col_css: col-md-6 - :button_link: ../integrations/hpu/basic.html - :height: 150 - :tag: basic + :header: Run on cloud TPUs + :description: Scale massive models using cloud TPUs. + :col_css: col-md-4 + :button_link: ../accelerators/tpu_intermediate.html + :height: 180 + :tag: intermediate .. displayitem:: - :header: Optimize models training on HPUs - :description: Enable state-of-the-art scaling with advanced mixed-precision settings. - :col_css: col-md-6 - :button_link: ../integrations/hpu/intermediate.html - :height: 150 - :tag: intermediate + :header: Explore advanced TPU scaling techniques + :description: Dive into XLA and advanced techniques to optimize TPU-powered models. + :col_css: col-md-4 + :button_link: ../accelerators/tpu_advanced.html + :height: 180 + :tag: advanced + +.. displayitem:: + :header: Profile TPU code + :description: Learn to profile TPU code. + :col_css: col-md-4 + :button_link: ../tuning/profiler_advanced.html + :height: 180 + :tag: advanced .. raw:: html diff --git a/docs/source-pytorch/levels/advanced_level_21.rst b/docs/source-pytorch/levels/advanced_level_21.rst index 92358c04eb..6c07d1d037 100644 --- a/docs/source-pytorch/levels/advanced_level_21.rst +++ b/docs/source-pytorch/levels/advanced_level_21.rst @@ -1,10 +1,10 @@ :orphan: -##################### -Level 20: Master TPUs -##################### +################################################## +Level 20: Train models with billions of parameters +################################################## -Master cloud TPU training with profiling and scaling techniques. +Scale to billions of parameters with multiple distributed strategies. ---- @@ -16,27 +16,19 @@ Master cloud TPU training with profiling and scaling techniques. .. Add callout items below this line .. displayitem:: - :header: Run on cloud TPUs - :description: Scale massive models using cloud TPUs. - :col_css: col-md-4 - :button_link: ../accelerators/tpu_intermediate.html - :height: 180 + :header: Scale with distributed strategies + :description: Learn about different distributed strategies to reach bigger model parameter sizes. + :col_css: col-md-6 + :button_link: ../accelerators/gpu_intermediate.html + :height: 150 :tag: intermediate .. displayitem:: - :header: Explore advanced TPU scaling techniques - :description: Dive into XLA and advanced techniques to optimize TPU-powered models. - :col_css: col-md-4 - :button_link: ../accelerators/tpu_advanced.html - :height: 180 - :tag: advanced - -.. displayitem:: - :header: Profile TPU code - :description: Learn to profile TPU code. 
- :col_css: col-md-4 - :button_link: ../tuning/profiler_advanced.html - :height: 180 + :header: Train models with billions of parameters + :description: Scale to billions of params on GPUs with FSDP or Deepspeed. + :col_css: col-md-6 + :button_link: ../advanced/model_parallel.html + :height: 150 :tag: advanced .. raw:: html diff --git a/docs/source-pytorch/levels/advanced_level_22.rst b/docs/source-pytorch/levels/advanced_level_22.rst deleted file mode 100644 index 825f389e61..0000000000 --- a/docs/source-pytorch/levels/advanced_level_22.rst +++ /dev/null @@ -1,37 +0,0 @@ -:orphan: - -################################################## -Level 21: Train models with billions of parameters -################################################## - -Scale to billions of parameters with multiple distributed strategies. - ----- - -.. raw:: html - -
-
-
-.. Add callout items below this line
-
-.. displayitem::
-   :header: Scale with distributed strategies
-   :description: Learn about different distributed strategies to reach bigger model parameter sizes.
-   :col_css: col-md-6
-   :button_link: ../accelerators/gpu_intermediate.html
-   :height: 150
-   :tag: intermediate
-
-.. displayitem::
-   :header: Train models with billions of parameters
-   :description: Scale to billions of params on GPUs with FSDP or Deepspeed.
-   :col_css: col-md-6
-   :button_link: ../advanced/model_parallel.html
-   :height: 150
-   :tag: advanced
-
-.. raw:: html
-
-
-
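The renumbered "billions of parameters" level still points at the FSDP/DeepSpeed material; for orientation, the strategies it refers to are selected like this (illustrative sketch, not part of the changeset):

```python
from lightning.pytorch import Trainer

# Fully Sharded Data Parallel, covered by ../advanced/model_parallel.html
trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp")

# DeepSpeed ZeRO stage 2, the other option named in the level description
# (requires the `deepspeed` package to be installed)
trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_2", precision="16-mixed")
```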
diff --git a/docs/source-pytorch/levels/expert.rst b/docs/source-pytorch/levels/expert.rst index c73414201e..bb0fbf25a8 100644 --- a/docs/source-pytorch/levels/expert.rst +++ b/docs/source-pytorch/levels/expert.rst @@ -14,23 +14,23 @@ Customize and extend Lightning for things like custom hardware or distributed st .. Add callout items below this line .. displayitem:: - :header: Level 22: Extend the Lightning CLI + :header: Level 21: Extend the Lightning CLI :description: Extend the functionality of the Lightning CLI. :col_css: col-md-6 + :button_link: expert_level_22.html + :height: 150 + :tag: expert + +.. displayitem:: + :header: Level 22: Integrate a custom cluster + :description: Integrate a custom cluster into Lightning. + :col_css: col-md-6 :button_link: expert_level_23.html :height: 150 :tag: expert .. displayitem:: - :header: Level 23: Integrate a custom cluster - :description: Integrate a custom cluster into Lightning. - :col_css: col-md-6 - :button_link: expert_level_24.html - :height: 150 - :tag: expert - -.. displayitem:: - :header: Level 24: Make your own profiler + :header: Level 23: Make your own profiler :description: Make your own profiler. :col_css: col-md-6 :button_link: ../tuning/profiler_expert.html @@ -38,7 +38,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :tag: expert .. displayitem:: - :header: Level 25: Add a new accelerator or Strategy + :header: Level 24: Add a new accelerator or Strategy :description: Integrate a new accelerator or distributed strategy. :col_css: col-md-6 :button_link: expert_level_27.html diff --git a/docs/source-pytorch/levels/expert_level_22.rst b/docs/source-pytorch/levels/expert_level_22.rst new file mode 100644 index 0000000000..af2f020272 --- /dev/null +++ b/docs/source-pytorch/levels/expert_level_22.rst @@ -0,0 +1,37 @@ +:orphan: + +################################## +Level 21: Extend the Lightning CLI +################################## + +Extend the functionality of the Lightning CLI. + +---- + +.. raw:: html + +
+
+
+.. Add callout items below this line
+
+.. displayitem::
+   :header: Customize configs for complex projects
+   :description: Learn how to connect complex projects with each Registry.
+   :col_css: col-md-6
+   :button_link: ../cli/lightning_cli_advanced_3.html
+   :height: 150
+   :tag: expert
+
+.. displayitem::
+   :header: Extend the Lightning CLI
+   :description: Customize the Lightning CLI
+   :col_css: col-md-6
+   :button_link: ../cli/lightning_cli_expert.html
+   :height: 150
+   :tag: expert
+
+.. raw:: html
+
+
+
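Since the new level page above only links out to the CLI guides, here is a compact illustration of the kind of extension those guides cover (class and argument names are placeholders, not taken from the guides):

```python
from lightning.pytorch.cli import LightningCLI


class MyLightningCLI(LightningCLI):
    def add_arguments_to_parser(self, parser):
        # drive two related config values from a single CLI argument;
        # the exact argument paths depend on how the model/datamodule are registered
        parser.link_arguments("data.batch_size", "model.batch_size")


if __name__ == "__main__":
    # model and datamodule classes are resolved from the YAML config / CLI arguments
    MyLightningCLI()
```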
diff --git a/docs/source-pytorch/levels/expert_level_23.rst b/docs/source-pytorch/levels/expert_level_23.rst index 5d1ba67e96..eff7b781e5 100644 --- a/docs/source-pytorch/levels/expert_level_23.rst +++ b/docs/source-pytorch/levels/expert_level_23.rst @@ -1,8 +1,8 @@ :orphan: -################################## -Level 22: Extend the Lightning CLI -################################## +#################################### +Level 22: Integrate a custom cluster +#################################### Extend the functionality of the Lightning CLI. @@ -16,18 +16,10 @@ Extend the functionality of the Lightning CLI. .. Add callout items below this line .. displayitem:: - :header: Customize configs for complex projects - :description: Learn how to connect complex projects with each Registry. + :header: Integrate your own cluster + :description: Learn how to integrate your own cluster :col_css: col-md-6 - :button_link: ../cli/lightning_cli_advanced_3.html - :height: 150 - :tag: expert - -.. displayitem:: - :header: Extend the Lightning CLI - :description: Customize the Lightning CLI - :col_css: col-md-6 - :button_link: ../cli/lightning_cli_expert.html + :button_link: ../clouds/cluster_expert.html :height: 150 :tag: expert diff --git a/docs/source-pytorch/levels/expert_level_24.rst b/docs/source-pytorch/levels/expert_level_24.rst deleted file mode 100644 index 54c544ee9d..0000000000 --- a/docs/source-pytorch/levels/expert_level_24.rst +++ /dev/null @@ -1,29 +0,0 @@ -:orphan: - -#################################### -Level 23: Integrate a custom cluster -#################################### - -Extend the functionality of the Lightning CLI. - ----- - -.. raw:: html - -
-
-
-.. Add callout items below this line
-
-.. displayitem::
-   :header: Integrate your own cluster
-   :description: Learn how to integrate your own cluster
-   :col_css: col-md-6
-   :button_link: ../clouds/cluster_expert.html
-   :height: 150
-   :tag: expert
-
-.. raw:: html
-
-
-
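For orientation on the custom-cluster level that just moved up a slot: the extension point it documents is the ``ClusterEnvironment`` plugin. The sketch below assumes the abstract interface exposed under ``lightning.pytorch.plugins.environments`` and uses made-up ``MY_*`` environment variables for a hypothetical scheduler:

```python
import os

from lightning.pytorch import Trainer
from lightning.pytorch.plugins.environments import ClusterEnvironment


class MyClusterEnvironment(ClusterEnvironment):
    """Reads rank/world-size info from hypothetical MY_* variables set by the scheduler."""

    @property
    def creates_processes_externally(self) -> bool:
        return True  # the cluster launcher starts one process per device

    @property
    def main_address(self) -> str:
        return os.environ["MY_MAIN_ADDRESS"]

    @property
    def main_port(self) -> int:
        return int(os.environ["MY_MAIN_PORT"])

    @staticmethod
    def detect() -> bool:
        return "MY_MAIN_ADDRESS" in os.environ

    def world_size(self) -> int:
        return int(os.environ["MY_WORLD_SIZE"])

    def set_world_size(self, size: int) -> None:
        pass  # sizes come from the scheduler, nothing to do

    def global_rank(self) -> int:
        return int(os.environ["MY_GLOBAL_RANK"])

    def set_global_rank(self, rank: int) -> None:
        pass

    def local_rank(self) -> int:
        return int(os.environ["MY_LOCAL_RANK"])

    def node_rank(self) -> int:
        return int(os.environ["MY_NODE_RANK"])


# hand the environment to the Trainer as a plugin
trainer = Trainer(plugins=[MyClusterEnvironment()], accelerator="gpu", devices=8, num_nodes=4)
```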
diff --git a/docs/source-pytorch/levels/expert_level_27.rst b/docs/source-pytorch/levels/expert_level_25.rst similarity index 96% rename from docs/source-pytorch/levels/expert_level_27.rst rename to docs/source-pytorch/levels/expert_level_25.rst index 9b06b10195..00244e53c0 100644 --- a/docs/source-pytorch/levels/expert_level_27.rst +++ b/docs/source-pytorch/levels/expert_level_25.rst @@ -1,7 +1,7 @@ :orphan: ########################################### -Level 25: Add a new accelerator or Strategy +Level 24: Add a new accelerator or Strategy ########################################### Integrate a new accelerator or distributed strategy. diff --git a/docs/source-pytorch/model/manual_optimization.rst b/docs/source-pytorch/model/manual_optimization.rst index d897b59c98..150f04793e 100644 --- a/docs/source-pytorch/model/manual_optimization.rst +++ b/docs/source-pytorch/model/manual_optimization.rst @@ -345,4 +345,4 @@ Here is an example using a closure function. opt.step(closure=closure) .. warning:: - The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP, IPUs, or DeepSpeed. + The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP or DeepSpeed. diff --git a/docs/source-pytorch/tuning/profiler_basic.rst b/docs/source-pytorch/tuning/profiler_basic.rst index d248cc6490..cf1ae93e97 100644 --- a/docs/source-pytorch/tuning/profiler_basic.rst +++ b/docs/source-pytorch/tuning/profiler_basic.rst @@ -110,7 +110,7 @@ If the profiler report becomes too long, you can stream the report to a file: ************************* Measure accelerator usage ************************* -Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/IPU/HPU). +Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/HPU). This can be measured with the :class:`~lightning.pytorch.callbacks.device_stats_monitor.DeviceStatsMonitor`: .. testcode:: diff --git a/examples/pytorch/ipu/mnist_sample.py b/examples/pytorch/ipu/mnist_sample.py deleted file mode 100644 index b50d2516eb..0000000000 --- a/examples/pytorch/ipu/mnist_sample.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from lightning.pytorch import LightningModule, Trainer -from lightning.pytorch.demos.mnist_datamodule import MNISTDataModule -from torch.nn import functional as F - - -class LitClassifier(LightningModule): - def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001): - super().__init__() - self.save_hyperparameters() - - self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) - self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) - - self.val_outptus = [] - self.test_outputs = [] - - def forward(self, x): - x = x.view(x.size(0), -1) - x = torch.relu(self.l1(x)) - return torch.relu(self.l2(x)) - - def training_step(self, batch, batch_idx): - x, y = batch - y_hat = self(x) - return F.cross_entropy(y_hat, y) - - def validation_step(self, batch, batch_idx): - x, y = batch - probs = self(x) - acc = self.accuracy(probs, y) - self.val_outputs.append(acc) - return acc - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - acc = self.accuracy(logits, y) - self.test_outputs.append(acc) - return acc - - def accuracy(self, logits, y): - # currently IPU poptorch doesn't implicit convert bools to tensor - # hence we use an explicit calculation for accuracy here. Once fixed in poptorch - # we can use the accuracy metric. - return torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y) - - def on_validation_epoch_end(self) -> None: - # since the training step/validation step and test step are run on the IPU device - # we must log the average loss outside the step functions. - self.log("val_acc", torch.stack(self.val_outptus).mean(), prog_bar=True) - self.val_outptus.clear() - - def on_test_epoch_end(self) -> None: - self.log("test_acc", torch.stack(self.test_outputs).mean()) - self.test_outputs.clear() - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - - -if __name__ == "__main__": - dm = MNISTDataModule(batch_size=32) - model = LitClassifier() - trainer = Trainer(max_epochs=2, accelerator="ipu", devices=8) - - trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) diff --git a/requirements/_integrations/accelerators.txt b/requirements/_integrations/accelerators.txt index d7f372764e..90c72bedb2 100644 --- a/requirements/_integrations/accelerators.txt +++ b/requirements/_integrations/accelerators.txt @@ -1,3 +1,2 @@ # validation accelerator connectors lightning-habana >=1.2.0, <1.3.0 -lightning-graphcore >=0.1.0, <0.2.0 diff --git a/src/lightning/pytorch/_graveyard/__init__.py b/src/lightning/pytorch/_graveyard/__init__.py index 3403bbd52a..d5cfb18148 100644 --- a/src/lightning/pytorch/_graveyard/__init__.py +++ b/src/lightning/pytorch/_graveyard/__init__.py @@ -13,6 +13,5 @@ # limitations under the License. 
import lightning.pytorch._graveyard._torchmetrics import lightning.pytorch._graveyard.hpu -import lightning.pytorch._graveyard.ipu import lightning.pytorch._graveyard.precision import lightning.pytorch._graveyard.tpu # noqa: F401 diff --git a/src/lightning/pytorch/_graveyard/ipu.py b/src/lightning/pytorch/_graveyard/ipu.py deleted file mode 100644 index 3c462410d2..0000000000 --- a/src/lightning/pytorch/_graveyard/ipu.py +++ /dev/null @@ -1,52 +0,0 @@ -import sys -from typing import Any - -import lightning.pytorch as pl - - -def _patch_sys_modules() -> None: - self = sys.modules[__name__] - sys.modules["lightning.pytorch.accelerators.ipu"] = self - sys.modules["lightning.pytorch.strategies.ipu"] = self - sys.modules["lightning.pytorch.plugins.precision.ipu"] = self - - -class IPUAccelerator: - def __init__(self, *_: Any, **__: Any) -> None: - raise NotImplementedError( - "The `IPUAccelerator` class has been moved to an external package." - " Install the extension package as `pip install lightning-graphcore`" - " and import with `from lightning_graphcore import IPUAccelerator`." - " Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details." - ) - - -class IPUStrategy: - def __init__(self, *_: Any, **__: Any) -> None: - raise NotImplementedError( - "The `IPUStrategy` class has been moved to an external package." - " Install the extension package as `pip install lightning-graphcore`" - " and import with `from lightning_graphcore import IPUStrategy`." - " Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details." - ) - - -class IPUPrecisionPlugin: - def __init__(self, *_: Any, **__: Any) -> None: - raise NotImplementedError( - "The `IPUPrecisionPlugin` class has been moved to an external package." - " Install the extension package as `pip install lightning-graphcore`" - " and import with `from lightning_graphcore import IPUPrecisionPlugin`." - " Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details." - ) - - -def _patch_classes() -> None: - setattr(pl.accelerators, "IPUAccelerator", IPUAccelerator) - setattr(pl.strategies, "IPUStrategy", IPUStrategy) - setattr(pl.plugins, "IPUPrecisionPlugin", IPUPrecisionPlugin) - setattr(pl.plugins.precision, "IPUPrecisionPlugin", IPUPrecisionPlugin) - - -_patch_sys_modules() -_patch_classes() diff --git a/src/lightning/pytorch/core/hooks.py b/src/lightning/pytorch/core/hooks.py index 2f510fe270..e1428a442d 100644 --- a/src/lightning/pytorch/core/hooks.py +++ b/src/lightning/pytorch/core/hooks.py @@ -601,10 +601,6 @@ class DataHooks: batch = super().transfer_batch_to_device(batch, device, dataloader_idx) return batch - Raises: - MisconfigurationException: - If using IPUs, ``Trainer(accelerator='ipu')``. - See Also: - :meth:`move_data_to_device` - :meth:`apply_to_collection` @@ -661,10 +657,6 @@ class DataHooks: batch['x'] = gpu_transforms(batch['x']) return batch - Raises: - MisconfigurationException: - If using IPUs, ``Trainer(accelerator='ipu')``. 
- See Also: - :meth:`on_before_batch_transfer` - :meth:`transfer_batch_to_device` diff --git a/src/lightning/pytorch/trainer/configuration_validator.py b/src/lightning/pytorch/trainer/configuration_validator.py index 27cca4b426..23c04523cd 100644 --- a/src/lightning/pytorch/trainer/configuration_validator.py +++ b/src/lightning/pytorch/trainer/configuration_validator.py @@ -16,7 +16,6 @@ import lightning.pytorch as pl from lightning.fabric.utilities.warnings import PossibleUserWarning from lightning.pytorch.trainer.states import TrainerFn from lightning.pytorch.utilities.exceptions import MisconfigurationException -from lightning.pytorch.utilities.imports import _graphcore_available_and_importable from lightning.pytorch.utilities.model_helpers import is_overridden from lightning.pytorch.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature @@ -43,10 +42,7 @@ def _verify_loop_configurations(trainer: "pl.Trainer") -> None: elif trainer.state.fn == TrainerFn.PREDICTING: __verify_eval_loop_configuration(model, "predict") - __verify_batch_transfer_support(trainer) - __verify_configure_model_configuration(model) - __warn_dataloader_iter_limitations(model) @@ -120,22 +116,6 @@ def __verify_eval_loop_configuration(model: "pl.LightningModule", stage: str) -> ) -def __verify_batch_transfer_support(trainer: "pl.Trainer") -> None: - batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer") - datahook_selector = trainer._data_connector._datahook_selector - assert datahook_selector is not None - for hook in batch_transfer_hooks: - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - # TODO: This code could be done in a hook in the IPUAccelerator as it's a simple error check - # through the Trainer. 
It doesn't need to stay in Lightning - if isinstance(trainer.accelerator, IPUAccelerator) and ( - is_overridden(hook, datahook_selector.model) or is_overridden(hook, datahook_selector.datamodule) - ): - raise MisconfigurationException(f"Overriding `{hook}` is not supported with IPUs.") - - def __verify_manual_optimization_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: if model.automatic_optimization: return diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 8940b82749..fd2a5d413b 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -64,7 +64,6 @@ from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import ( _LIGHTNING_COLOSSALAI_AVAILABLE, - _graphcore_available_and_importable, _habana_available_and_importable, ) from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn @@ -338,11 +337,6 @@ class _AcceleratorConnector: """Choose the accelerator type (str) based on availability.""" if XLAAccelerator.is_available(): return "tpu" - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - if IPUAccelerator.is_available(): - return "ipu" if _habana_available_and_importable(): from lightning_habana import HPUAccelerator @@ -420,16 +414,6 @@ class _AcceleratorConnector: return LightningEnvironment() def _choose_strategy(self) -> Union[Strategy, str]: - if self._accelerator_flag == "ipu": - if not _graphcore_available_and_importable(): - raise ImportError( - "You have passed `accelerator='ipu'` but the IPU integration is not installed." - " Please run `pip install lightning-graphcore` or check out" - " https://github.com/Lightning-AI/lightning-Graphcore for instructions" - ) - from lightning_graphcore import IPUStrategy - - return IPUStrategy.strategy_name if self._accelerator_flag == "hpu": if not _habana_available_and_importable(): raise ImportError( @@ -500,16 +484,6 @@ class _AcceleratorConnector: if isinstance(self._precision_plugin_flag, Precision): return self._precision_plugin_flag - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator, IPUPrecision - - # TODO: For the strategies that have a fixed precision class, we don't really need this logic - # in the accelerator. Since the strategy owns the precision plugin, the strategy.precision_plugin - # could be a no-op and then we wouldn't need this. 
- - if isinstance(self.accelerator, IPUAccelerator): - return IPUPrecision(self._precision_flag) - if _habana_available_and_importable(): from lightning_habana import HPUAccelerator, HPUPrecisionPlugin @@ -691,12 +665,3 @@ def _register_external_accelerators_and_strategies() -> None: HPUParallelStrategy.register_strategies(StrategyRegistry) if "hpu_single" not in StrategyRegistry: SingleHPUStrategy.register_strategies(StrategyRegistry) - - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator, IPUStrategy - - # TODO: Prevent registering multiple times - if "ipu" not in AcceleratorRegistry: - IPUAccelerator.register_accelerators(AcceleratorRegistry) - if "ipu_strategy" not in StrategyRegistry: - IPUStrategy.register_strategies(StrategyRegistry) diff --git a/src/lightning/pytorch/trainer/connectors/data_connector.py b/src/lightning/pytorch/trainer/connectors/data_connector.py index 1bc63c62c5..eb1beccec8 100644 --- a/src/lightning/pytorch/trainer/connectors/data_connector.py +++ b/src/lightning/pytorch/trainer/connectors/data_connector.py @@ -34,7 +34,6 @@ from lightning.pytorch.trainer.states import RunningStage, TrainerFn from lightning.pytorch.utilities.combined_loader import CombinedLoader from lightning.pytorch.utilities.data import _is_dataloader_shuffled, _update_dataloader from lightning.pytorch.utilities.exceptions import MisconfigurationException -from lightning.pytorch.utilities.imports import _graphcore_available_and_importable from lightning.pytorch.utilities.model_helpers import is_overridden from lightning.pytorch.utilities.rank_zero import WarningCache, rank_zero_warn from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -165,19 +164,11 @@ class _DataConnector: datamodule.trainer = trainer def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool: - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - # `DistributedSampler` is never used with `poptorch.DataLoader` - is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator) - else: - is_ipu = False return ( self.trainer._accelerator_connector.use_distributed_sampler and self.trainer._accelerator_connector.is_distributed and not isinstance(dataloader.sampler, DistributedSampler) and not has_iterable_dataset(dataloader) - and not is_ipu ) def _prepare_dataloader(self, dataloader: object, shuffle: bool, mode: RunningStage) -> object: @@ -190,18 +181,9 @@ class _DataConnector: # don't do anything if it's not a dataloader if not isinstance(dataloader, DataLoader): return dataloader - - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - # IPUs use a custom `poptorch.DataLoader` which we might need to convert to - is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator) - else: - is_ipu = False if ( self._requires_distributed_sampler(dataloader) # sets the distributed sampler or mode == RunningStage.PREDICTING # to track indices for the predictions - or is_ipu ): sampler = self._resolve_sampler(dataloader, shuffle=shuffle, mode=mode) return _update_dataloader(dataloader, sampler, mode=mode) diff --git a/src/lightning/pytorch/trainer/setup.py b/src/lightning/pytorch/trainer/setup.py index 2dd5af675a..00b546b252 100644 --- a/src/lightning/pytorch/trainer/setup.py +++ b/src/lightning/pytorch/trainer/setup.py @@ -28,7 +28,7 @@ from lightning.pytorch.profilers import ( XLAProfiler, ) from lightning.pytorch.utilities.exceptions import MisconfigurationException -from 
lightning.pytorch.utilities.imports import _graphcore_available_and_importable, _habana_available_and_importable +from lightning.pytorch.utilities.imports import _habana_available_and_importable from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn @@ -158,16 +158,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None: num_tpu_cores = trainer.num_devices if isinstance(trainer.accelerator, XLAAccelerator) else 0 rank_zero_info(f"TPU available: {XLAAccelerator.is_available()}, using: {num_tpu_cores} TPU cores") - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - num_ipus = trainer.num_devices if isinstance(trainer.accelerator, IPUAccelerator) else 0 - ipu_available = IPUAccelerator.is_available() - else: - num_ipus = 0 - ipu_available = False - rank_zero_info(f"IPU available: {ipu_available}, using: {num_ipus} IPUs") - if _habana_available_and_importable(): from lightning_habana import HPUAccelerator @@ -192,12 +182,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None: if XLAAccelerator.is_available() and not isinstance(trainer.accelerator, XLAAccelerator): rank_zero_warn("TPU available but not used. You can set it by doing `Trainer(accelerator='tpu')`.") - if _graphcore_available_and_importable(): - from lightning_graphcore import IPUAccelerator - - if IPUAccelerator.is_available() and not isinstance(trainer.accelerator, IPUAccelerator): - rank_zero_warn("IPU available but not used. You can set it by doing `Trainer(accelerator='ipu')`.") - if _habana_available_and_importable(): from lightning_habana import HPUAccelerator diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index d0988d9d38..74b409a18b 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -136,7 +136,7 @@ class Trainer: r"""Customize every aspect of training via flags. Args: - accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "ipu", "hpu", "mps", "auto") + accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto") as well as custom accelerator instances. strategy: Supports different training strategies with aliases as well custom strategies. @@ -151,7 +151,7 @@ class Trainer: precision: Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'), 16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed'). - Can be used on CPU, GPU, TPUs, HPUs or IPUs. + Can be used on CPU, GPU, TPUs, or HPUs. Default: ``'32-true'``. logger: Logger (or iterable collection of loggers) for experiment tracking. 
A ``True`` value uses diff --git a/src/lightning/pytorch/utilities/imports.py b/src/lightning/pytorch/utilities/imports.py index 2723eb2ccc..eabc1c1469 100644 --- a/src/lightning/pytorch/utilities/imports.py +++ b/src/lightning/pytorch/utilities/imports.py @@ -41,15 +41,6 @@ def _try_import_module(module_name: str) -> bool: return False -_LIGHTNING_GRAPHCORE_AVAILABLE = RequirementCache("lightning-graphcore>=0.1.0") - - -def _graphcore_available_and_importable() -> bool: - # This is defined as a function instead of a constant to avoid circular imports, because `lightning_graphcore` - # also imports Lightning - return bool(_LIGHTNING_GRAPHCORE_AVAILABLE) and _try_import_module("lightning_graphcore") - - _LIGHTNING_HABANA_AVAILABLE = RequirementCache("lightning-habana>=1.2.0") diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index 067038614a..f1e1d5dfc7 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -62,7 +62,7 @@ Lightning forces the following structure to your code which makes it reusable an - Non-essential research code (logging, etc... this goes in Callbacks). - Data (use PyTorch DataLoaders or organize them into a LightningDataModule). -Once you do this, you can train on multiple-GPUs, TPUs, CPUs, IPUs, HPUs and even in 16-bit precision without changing your code! +Once you do this, you can train on multiple-GPUs, TPUs, CPUs, HPUs and even in 16-bit precision without changing your code! [Get started in just 15 minutes](https://lightning.ai/docs/pytorch/latest/starter/introduction.html) diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py index d5e20338c1..64eb5c1f1f 100644 --- a/tests/tests_fabric/conftest.py +++ b/tests/tests_fabric/conftest.py @@ -63,7 +63,6 @@ def restore_env_variables(): "PL_GLOBAL_SEED", "PL_SEED_WORKERS", "RANK", # set by DeepSpeed - "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy "CUDA_MODULE_LOADING", # leaked by PyTorch "CRC32C_SW_MODE", # set by tensorboardX "OMP_NUM_THREADS", # set by our launchers diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 96554ff24b..f42f53a56a 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -83,7 +83,6 @@ def restore_env_variables(): "WANDB_REQUIRE_SERVICE", "WANDB_SERVICE", "RANK", # set by DeepSpeed - "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy "CUDA_MODULE_LOADING", # leaked by PyTorch "KMP_INIT_AT_FORK", # leaked by PyTorch "KMP_DUPLICATE_LIB_OK", # leaked by PyTorch diff --git a/tests/tests_pytorch/graveyard/test_ipu.py b/tests/tests_pytorch/graveyard/test_ipu.py deleted file mode 100644 index 520729f80c..0000000000 --- a/tests/tests_pytorch/graveyard/test_ipu.py +++ /dev/null @@ -1,21 +0,0 @@ -from importlib import import_module - -import pytest - - -@pytest.mark.parametrize( - ("import_path", "name"), - [ - ("lightning.pytorch.accelerators", "IPUAccelerator"), - ("lightning.pytorch.accelerators.ipu", "IPUAccelerator"), - ("lightning.pytorch.strategies", "IPUStrategy"), - ("lightning.pytorch.strategies.ipu", "IPUStrategy"), - ("lightning.pytorch.plugins.precision", "IPUPrecisionPlugin"), - ("lightning.pytorch.plugins.precision.ipu", "IPUPrecisionPlugin"), - ], -) -def test_extracted_ipu(import_path, name): - module = import_module(import_path) - cls = getattr(module, name) - with pytest.raises(NotImplementedError, match=f"{name}` class has been moved to an external package.*"): - cls() diff --git 
a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index a9c5306815..b1cafbf9dc 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -58,7 +58,6 @@ from lightning.pytorch.trainer.connectors.accelerator_connector import _Accelera from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import ( _LIGHTNING_HABANA_AVAILABLE, - _graphcore_available_and_importable, ) from lightning_utilities.core.imports import package_available @@ -580,16 +579,6 @@ def test_unsupported_tpu_choice(xla_available, tpu_available): Trainer(accelerator="tpu", precision="16-true", strategy="ddp") -def mock_ipu_available(monkeypatch, value=True): - # TODO: this isn't really mocking. it should be implemented and used as `mock_hpu_count` - try: - import lightning_graphcore - except ModuleNotFoundError: - return - monkeypatch.setattr(lightning_graphcore.accelerator, "_IPU_AVAILABLE", value) - monkeypatch.setattr(lightning_graphcore.strategy, "_IPU_AVAILABLE", value) - - if _LIGHTNING_HABANA_AVAILABLE: from lightning_habana import HPUAccelerator, HPUParallelStrategy, SingleHPUStrategy else: @@ -657,7 +646,6 @@ def mock_hpu_count(monkeypatch, n=1): def test_devices_auto_choice_cpu(monkeypatch, cuda_count_0): mock_hpu_count(monkeypatch, 0) - mock_ipu_available(monkeypatch, False) mock_xla_available(monkeypatch, False) trainer = Trainer(accelerator="auto", devices="auto") assert trainer.num_devices == 1 @@ -915,7 +903,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 0) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) trainer = Trainer() assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, SingleDeviceStrategy) @@ -927,7 +914,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 1) mock_mps_count(monkeypatch, 0) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) trainer = Trainer() assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, SingleDeviceStrategy) @@ -939,7 +925,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 4) mock_mps_count(monkeypatch, 0) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) trainer = Trainer() assert isinstance(trainer.accelerator, CUDAAccelerator) assert isinstance(trainer.strategy, (SingleDeviceStrategy if is_interactive else DDPStrategy)) @@ -955,7 +940,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 1) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) connector = _AcceleratorConnector() assert isinstance(connector.accelerator, MPSAccelerator) assert isinstance(connector.strategy, SingleDeviceStrategy) @@ -965,7 +949,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): with monkeypatch.context(): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 0) - mock_ipu_available(monkeypatch, False) _mock_tpu_available(True) monkeypatch.setattr(lightning.pytorch.accelerators.XLAAccelerator, "auto_device_count", lambda *_: 1) monkeypatch.setattr(torch, "device", DeviceMock()) @@ -982,7 
+965,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 0) _mock_tpu_available(True) - mock_ipu_available(monkeypatch, False) connector = _AcceleratorConnector() assert isinstance(connector.accelerator, XLAAccelerator) assert isinstance(connector.strategy, XLAStrategy) @@ -991,28 +973,11 @@ def test_connector_auto_selection(monkeypatch, is_interactive): assert connector.strategy._start_method == "fork" assert connector.strategy.launcher.is_interactive_compatible - # Single/Multi IPU: strategy is the same - if _graphcore_available_and_importable(): - with monkeypatch.context(): - mock_cuda_count(monkeypatch, 0) - mock_mps_count(monkeypatch, 0) - mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, True) - from lightning_graphcore import IPUAccelerator, IPUStrategy - - connector = _AcceleratorConnector() - assert isinstance(connector.accelerator, IPUAccelerator) - assert isinstance(connector.strategy, IPUStrategy) - assert connector._devices_flag == 4 - assert isinstance(connector.strategy.cluster_environment, LightningEnvironment) - assert connector.strategy.launcher is None - # Single HPU with monkeypatch.context(): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 0) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) mock_hpu_count(monkeypatch, 1) connector = _AcceleratorConnector() assert isinstance(connector.accelerator, HPUAccelerator) @@ -1029,7 +994,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 0) mock_mps_count(monkeypatch, 0) mock_tpu_available(monkeypatch, False) - mock_ipu_available(monkeypatch, False) mock_hpu_count(monkeypatch, 8) connector = _AcceleratorConnector() assert isinstance(connector.accelerator, HPUAccelerator) @@ -1047,7 +1011,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive): mock_cuda_count(monkeypatch, 2) mock_mps_count(monkeypatch, 0) _mock_tpu_available(True) - mock_ipu_available(monkeypatch, False) connector = _AcceleratorConnector() assert isinstance(connector.accelerator, XLAAccelerator) assert isinstance(connector.strategy, XLAStrategy)
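One follow-up the test changes above leave open: nothing now asserts that the old shim import paths stay gone. A possible (hypothetical) regression test in the spirit of the deleted ``tests_pytorch/graveyard/test_ipu.py``:

```python
import importlib

import pytest


@pytest.mark.parametrize(
    "module",
    [
        "lightning.pytorch.accelerators.ipu",
        "lightning.pytorch.strategies.ipu",
        "lightning.pytorch.plugins.precision.ipu",
    ],
)
def test_ipu_shim_modules_removed(module):
    # with `lightning.pytorch._graveyard.ipu` deleted, these aliases are no longer
    # injected into sys.modules, so importing them should fail outright
    with pytest.raises(ModuleNotFoundError):
        importlib.import_module(module)
```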