Remove the Graphcore IPU integration (#19405)
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
parent 8d4768f2ae
commit e950bb4828
@ -27,7 +27,6 @@ Brief description of all our automation tools used for boosting development perf
|
|||
|
||||
- GPU: 2 x NVIDIA RTX 3090
|
||||
- TPU: [Google TPU v4-8](https://cloud.google.com/tpu/docs)
|
||||
- IPU: [Colossus MK1 IPU](https://www.graphcore.ai/products/ipu)
|
||||
|
||||
- To check which versions of Python or PyTorch are used for testing in our CI, see the corresponding workflow files or checkgroup config file at [`.github/checkgroup.yml`](../checkgroup.yml).
|
||||
|
||||
|
|
|
@ -22,9 +22,7 @@ docs/source-pytorch/notebooks
|
|||
docs/source-pytorch/_static/images/course_UvA-DL
|
||||
docs/source-pytorch/_static/images/lightning_examples
|
||||
docs/source-pytorch/_static/fetched-s3-assets
|
||||
docs/source-pytorch/_static/images/ipu/
|
||||
docs/source-pytorch/integrations/hpu
|
||||
docs/source-pytorch/integrations/ipu
|
||||
|
||||
docs/source-fabric/*/generated
|
||||
|
||||
|
|
|
@ -53,7 +53,6 @@ And that's it!
|
|||
|
||||
GPU available: True (mps), used: False
|
||||
TPU available: False, using: 0 TPU cores
|
||||
IPU available: False, using: 0 IPUs
|
||||
|
||||
| Name | Type | Params | In sizes | Out sizes
|
||||
------------------------------------------------------------------
|
||||
|
|
|
@ -20,7 +20,7 @@ Training on Accelerators
|
|||
|
||||
**Use when:** Whenever possible!
|
||||
|
||||
With Lightning, running on GPUs, TPUs, IPUs on multiple nodes is a simple switch of a flag.
|
||||
With Lightning, running on GPUs, TPUs, HPUs on multiple nodes is a simple switch of a flag.
|
||||
|
||||
GPU Training
|
||||
============
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
../advanced/model_parallel
|
||||
Train on single or multiple GPUs <../accelerators/gpu>
|
||||
Train on single or multiple HPUs <../integrations/hpu/index>
|
||||
Train on single or multiple IPUs <../integrations/ipu/index>
|
||||
Train on single or multiple TPUs <../accelerators/tpu>
|
||||
Train on MPS <../accelerators/mps>
|
||||
Use a pretrained model <../advanced/pretrained>
|
||||
|
@ -168,13 +167,6 @@ How-to Guides
|
|||
:col_css: col-md-4
|
||||
:height: 180
|
||||
|
||||
.. displayitem::
|
||||
:header: Train on single or multiple IPUs
|
||||
:description: Train models faster with IPU accelerators
|
||||
:button_link: ../integrations/ipu/index.html
|
||||
:col_css: col-md-4
|
||||
:height: 180
|
||||
|
||||
.. displayitem::
|
||||
:header: Train on single or multiple TPUs
|
||||
:description: TTrain models faster with TPU accelerators
|
||||
|
|
|
@ -103,31 +103,26 @@ Precision support by accelerator
|
|||
********************************
|
||||
|
||||
.. list-table:: Precision with Accelerators
|
||||
:widths: 20 20 20 20 20
|
||||
:widths: 20 20 20 20
|
||||
:header-rows: 1
|
||||
|
||||
* - Precision
|
||||
- CPU
|
||||
- GPU
|
||||
- TPU
|
||||
- IPU
|
||||
* - 16 Mixed
|
||||
- No
|
||||
- Yes
|
||||
- No
|
||||
- Yes
|
||||
* - BFloat16 Mixed
|
||||
- Yes
|
||||
- Yes
|
||||
- Yes
|
||||
- No
|
||||
* - 32 True
|
||||
- Yes
|
||||
- Yes
|
||||
- Yes
|
||||
- Yes
|
||||
* - 64 True
|
||||
- Yes
|
||||
- Yes
|
||||
- No
|
||||
- No
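
The table above maps precision modes to accelerators. As a quick illustration of how these modes are requested (a minimal sketch; the flag values follow the Lightning 2.x Trainer arguments quoted later in this diff):

    from lightning.pytorch import Trainer

    # 16-bit mixed precision is listed as GPU-only in the table
    trainer = Trainer(accelerator="gpu", devices=1, precision="16-mixed")

    # bfloat16 mixed precision also works on CPU and TPU per the table
    trainer = Trainer(accelerator="cpu", precision="bf16-mixed")

    # full 64-bit precision on CPU or GPU
    trainer = Trainer(accelerator="gpu", devices=1, precision="64-true")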
@ -175,7 +175,7 @@ Trainer flags
|
|||
accelerator
|
||||
^^^^^^^^^^^
|
||||
|
||||
Supports passing different accelerator types (``"cpu", "gpu", "tpu", "ipu", "auto"``)
|
||||
Supports passing different accelerator types (``"cpu", "gpu", "tpu", "hpu", "auto"``)
|
||||
as well as custom accelerator instances.
|
||||
|
||||
.. code-block:: python
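
The flag accepts either a short-hand name or a custom accelerator instance, as described above. A minimal sketch of both forms, using the built-in CPUAccelerator as the instance example:

    from lightning.pytorch import Trainer
    from lightning.pytorch.accelerators import CPUAccelerator

    # by short-hand name
    trainer = Trainer(accelerator="gpu", devices=2)

    # or by passing an accelerator instance
    trainer = Trainer(accelerator=CPUAccelerator(), devices=1)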
@ -393,9 +393,6 @@ Number of devices to train on (``int``), which devices to train on (``list`` or
|
|||
# Training with TPU Accelerator using 8 tpu cores
|
||||
trainer = Trainer(devices="auto", accelerator="tpu")
|
||||
|
||||
# Training with IPU Accelerator using 4 ipus
|
||||
trainer = Trainer(devices="auto", accelerator="ipu")
|
||||
|
||||
.. note::
|
||||
|
||||
If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the ``auto_device_count``
|
||||
|
|
|
@ -133,13 +133,6 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
:button_link: integrations/hpu/index.html
|
||||
:height: 100
|
||||
|
||||
.. displayitem::
|
||||
:header: Train on single or multiple IPUs
|
||||
:description: Train models faster with IPUs.
|
||||
:col_css: col-md-12
|
||||
:button_link: integrations/ipu/index.html
|
||||
:height: 100
|
||||
|
||||
.. displayitem::
|
||||
:header: Train on single or multiple TPUs
|
||||
:description: Train models faster with TPUs.
|
||||
|
|
|
@ -94,18 +94,6 @@ assist_local.AssistantCLI.pull_docs_files(
|
|||
target_dir="docs/source-pytorch/integrations/hpu",
|
||||
checkout="refs/tags/1.3.0",
|
||||
)
|
||||
assist_local.AssistantCLI.pull_docs_files(
|
||||
gh_user_repo="Lightning-AI/lightning-Graphcore",
|
||||
target_dir="docs/source-pytorch/integrations/ipu",
|
||||
checkout="refs/tags/v0.1.0",
|
||||
as_orphan=True, # todo: this can be dropped after new IPU release
|
||||
)
|
||||
# the IPU also need one image
|
||||
URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Graphcore/v0.1.0/docs/source"
|
||||
for img in ["_static/images/ipu/profiler.png"]:
|
||||
img_ = os.path.join(_PATH_HERE, "integrations", "ipu", img)
|
||||
os.makedirs(os.path.dirname(img_), exist_ok=True)
|
||||
urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_)
|
||||
|
||||
# Copy strategies docs as single pages
|
||||
assist_local.AssistantCLI.pull_docs_files(
|
||||
|
@ -340,7 +328,6 @@ intersphinx_mapping = {
|
|||
"numpy": ("https://numpy.org/doc/stable/", None),
|
||||
"PIL": ("https://pillow.readthedocs.io/en/stable/", None),
|
||||
"torchmetrics": ("https://torchmetrics.readthedocs.io/en/stable/", None),
|
||||
"graphcore": ("https://docs.graphcore.ai/en/latest/", None),
|
||||
"lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
|
||||
"tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
|
||||
# needed for referencing App from lightning scope
|
||||
|
|
|
@ -190,34 +190,26 @@ Configure all aspects of Lightning for advanced usecases.
|
|||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 18: Explore IPUs
|
||||
:description: Explore Intelligence Processing Unit (IPU) for model scaling.
|
||||
:header: Level 18: Explore HPUs
|
||||
:description: Explore Havana Gaudi Processing Unit (HPU) for model scaling.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/advanced_level_19.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 19: Explore HPUs
|
||||
:description: Explore Havana Gaudi Processing Unit (HPU) for model scaling.
|
||||
:header: Level 19: Master TPUs
|
||||
:description: Master TPUs and run on cloud TPUs.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/advanced_level_20.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 20: Master TPUs
|
||||
:description: Master TPUs and run on cloud TPUs.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/advanced_level_21.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 21: Train models with billions of parameters
|
||||
:header: Level 20: Train models with billions of parameters
|
||||
:description: Scale GPU training to models with billions of parameters
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/advanced_level_22.html
|
||||
:button_link: levels/advanced_level_21.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
|
@ -240,7 +232,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 22: Extend the Lightning CLI
|
||||
:header: Level 21: Extend the Lightning CLI
|
||||
:description: Extend the functionality of the Lightning CLI.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/expert_level_23.html
|
||||
|
@ -248,7 +240,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 23: Integrate a custom cluster
|
||||
:header: Level 22: Integrate a custom cluster
|
||||
:description: Integrate a custom cluster into Lightning.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/expert_level_24.html
|
||||
|
@ -256,7 +248,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 24: Make your own profiler
|
||||
:header: Level 23: Make your own profiler
|
||||
:description: Make your own profiler.
|
||||
:col_css: col-md-6
|
||||
:button_link: tuning/profiler_expert.html
|
||||
|
@ -264,10 +256,10 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 25: Add a new accelerator or Strategy
|
||||
:header: Level 24: Add a new accelerator or Strategy
|
||||
:description: Integrate a new accelerator or distributed strategy.
|
||||
:col_css: col-md-6
|
||||
:button_link: levels/expert_level_27.html
|
||||
:button_link: levels/expert_level_25.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
|
|
|
@ -4,13 +4,12 @@
|
|||
Accelerator
|
||||
###########
|
||||
|
||||
The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, IPUs, MPS, ...).
|
||||
The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, HPUs, MPS, ...).
|
||||
Currently there are accelerators for:
|
||||
|
||||
- CPU
|
||||
- :doc:`GPU <../accelerators/gpu>`
|
||||
- :doc:`TPU <../accelerators/tpu>`
|
||||
- :doc:`IPU <../integrations/ipu/index>`
|
||||
- :doc:`HPU <../integrations/hpu/index>`
|
||||
- :doc:`MPS <../accelerators/mps>`
|
||||
|
||||
|
|
|
@ -57,9 +57,6 @@ Here are some examples:
|
|||
# Training with the DDP Spawn strategy on 8 TPU cores
|
||||
trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", devices=8)
|
||||
|
||||
# Training with the default IPU strategy on 8 IPUs
|
||||
trainer = Trainer(accelerator="ipu", devices=8)
|
||||
|
||||
The below table lists all relevant strategies available in Lightning with their corresponding short-hand name:
|
||||
|
||||
.. list-table:: Strategy Classes and Nicknames
|
||||
|
@ -87,9 +84,6 @@ The below table lists all relevant strategies available in Lightning with their
|
|||
* - hpu_single
|
||||
- ``SingleHPUStrategy``
|
||||
- Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>`
|
||||
* - ipu_strategy
|
||||
- ``IPUStrategy``
|
||||
- Plugin for training on IPU devices. :doc:`Learn more. <../integrations/ipu/index>`
|
||||
* - xla
|
||||
- :class:`~lightning.pytorch.strategies.XLAStrategy`
|
||||
- Strategy for training on multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` method. :doc:`Learn more. <../accelerators/tpu>`
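
The short-hand names in the table map to strategy classes, so a strategy can be selected either way. A brief sketch (the DDPStrategy keyword shown is an assumption for illustration):

    from lightning.pytorch import Trainer
    from lightning.pytorch.strategies import DDPStrategy

    # by the registered short-hand name
    trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)

    # or by instantiating the class to customize it
    trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=4)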
@ -20,7 +20,6 @@
|
|||
Half precision <../common/precision>
|
||||
HPU <../integrations/hpu/index>
|
||||
Inference <../deploy/production_intermediate>
|
||||
IPU <../integrations/ipu/index>
|
||||
Lightning CLI <../cli/lightning_cli>
|
||||
LightningDataModule <../data/datamodule>
|
||||
LightningModule <../common/lightning_module>
|
||||
|
@ -177,13 +176,6 @@ Glossary
|
|||
:button_link: ../deploy/production_intermediate.html
|
||||
:height: 100
|
||||
|
||||
.. displayitem::
|
||||
:header: IPU
|
||||
:description: Graphcore Intelligence Processing Unit for faster training
|
||||
:col_css: col-md-12
|
||||
:button_link: ../integrations/ipu/index.html
|
||||
:height: 100
|
||||
|
||||
.. displayitem::
|
||||
:header: Lightning CLI
|
||||
:description: A Command-line Interface (CLI) to interact with Lightning code via a terminal
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
.. _ipu:
|
||||
|
||||
Accelerator: IPU training
|
||||
=========================
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="display-card-container">
|
||||
<div class="row">
|
||||
|
||||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Prepare your code (Optional)
|
||||
:description: Prepare your code to run on any hardware
|
||||
:col_css: col-md-6
|
||||
:button_link: accelerator_prepare.html
|
||||
:height: 150
|
||||
:tag: basic
|
||||
|
||||
.. displayitem::
|
||||
:header: Basic
|
||||
:description: Learn the basics of single and multi-IPU training.
|
||||
:col_css: col-md-6
|
||||
:button_link: ipu_basic.html
|
||||
:height: 150
|
||||
:tag: basic
|
||||
|
||||
.. displayitem::
|
||||
:header: Intermediate
|
||||
:description: Tune model performance with mix-precision settings and the performance analyser.
|
||||
:col_css: col-md-6
|
||||
:button_link: ipu_intermediate.html
|
||||
:height: 150
|
||||
:tag: intermediate
|
||||
|
||||
.. displayitem::
|
||||
:header: Advanced
|
||||
:description: Learn advanced techniques to customize IPU training for massive models.
|
||||
:col_css: col-md-6
|
||||
:button_link: ipu_advanced.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
</div>
|
|
@ -46,34 +46,26 @@ Configure all aspects of Lightning for advanced usecases.
|
|||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 18: Explore IPUs
|
||||
:description: Explore Intelligence Processing Unit (IPU) for model scaling.
|
||||
:header: Level 18: Explore HPUs
|
||||
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
|
||||
:col_css: col-md-6
|
||||
:button_link: advanced_level_19.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 19: Explore HPUs
|
||||
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
|
||||
:header: Level 19: Master TPUs
|
||||
:description: Master TPUs and run on cloud TPUs.
|
||||
:col_css: col-md-6
|
||||
:button_link: advanced_level_20.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 20: Master TPUs
|
||||
:description: Master TPUs and run on cloud TPUs.
|
||||
:col_css: col-md-6
|
||||
:button_link: advanced_level_21.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 21: Train models with billions of parameters
|
||||
:header: Level 20: Train models with billions of parameters
|
||||
:description: Scale GPU training to models with billions of parameters
|
||||
:col_css: col-md-6
|
||||
:button_link: advanced_level_22.html
|
||||
:button_link: advanced_level_21.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
:orphan:
|
||||
|
||||
######################
|
||||
Level 18: Explore IPUs
|
||||
Level 18: Explore HPUs
|
||||
######################
|
||||
|
||||
Explore Intelligence Processing Unit (IPU) for model scaling.
|
||||
Explore Intel Habana Processing Unit (HPU) for model scaling.
|
||||
|
||||
----
|
||||
|
||||
|
@ -16,26 +16,18 @@ Explore Intelligence Processing Unit (IPU) for model scaling.
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Prepare your code (Optional)
|
||||
:description: Prepare your code to run on any hardware.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../accelerators/accelerator_prepare.html
|
||||
:header: Train models on HPUs
|
||||
:description: Learn the basics of single and multi-HPU core training.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../integrations/hpu/basic.html
|
||||
:height: 150
|
||||
:tag: basic
|
||||
|
||||
.. displayitem::
|
||||
:header: Train models on IPUs
|
||||
:description: Learn the basics of single and multi-IPU training.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../integrations/ipu/basic.html
|
||||
:height: 150
|
||||
:tag: basic
|
||||
|
||||
.. displayitem::
|
||||
:header: Optimize models training on IPUs
|
||||
:description: Tune model performance with mixed precision and the performance analyser.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../integrations/ipu/intermediate.html
|
||||
:header: Optimize models training on HPUs
|
||||
:description: Enable state-of-the-art scaling with advanced mixed-precision settings.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../integrations/hpu/intermediate.html
|
||||
:height: 150
|
||||
:tag: intermediate
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
:orphan:
|
||||
|
||||
######################
|
||||
Level 19: Explore HPUs
|
||||
######################
|
||||
#####################
|
||||
Level 19: Master TPUs
|
||||
#####################
|
||||
|
||||
Explore Intel Habana Processing Unit (HPU) for model scaling.
|
||||
Master cloud TPU training with profiling and scaling techniques.
|
||||
|
||||
----
|
||||
|
||||
|
@ -16,20 +16,28 @@ Explore Intel Habana Processing Unit (HPU) for model scaling.
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Train models on HPUs
|
||||
:description: Learn the basics of single and multi-HPU core training.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../integrations/hpu/basic.html
|
||||
:height: 150
|
||||
:tag: basic
|
||||
:header: Run on cloud TPUs
|
||||
:description: Scale massive models using cloud TPUs.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../accelerators/tpu_intermediate.html
|
||||
:height: 180
|
||||
:tag: intermediate
|
||||
|
||||
.. displayitem::
|
||||
:header: Optimize models training on HPUs
|
||||
:description: Enable state-of-the-art scaling with advanced mixed-precision settings.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../integrations/hpu/intermediate.html
|
||||
:height: 150
|
||||
:tag: intermediate
|
||||
:header: Explore advanced TPU scaling techniques
|
||||
:description: Dive into XLA and advanced techniques to optimize TPU-powered models.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../accelerators/tpu_advanced.html
|
||||
:height: 180
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Profile TPU code
|
||||
:description: Learn to profile TPU code.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../tuning/profiler_advanced.html
|
||||
:height: 180
|
||||
:tag: advanced
|
||||
|
||||
.. raw:: html
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
:orphan:
|
||||
|
||||
#####################
|
||||
Level 20: Master TPUs
|
||||
#####################
|
||||
##################################################
|
||||
Level 20: Train models with billions of parameters
|
||||
##################################################
|
||||
|
||||
Master cloud TPU training with profiling and scaling techniques.
|
||||
Scale to billions of parameters with multiple distributed strategies.
|
||||
|
||||
----
|
||||
|
||||
|
@ -16,27 +16,19 @@ Master cloud TPU training with profiling and scaling techniques.
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Run on cloud TPUs
|
||||
:description: Scale massive models using cloud TPUs.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../accelerators/tpu_intermediate.html
|
||||
:height: 180
|
||||
:header: Scale with distributed strategies
|
||||
:description: Learn about different distributed strategies to reach bigger model parameter sizes.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../accelerators/gpu_intermediate.html
|
||||
:height: 150
|
||||
:tag: intermediate
|
||||
|
||||
.. displayitem::
|
||||
:header: Explore advanced TPU scaling techniques
|
||||
:description: Dive into XLA and advanced techniques to optimize TPU-powered models.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../accelerators/tpu_advanced.html
|
||||
:height: 180
|
||||
:tag: advanced
|
||||
|
||||
.. displayitem::
|
||||
:header: Profile TPU code
|
||||
:description: Learn to profile TPU code.
|
||||
:col_css: col-md-4
|
||||
:button_link: ../tuning/profiler_advanced.html
|
||||
:height: 180
|
||||
:header: Train models with billions of parameters
|
||||
:description: Scale to billions of params on GPUs with FSDP or Deepspeed.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../advanced/model_parallel.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. raw:: html
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
##################################################
|
||||
Level 21: Train models with billions of parameters
|
||||
##################################################
|
||||
|
||||
Scale to billions of parameters with multiple distributed strategies.
|
||||
|
||||
----
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="display-card-container">
|
||||
<div class="row">
|
||||
|
||||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Scale with distributed strategies
|
||||
:description: Learn about different distributed strategies to reach bigger model parameter sizes.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../accelerators/gpu_intermediate.html
|
||||
:height: 150
|
||||
:tag: intermediate
|
||||
|
||||
.. displayitem::
|
||||
:header: Train models with billions of parameters
|
||||
:description: Scale to billions of params on GPUs with FSDP or Deepspeed.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../advanced/model_parallel.html
|
||||
:height: 150
|
||||
:tag: advanced
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
</div>
|
|
@ -14,23 +14,23 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 22: Extend the Lightning CLI
|
||||
:header: Level 21: Extend the Lightning CLI
|
||||
:description: Extend the functionality of the Lightning CLI.
|
||||
:col_css: col-md-6
|
||||
:button_link: expert_level_22.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 22: Integrate a custom cluster
|
||||
:description: Integrate a custom cluster into Lightning.
|
||||
:col_css: col-md-6
|
||||
:button_link: expert_level_23.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 23: Integrate a custom cluster
|
||||
:description: Integrate a custom cluster into Lightning.
|
||||
:col_css: col-md-6
|
||||
:button_link: expert_level_24.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 24: Make your own profiler
|
||||
:header: Level 23: Make your own profiler
|
||||
:description: Make your own profiler.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../tuning/profiler_expert.html
|
||||
|
@ -38,7 +38,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
|
|||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Level 25: Add a new accelerator or Strategy
|
||||
:header: Level 24: Add a new accelerator or Strategy
|
||||
:description: Integrate a new accelerator or distributed strategy.
|
||||
:col_css: col-md-6
|
||||
:button_link: expert_level_27.html
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
:orphan:
|
||||
|
||||
##################################
|
||||
Level 21: Extend the Lightning CLI
|
||||
##################################
|
||||
|
||||
Extend the functionality of the Lightning CLI.
|
||||
|
||||
----
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="display-card-container">
|
||||
<div class="row">
|
||||
|
||||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Customize configs for complex projects
|
||||
:description: Learn how to connect complex projects with each Registry.
|
||||
:col_css: col-md-6
|
||||
:button_link: ../cli/lightning_cli_advanced_3.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Extend the Lightning CLI
|
||||
:description: Customize the Lightning CLI
|
||||
:col_css: col-md-6
|
||||
:button_link: ../cli/lightning_cli_expert.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
</div>
|
|
@ -1,8 +1,8 @@
|
|||
:orphan:
|
||||
|
||||
##################################
|
||||
Level 22: Extend the Lightning CLI
|
||||
##################################
|
||||
####################################
|
||||
Level 22: Integrate a custom cluster
|
||||
####################################
|
||||
|
||||
Extend the functionality of the Lightning CLI.
|
||||
|
||||
|
@ -16,18 +16,10 @@ Extend the functionality of the Lightning CLI.
|
|||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Customize configs for complex projects
|
||||
:description: Learn how to connect complex projects with each Registry.
|
||||
:header: Integrate your own cluster
|
||||
:description: Learn how to integrate your own cluster
|
||||
:col_css: col-md-6
|
||||
:button_link: ../cli/lightning_cli_advanced_3.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. displayitem::
|
||||
:header: Extend the Lightning CLI
|
||||
:description: Customize the Lightning CLI
|
||||
:col_css: col-md-6
|
||||
:button_link: ../cli/lightning_cli_expert.html
|
||||
:button_link: ../clouds/cluster_expert.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
:orphan:
|
||||
|
||||
####################################
|
||||
Level 23: Integrate a custom cluster
|
||||
####################################
|
||||
|
||||
Extend the functionality of the Lightning CLI.
|
||||
|
||||
----
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="display-card-container">
|
||||
<div class="row">
|
||||
|
||||
.. Add callout items below this line
|
||||
|
||||
.. displayitem::
|
||||
:header: Integrate your own cluster
|
||||
:description: Learn how to integrate your own cluster
|
||||
:col_css: col-md-6
|
||||
:button_link: ../clouds/cluster_expert.html
|
||||
:height: 150
|
||||
:tag: expert
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
</div>
|
|
@ -1,7 +1,7 @@
|
|||
:orphan:
|
||||
|
||||
###########################################
|
||||
Level 25: Add a new accelerator or Strategy
|
||||
Level 24: Add a new accelerator or Strategy
|
||||
###########################################
|
||||
|
||||
Integrate a new accelerator or distributed strategy.
|
|
@ -345,4 +345,4 @@ Here is an example using a closure function.
|
|||
opt.step(closure=closure)
|
||||
|
||||
.. warning::
|
||||
The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP, IPUs, or DeepSpeed.
|
||||
The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP or DeepSpeed.
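
Because LBFGS re-evaluates the loss inside each step, the closure pattern above is typically paired with manual optimization. A minimal sketch (the model and loss are illustrative assumptions):

    import torch
    from lightning.pytorch import LightningModule


    class LBFGSExample(LightningModule):
        def __init__(self):
            super().__init__()
            self.automatic_optimization = False  # closures are stepped manually
            self.layer = torch.nn.Linear(32, 1)

        def training_step(self, batch, batch_idx):
            opt = self.optimizers()
            x, y = batch

            def closure():
                loss = torch.nn.functional.mse_loss(self.layer(x), y)
                opt.zero_grad()
                self.manual_backward(loss)
                return loss

            opt.step(closure=closure)

        def configure_optimizers(self):
            return torch.optim.LBFGS(self.parameters(), lr=0.1)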
@ -110,7 +110,7 @@ If the profiler report becomes too long, you can stream the report to a file:
|
|||
*************************
|
||||
Measure accelerator usage
|
||||
*************************
|
||||
Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/IPU/HPU).
|
||||
Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/HPU).
|
||||
This can be measured with the :class:`~lightning.pytorch.callbacks.device_stats_monitor.DeviceStatsMonitor`:
|
||||
|
||||
.. testcode::
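
A minimal sketch of attaching the callback mentioned above:

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import DeviceStatsMonitor

    # logs accelerator utilization stats to the attached logger during training
    trainer = Trainer(callbacks=[DeviceStatsMonitor()])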
@ -1,82 +0,0 @@
|
|||
# Copyright The Lightning AI team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
from lightning.pytorch import LightningModule, Trainer
|
||||
from lightning.pytorch.demos.mnist_datamodule import MNISTDataModule
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class LitClassifier(LightningModule):
|
||||
def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
|
||||
super().__init__()
|
||||
self.save_hyperparameters()
|
||||
|
||||
self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
|
||||
self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)
|
||||
|
||||
self.val_outptus = []
|
||||
self.test_outputs = []
|
||||
|
||||
def forward(self, x):
|
||||
x = x.view(x.size(0), -1)
|
||||
x = torch.relu(self.l1(x))
|
||||
return torch.relu(self.l2(x))
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
y_hat = self(x)
|
||||
return F.cross_entropy(y_hat, y)
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
probs = self(x)
|
||||
acc = self.accuracy(probs, y)
|
||||
self.val_outputs.append(acc)
|
||||
return acc
|
||||
|
||||
def test_step(self, batch, batch_idx):
|
||||
x, y = batch
|
||||
logits = self(x)
|
||||
acc = self.accuracy(logits, y)
|
||||
self.test_outputs.append(acc)
|
||||
return acc
|
||||
|
||||
def accuracy(self, logits, y):
|
||||
# currently IPU poptorch doesn't implicit convert bools to tensor
|
||||
# hence we use an explicit calculation for accuracy here. Once fixed in poptorch
|
||||
# we can use the accuracy metric.
|
||||
return torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
|
||||
|
||||
def on_validation_epoch_end(self) -> None:
|
||||
# since the training step/validation step and test step are run on the IPU device
|
||||
# we must log the average loss outside the step functions.
|
||||
self.log("val_acc", torch.stack(self.val_outptus).mean(), prog_bar=True)
|
||||
self.val_outptus.clear()
|
||||
|
||||
def on_test_epoch_end(self) -> None:
|
||||
self.log("test_acc", torch.stack(self.test_outputs).mean())
|
||||
self.test_outputs.clear()
|
||||
|
||||
def configure_optimizers(self):
|
||||
return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dm = MNISTDataModule(batch_size=32)
|
||||
model = LitClassifier()
|
||||
trainer = Trainer(max_epochs=2, accelerator="ipu", devices=8)
|
||||
|
||||
trainer.fit(model, datamodule=dm)
|
||||
trainer.test(model, datamodule=dm)
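
The removed example above was tied to `accelerator="ipu"`; the same LitClassifier can still be exercised with automatic accelerator selection, roughly as follows (a sketch reusing the class defined in the removed file):

    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.mnist_datamodule import MNISTDataModule

    dm = MNISTDataModule(batch_size=32)
    model = LitClassifier()  # the class from the removed example above
    trainer = Trainer(max_epochs=2, accelerator="auto", devices="auto")
    trainer.fit(model, datamodule=dm)
    trainer.test(model, datamodule=dm)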
@ -1,3 +1,2 @@
|
|||
# validation accelerator connectors
|
||||
lightning-habana >=1.2.0, <1.3.0
|
||||
lightning-graphcore >=0.1.0, <0.2.0
|
||||
|
|
|
@ -13,6 +13,5 @@
|
|||
# limitations under the License.
|
||||
import lightning.pytorch._graveyard._torchmetrics
|
||||
import lightning.pytorch._graveyard.hpu
|
||||
import lightning.pytorch._graveyard.ipu
|
||||
import lightning.pytorch._graveyard.precision
|
||||
import lightning.pytorch._graveyard.tpu # noqa: F401
|
||||
|
|
|
@ -1,52 +0,0 @@
|
|||
import sys
|
||||
from typing import Any
|
||||
|
||||
import lightning.pytorch as pl
|
||||
|
||||
|
||||
def _patch_sys_modules() -> None:
|
||||
self = sys.modules[__name__]
|
||||
sys.modules["lightning.pytorch.accelerators.ipu"] = self
|
||||
sys.modules["lightning.pytorch.strategies.ipu"] = self
|
||||
sys.modules["lightning.pytorch.plugins.precision.ipu"] = self
|
||||
|
||||
|
||||
class IPUAccelerator:
|
||||
def __init__(self, *_: Any, **__: Any) -> None:
|
||||
raise NotImplementedError(
|
||||
"The `IPUAccelerator` class has been moved to an external package."
|
||||
" Install the extension package as `pip install lightning-graphcore`"
|
||||
" and import with `from lightning_graphcore import IPUAccelerator`."
|
||||
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
|
||||
)
|
||||
|
||||
|
||||
class IPUStrategy:
|
||||
def __init__(self, *_: Any, **__: Any) -> None:
|
||||
raise NotImplementedError(
|
||||
"The `IPUStrategy` class has been moved to an external package."
|
||||
" Install the extension package as `pip install lightning-graphcore`"
|
||||
" and import with `from lightning_graphcore import IPUStrategy`."
|
||||
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
|
||||
)
|
||||
|
||||
|
||||
class IPUPrecisionPlugin:
|
||||
def __init__(self, *_: Any, **__: Any) -> None:
|
||||
raise NotImplementedError(
|
||||
"The `IPUPrecisionPlugin` class has been moved to an external package."
|
||||
" Install the extension package as `pip install lightning-graphcore`"
|
||||
" and import with `from lightning_graphcore import IPUPrecisionPlugin`."
|
||||
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
|
||||
)
|
||||
|
||||
|
||||
def _patch_classes() -> None:
|
||||
setattr(pl.accelerators, "IPUAccelerator", IPUAccelerator)
|
||||
setattr(pl.strategies, "IPUStrategy", IPUStrategy)
|
||||
setattr(pl.plugins, "IPUPrecisionPlugin", IPUPrecisionPlugin)
|
||||
setattr(pl.plugins.precision, "IPUPrecisionPlugin", IPUPrecisionPlugin)
|
||||
|
||||
|
||||
_patch_sys_modules()
|
||||
_patch_classes()
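
The removed shim above follows a general pattern for relocating classes to an external package: alias the old module paths in sys.modules and replace the classes with stubs that point at the new home. A generic sketch of that pattern, with hypothetical names:

    import sys
    from typing import Any


    class MovedClass:  # hypothetical stand-in for IPUAccelerator and friends
        def __init__(self, *_: Any, **__: Any) -> None:
            raise NotImplementedError(
                "`MovedClass` has been moved to an external package."
                " Install it with `pip install new-home-package` and import it"
                " with `from new_home import MovedClass`."
            )


    def _patch_sys_modules() -> None:
        # keep the old import path resolvable; the parent package must still exist
        sys.modules["old_package.moved_module"] = sys.modules[__name__]


    _patch_sys_modules()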
@ -601,10 +601,6 @@ class DataHooks:
|
|||
batch = super().transfer_batch_to_device(batch, device, dataloader_idx)
|
||||
return batch
|
||||
|
||||
Raises:
|
||||
MisconfigurationException:
|
||||
If using IPUs, ``Trainer(accelerator='ipu')``.
|
||||
|
||||
See Also:
|
||||
- :meth:`move_data_to_device`
|
||||
- :meth:`apply_to_collection`
|
||||
|
@ -661,10 +657,6 @@ class DataHooks:
|
|||
batch['x'] = gpu_transforms(batch['x'])
|
||||
return batch
|
||||
|
||||
Raises:
|
||||
MisconfigurationException:
|
||||
If using IPUs, ``Trainer(accelerator='ipu')``.
|
||||
|
||||
See Also:
|
||||
- :meth:`on_before_batch_transfer`
|
||||
- :meth:`transfer_batch_to_device`
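
For reference, a minimal sketch of overriding the two hooks whose `Raises:` sections are removed above (signatures per the surrounding docstrings; the dict handling is an illustrative assumption):

    from lightning.pytorch import LightningModule


    class MyModule(LightningModule):
        def transfer_batch_to_device(self, batch, device, dataloader_idx):
            # move a custom batch structure manually, falling back to the default
            if isinstance(batch, dict):
                return {key: value.to(device) for key, value in batch.items()}
            return super().transfer_batch_to_device(batch, device, dataloader_idx)

        def on_after_batch_transfer(self, batch, dataloader_idx):
            # run device-side transforms once the batch is on the accelerator
            if isinstance(batch, dict) and "x" in batch:
                batch["x"] = batch["x"].float() / 255.0
            return batch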
@ -16,7 +16,6 @@ import lightning.pytorch as pl
|
|||
from lightning.fabric.utilities.warnings import PossibleUserWarning
|
||||
from lightning.pytorch.trainer.states import TrainerFn
|
||||
from lightning.pytorch.utilities.exceptions import MisconfigurationException
|
||||
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable
|
||||
from lightning.pytorch.utilities.model_helpers import is_overridden
|
||||
from lightning.pytorch.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn
|
||||
from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
|
||||
|
@ -43,10 +42,7 @@ def _verify_loop_configurations(trainer: "pl.Trainer") -> None:
|
|||
elif trainer.state.fn == TrainerFn.PREDICTING:
|
||||
__verify_eval_loop_configuration(model, "predict")
|
||||
|
||||
__verify_batch_transfer_support(trainer)
|
||||
|
||||
__verify_configure_model_configuration(model)
|
||||
|
||||
__warn_dataloader_iter_limitations(model)
|
||||
|
||||
|
||||
|
@ -120,22 +116,6 @@ def __verify_eval_loop_configuration(model: "pl.LightningModule", stage: str) ->
|
|||
)
|
||||
|
||||
|
||||
def __verify_batch_transfer_support(trainer: "pl.Trainer") -> None:
|
||||
batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer")
|
||||
datahook_selector = trainer._data_connector._datahook_selector
|
||||
assert datahook_selector is not None
|
||||
for hook in batch_transfer_hooks:
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
# TODO: This code could be done in a hook in the IPUAccelerator as it's a simple error check
|
||||
# through the Trainer. It doesn't need to stay in Lightning
|
||||
if isinstance(trainer.accelerator, IPUAccelerator) and (
|
||||
is_overridden(hook, datahook_selector.model) or is_overridden(hook, datahook_selector.datamodule)
|
||||
):
|
||||
raise MisconfigurationException(f"Overriding `{hook}` is not supported with IPUs.")
|
||||
|
||||
|
||||
def __verify_manual_optimization_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None:
|
||||
if model.automatic_optimization:
|
||||
return
|
||||
|
|
|
@ -64,7 +64,6 @@ from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES
|
|||
from lightning.pytorch.utilities.exceptions import MisconfigurationException
|
||||
from lightning.pytorch.utilities.imports import (
|
||||
_LIGHTNING_COLOSSALAI_AVAILABLE,
|
||||
_graphcore_available_and_importable,
|
||||
_habana_available_and_importable,
|
||||
)
|
||||
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn
|
||||
|
@ -338,11 +337,6 @@ class _AcceleratorConnector:
|
|||
"""Choose the accelerator type (str) based on availability."""
|
||||
if XLAAccelerator.is_available():
|
||||
return "tpu"
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
if IPUAccelerator.is_available():
|
||||
return "ipu"
|
||||
if _habana_available_and_importable():
|
||||
from lightning_habana import HPUAccelerator
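
With the Graphcore branch removed, `accelerator="auto"` now falls through TPU and HPU detection to the remaining backends and finally CPU. A quick check:

    from lightning.pytorch import Trainer

    # resolves to whatever hardware is detected (TPU, HPU, MPS, CUDA) or falls back to CPU
    trainer = Trainer(accelerator="auto", devices="auto")
    print(type(trainer.accelerator).__name__)  # e.g. "CPUAccelerator" on a plain machine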
@ -420,16 +414,6 @@ class _AcceleratorConnector:
|
|||
return LightningEnvironment()
|
||||
|
||||
def _choose_strategy(self) -> Union[Strategy, str]:
|
||||
if self._accelerator_flag == "ipu":
|
||||
if not _graphcore_available_and_importable():
|
||||
raise ImportError(
|
||||
"You have passed `accelerator='ipu'` but the IPU integration is not installed."
|
||||
" Please run `pip install lightning-graphcore` or check out"
|
||||
" https://github.com/Lightning-AI/lightning-Graphcore for instructions"
|
||||
)
|
||||
from lightning_graphcore import IPUStrategy
|
||||
|
||||
return IPUStrategy.strategy_name
|
||||
if self._accelerator_flag == "hpu":
|
||||
if not _habana_available_and_importable():
|
||||
raise ImportError(
|
||||
|
@ -500,16 +484,6 @@ class _AcceleratorConnector:
|
|||
if isinstance(self._precision_plugin_flag, Precision):
|
||||
return self._precision_plugin_flag
|
||||
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator, IPUPrecision
|
||||
|
||||
# TODO: For the strategies that have a fixed precision class, we don't really need this logic
|
||||
# in the accelerator. Since the strategy owns the precision plugin, the strategy.precision_plugin
|
||||
# could be a no-op and then we wouldn't need this.
|
||||
|
||||
if isinstance(self.accelerator, IPUAccelerator):
|
||||
return IPUPrecision(self._precision_flag)
|
||||
|
||||
if _habana_available_and_importable():
|
||||
from lightning_habana import HPUAccelerator, HPUPrecisionPlugin
|
||||
|
||||
|
@ -691,12 +665,3 @@ def _register_external_accelerators_and_strategies() -> None:
|
|||
HPUParallelStrategy.register_strategies(StrategyRegistry)
|
||||
if "hpu_single" not in StrategyRegistry:
|
||||
SingleHPUStrategy.register_strategies(StrategyRegistry)
|
||||
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator, IPUStrategy
|
||||
|
||||
# TODO: Prevent registering multiple times
|
||||
if "ipu" not in AcceleratorRegistry:
|
||||
IPUAccelerator.register_accelerators(AcceleratorRegistry)
|
||||
if "ipu_strategy" not in StrategyRegistry:
|
||||
IPUStrategy.register_strategies(StrategyRegistry)
|
||||
|
|
|
@ -34,7 +34,6 @@ from lightning.pytorch.trainer.states import RunningStage, TrainerFn
|
|||
from lightning.pytorch.utilities.combined_loader import CombinedLoader
|
||||
from lightning.pytorch.utilities.data import _is_dataloader_shuffled, _update_dataloader
|
||||
from lightning.pytorch.utilities.exceptions import MisconfigurationException
|
||||
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable
|
||||
from lightning.pytorch.utilities.model_helpers import is_overridden
|
||||
from lightning.pytorch.utilities.rank_zero import WarningCache, rank_zero_warn
|
||||
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
|
||||
|
@ -165,19 +164,11 @@ class _DataConnector:
|
|||
datamodule.trainer = trainer
|
||||
|
||||
def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool:
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
# `DistributedSampler` is never used with `poptorch.DataLoader`
|
||||
is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator)
|
||||
else:
|
||||
is_ipu = False
|
||||
return (
|
||||
self.trainer._accelerator_connector.use_distributed_sampler
|
||||
and self.trainer._accelerator_connector.is_distributed
|
||||
and not isinstance(dataloader.sampler, DistributedSampler)
|
||||
and not has_iterable_dataset(dataloader)
|
||||
and not is_ipu
|
||||
)
|
||||
|
||||
def _prepare_dataloader(self, dataloader: object, shuffle: bool, mode: RunningStage) -> object:
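
As context for `_requires_distributed_sampler`: when the check passes, Lightning effectively rebuilds the loader around a DistributedSampler. A simplified sketch (the real `_update_dataloader` preserves many more loader attributes):

    from torch.utils.data import DataLoader, DistributedSampler


    def _with_distributed_sampler(dataloader: DataLoader, shuffle: bool) -> DataLoader:
        # assumes torch.distributed is already initialized
        sampler = DistributedSampler(dataloader.dataset, shuffle=shuffle)
        return DataLoader(
            dataloader.dataset,
            batch_size=dataloader.batch_size,
            sampler=sampler,
            num_workers=dataloader.num_workers,
            collate_fn=dataloader.collate_fn,
        )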
@ -190,18 +181,9 @@ class _DataConnector:
|
|||
# don't do anything if it's not a dataloader
|
||||
if not isinstance(dataloader, DataLoader):
|
||||
return dataloader
|
||||
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
# IPUs use a custom `poptorch.DataLoader` which we might need to convert to
|
||||
is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator)
|
||||
else:
|
||||
is_ipu = False
|
||||
if (
|
||||
self._requires_distributed_sampler(dataloader) # sets the distributed sampler
|
||||
or mode == RunningStage.PREDICTING # to track indices for the predictions
|
||||
or is_ipu
|
||||
):
|
||||
sampler = self._resolve_sampler(dataloader, shuffle=shuffle, mode=mode)
|
||||
return _update_dataloader(dataloader, sampler, mode=mode)
|
||||
|
|
|
@ -28,7 +28,7 @@ from lightning.pytorch.profilers import (
|
|||
XLAProfiler,
|
||||
)
|
||||
from lightning.pytorch.utilities.exceptions import MisconfigurationException
|
||||
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable, _habana_available_and_importable
|
||||
from lightning.pytorch.utilities.imports import _habana_available_and_importable
|
||||
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn
|
||||
|
||||
|
||||
|
@ -158,16 +158,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None:
|
|||
num_tpu_cores = trainer.num_devices if isinstance(trainer.accelerator, XLAAccelerator) else 0
|
||||
rank_zero_info(f"TPU available: {XLAAccelerator.is_available()}, using: {num_tpu_cores} TPU cores")
|
||||
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
num_ipus = trainer.num_devices if isinstance(trainer.accelerator, IPUAccelerator) else 0
|
||||
ipu_available = IPUAccelerator.is_available()
|
||||
else:
|
||||
num_ipus = 0
|
||||
ipu_available = False
|
||||
rank_zero_info(f"IPU available: {ipu_available}, using: {num_ipus} IPUs")
|
||||
|
||||
if _habana_available_and_importable():
|
||||
from lightning_habana import HPUAccelerator
|
||||
|
||||
|
@ -192,12 +182,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None:
|
|||
if XLAAccelerator.is_available() and not isinstance(trainer.accelerator, XLAAccelerator):
|
||||
rank_zero_warn("TPU available but not used. You can set it by doing `Trainer(accelerator='tpu')`.")
|
||||
|
||||
if _graphcore_available_and_importable():
|
||||
from lightning_graphcore import IPUAccelerator
|
||||
|
||||
if IPUAccelerator.is_available() and not isinstance(trainer.accelerator, IPUAccelerator):
|
||||
rank_zero_warn("IPU available but not used. You can set it by doing `Trainer(accelerator='ipu')`.")
|
||||
|
||||
if _habana_available_and_importable():
|
||||
from lightning_habana import HPUAccelerator
|
||||
|
||||
|
|
|
@ -136,7 +136,7 @@ class Trainer:
|
|||
r"""Customize every aspect of training via flags.
|
||||
|
||||
Args:
|
||||
accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "ipu", "hpu", "mps", "auto")
|
||||
accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto")
|
||||
as well as custom accelerator instances.
|
||||
|
||||
strategy: Supports different training strategies with aliases as well custom strategies.
|
||||
|
@ -151,7 +151,7 @@ class Trainer:
|
|||
|
||||
precision: Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'),
|
||||
16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed').
|
||||
Can be used on CPU, GPU, TPUs, HPUs or IPUs.
|
||||
Can be used on CPU, GPU, TPUs, or HPUs.
|
||||
Default: ``'32-true'``.
|
||||
|
||||
logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses
|
||||
|
|
|
@ -41,15 +41,6 @@ def _try_import_module(module_name: str) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
_LIGHTNING_GRAPHCORE_AVAILABLE = RequirementCache("lightning-graphcore>=0.1.0")
|
||||
|
||||
|
||||
def _graphcore_available_and_importable() -> bool:
|
||||
# This is defined as a function instead of a constant to avoid circular imports, because `lightning_graphcore`
|
||||
# also imports Lightning
|
||||
return bool(_LIGHTNING_GRAPHCORE_AVAILABLE) and _try_import_module("lightning_graphcore")
|
||||
|
||||
|
||||
_LIGHTNING_HABANA_AVAILABLE = RequirementCache("lightning-habana>=1.2.0")
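
The `RequirementCache` plus lazy-import pattern above generalizes to any optional integration. A hedged sketch with a hypothetical package name, where `_try_import_module` is assumed to wrap `importlib`:

    from importlib import import_module

    from lightning_utilities.core.imports import RequirementCache

    _MY_EXTENSION_AVAILABLE = RequirementCache("my-extension>=0.1.0")  # hypothetical package


    def _try_import_module(module_name: str) -> bool:
        # assumed behaviour of the helper referenced in the hunk above
        try:
            import_module(module_name)
            return True
        except ImportError:
            return False


    def _my_extension_available_and_importable() -> bool:
        # a function rather than a constant so the check stays lazy and avoids circular imports
        return bool(_MY_EXTENSION_AVAILABLE) and _try_import_module("my_extension")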
@ -62,7 +62,7 @@ Lightning forces the following structure to your code which makes it reusable an
|
|||
- Non-essential research code (logging, etc... this goes in Callbacks).
|
||||
- Data (use PyTorch DataLoaders or organize them into a LightningDataModule).
|
||||
|
||||
Once you do this, you can train on multiple-GPUs, TPUs, CPUs, IPUs, HPUs and even in 16-bit precision without changing your code!
|
||||
Once you do this, you can train on multiple-GPUs, TPUs, CPUs, HPUs and even in 16-bit precision without changing your code!
|
||||
|
||||
[Get started in just 15 minutes](https://lightning.ai/docs/pytorch/latest/starter/introduction.html)
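
The claim above comes down to Trainer flags. A minimal sketch, assuming a `LitModel` LightningModule and a `train_loader` DataLoader already exist and a CUDA GPU is available for the 16-bit example:

    import lightning.pytorch as pl

    trainer = pl.Trainer(accelerator="gpu", devices=2, precision="16-mixed", max_epochs=3)
    trainer.fit(LitModel(), train_dataloaders=train_loader)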
@ -63,7 +63,6 @@ def restore_env_variables():
|
|||
"PL_GLOBAL_SEED",
|
||||
"PL_SEED_WORKERS",
|
||||
"RANK", # set by DeepSpeed
|
||||
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
|
||||
"CUDA_MODULE_LOADING", # leaked by PyTorch
|
||||
"CRC32C_SW_MODE", # set by tensorboardX
|
||||
"OMP_NUM_THREADS", # set by our launchers
|
||||
|
|
|
@ -83,7 +83,6 @@ def restore_env_variables():
|
|||
"WANDB_REQUIRE_SERVICE",
|
||||
"WANDB_SERVICE",
|
||||
"RANK", # set by DeepSpeed
|
||||
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
|
||||
"CUDA_MODULE_LOADING", # leaked by PyTorch
|
||||
"KMP_INIT_AT_FORK", # leaked by PyTorch
|
||||
"KMP_DUPLICATE_LIB_OK", # leaked by PyTorch
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
from importlib import import_module
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("import_path", "name"),
|
||||
[
|
||||
("lightning.pytorch.accelerators", "IPUAccelerator"),
|
||||
("lightning.pytorch.accelerators.ipu", "IPUAccelerator"),
|
||||
("lightning.pytorch.strategies", "IPUStrategy"),
|
||||
("lightning.pytorch.strategies.ipu", "IPUStrategy"),
|
||||
("lightning.pytorch.plugins.precision", "IPUPrecisionPlugin"),
|
||||
("lightning.pytorch.plugins.precision.ipu", "IPUPrecisionPlugin"),
|
||||
],
|
||||
)
|
||||
def test_extracted_ipu(import_path, name):
|
||||
module = import_module(import_path)
|
||||
cls = getattr(module, name)
|
||||
with pytest.raises(NotImplementedError, match=f"{name}` class has been moved to an external package.*"):
|
||||
cls()
|
|
@ -58,7 +58,6 @@ from lightning.pytorch.trainer.connectors.accelerator_connector import _Accelera
|
|||
from lightning.pytorch.utilities.exceptions import MisconfigurationException
|
||||
from lightning.pytorch.utilities.imports import (
|
||||
_LIGHTNING_HABANA_AVAILABLE,
|
||||
_graphcore_available_and_importable,
|
||||
)
|
||||
from lightning_utilities.core.imports import package_available
|
||||
|
||||
|
@ -580,16 +579,6 @@ def test_unsupported_tpu_choice(xla_available, tpu_available):
|
|||
Trainer(accelerator="tpu", precision="16-true", strategy="ddp")
|
||||
|
||||
|
||||
def mock_ipu_available(monkeypatch, value=True):
|
||||
# TODO: this isn't really mocking. it should be implemented and used as `mock_hpu_count`
|
||||
try:
|
||||
import lightning_graphcore
|
||||
except ModuleNotFoundError:
|
||||
return
|
||||
monkeypatch.setattr(lightning_graphcore.accelerator, "_IPU_AVAILABLE", value)
|
||||
monkeypatch.setattr(lightning_graphcore.strategy, "_IPU_AVAILABLE", value)
|
||||
|
||||
|
||||
if _LIGHTNING_HABANA_AVAILABLE:
|
||||
from lightning_habana import HPUAccelerator, HPUParallelStrategy, SingleHPUStrategy
|
||||
else:
|
||||
|
@ -657,7 +646,6 @@ def mock_hpu_count(monkeypatch, n=1):
|
|||
|
||||
def test_devices_auto_choice_cpu(monkeypatch, cuda_count_0):
|
||||
mock_hpu_count(monkeypatch, 0)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
mock_xla_available(monkeypatch, False)
|
||||
trainer = Trainer(accelerator="auto", devices="auto")
|
||||
assert trainer.num_devices == 1
|
||||
|
@ -915,7 +903,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
trainer = Trainer()
|
||||
assert isinstance(trainer.accelerator, CPUAccelerator)
|
||||
assert isinstance(trainer.strategy, SingleDeviceStrategy)
|
||||
|
@ -927,7 +914,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 1)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
trainer = Trainer()
|
||||
assert isinstance(trainer.accelerator, CUDAAccelerator)
|
||||
assert isinstance(trainer.strategy, SingleDeviceStrategy)
|
||||
|
@ -939,7 +925,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 4)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
trainer = Trainer()
|
||||
assert isinstance(trainer.accelerator, CUDAAccelerator)
|
||||
assert isinstance(trainer.strategy, (SingleDeviceStrategy if is_interactive else DDPStrategy))
|
||||
|
@ -955,7 +940,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 1)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, MPSAccelerator)
|
||||
assert isinstance(connector.strategy, SingleDeviceStrategy)
|
||||
|
@ -965,7 +949,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
with monkeypatch.context():
|
||||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
_mock_tpu_available(True)
|
||||
monkeypatch.setattr(lightning.pytorch.accelerators.XLAAccelerator, "auto_device_count", lambda *_: 1)
|
||||
monkeypatch.setattr(torch, "device", DeviceMock())
|
||||
|
@ -982,7 +965,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
_mock_tpu_available(True)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, XLAAccelerator)
|
||||
assert isinstance(connector.strategy, XLAStrategy)
|
||||
|
@ -991,28 +973,11 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
assert connector.strategy._start_method == "fork"
|
||||
assert connector.strategy.launcher.is_interactive_compatible
|
||||
|
||||
# Single/Multi IPU: strategy is the same
|
||||
if _graphcore_available_and_importable():
|
||||
with monkeypatch.context():
|
||||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, True)
|
||||
from lightning_graphcore import IPUAccelerator, IPUStrategy
|
||||
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, IPUAccelerator)
|
||||
assert isinstance(connector.strategy, IPUStrategy)
|
||||
assert connector._devices_flag == 4
|
||||
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
|
||||
assert connector.strategy.launcher is None
|
||||
|
||||
# Single HPU
|
||||
with monkeypatch.context():
|
||||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
mock_hpu_count(monkeypatch, 1)
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, HPUAccelerator)
|
||||
|
@ -1029,7 +994,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 0)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
mock_tpu_available(monkeypatch, False)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
mock_hpu_count(monkeypatch, 8)
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, HPUAccelerator)
|
||||
|
@ -1047,7 +1011,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
|
|||
mock_cuda_count(monkeypatch, 2)
|
||||
mock_mps_count(monkeypatch, 0)
|
||||
_mock_tpu_available(True)
|
||||
mock_ipu_available(monkeypatch, False)
|
||||
connector = _AcceleratorConnector()
|
||||
assert isinstance(connector.accelerator, XLAAccelerator)
|
||||
assert isinstance(connector.strategy, XLAStrategy)
|
||||
|
|