Remove the Graphcore IPU integration (#19405)

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
awaelchli 2024-02-12 22:16:02 +01:00 committed by GitHub
parent 8d4768f2ae
commit e950bb4828
42 changed files with 130 additions and 596 deletions


@ -27,7 +27,6 @@ Brief description of all our automation tools used for boosting development perf
- GPU: 2 x NVIDIA RTX 3090
- TPU: [Google TPU v4-8](https://cloud.google.com/tpu/docs)
- IPU: [Colossus MK1 IPU](https://www.graphcore.ai/products/ipu)
- To check which versions of Python or PyTorch are used for testing in our CI, see the corresponding workflow files or checkgroup config file at [`.github/checkgroup.yml`](../checkgroup.yml).

.gitignore

@ -22,9 +22,7 @@ docs/source-pytorch/notebooks
docs/source-pytorch/_static/images/course_UvA-DL
docs/source-pytorch/_static/images/lightning_examples
docs/source-pytorch/_static/fetched-s3-assets
docs/source-pytorch/_static/images/ipu/
docs/source-pytorch/integrations/hpu
docs/source-pytorch/integrations/ipu
docs/source-fabric/*/generated


@ -53,7 +53,6 @@ And that's it!
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
| Name | Type | Params | In sizes | Out sizes
------------------------------------------------------------------


@ -20,7 +20,7 @@ Training on Accelerators
**Use when:** Whenever possible!
With Lightning, running on GPUs, TPUs, IPUs on multiple nodes is a simple switch of a flag.
With Lightning, running on GPUs, TPUs, HPUs on multiple nodes is a simple switch of a flag.
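To make that claim concrete, here is a minimal sketch (an editor's illustration, not part of this diff) of the flag switch; ``BoringModel`` is just a stand-in module from Lightning's demos.

from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

model = BoringModel()  # placeholder LightningModule

# The training code stays identical; only the accelerator flags change.
trainer = Trainer(accelerator="gpu", devices=8, num_nodes=2)  # multi-node GPU training
# trainer = Trainer(accelerator="tpu", devices=8)             # same code on TPUs
# trainer = Trainer(accelerator="hpu", devices=8)             # HPUs, with lightning-habana installed

trainer.fit(model)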
GPU Training
============


@ -17,7 +17,6 @@
../advanced/model_parallel
Train on single or multiple GPUs <../accelerators/gpu>
Train on single or multiple HPUs <../integrations/hpu/index>
Train on single or multiple IPUs <../integrations/ipu/index>
Train on single or multiple TPUs <../accelerators/tpu>
Train on MPS <../accelerators/mps>
Use a pretrained model <../advanced/pretrained>
@ -168,13 +167,6 @@ How-to Guides
:col_css: col-md-4
:height: 180
.. displayitem::
:header: Train on single or multiple IPUs
:description: Train models faster with IPU accelerators
:button_link: ../integrations/ipu/index.html
:col_css: col-md-4
:height: 180
.. displayitem::
:header: Train on single or multiple TPUs
:description: Train models faster with TPU accelerators


@ -103,31 +103,26 @@ Precision support by accelerator
********************************
.. list-table:: Precision with Accelerators
:widths: 20 20 20 20 20
:widths: 20 20 20 20
:header-rows: 1
* - Precision
- CPU
- GPU
- TPU
- IPU
* - 16 Mixed
- No
- Yes
- No
- Yes
* - BFloat16 Mixed
- Yes
- Yes
- Yes
- No
* - 32 True
- Yes
- Yes
- Yes
- Yes
* - 64 True
- Yes
- Yes
- No
- No
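Read as Trainer settings, the table above maps to precision flags roughly like this (an editor's sketch, not part of this diff; device counts are placeholders):

from lightning.pytorch import Trainer

# 16-bit mixed precision: GPU only, per the table above
trainer = Trainer(accelerator="gpu", devices=1, precision="16-mixed")

# bfloat16 mixed precision: supported on CPU, GPU and TPU
trainer = Trainer(accelerator="tpu", devices=8, precision="bf16-mixed")

# 64-bit (double) precision: CPU and GPU only
trainer = Trainer(accelerator="cpu", precision="64-true")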


@ -175,7 +175,7 @@ Trainer flags
accelerator
^^^^^^^^^^^
Supports passing different accelerator types (``"cpu", "gpu", "tpu", "ipu", "auto"``)
Supports passing different accelerator types (``"cpu", "gpu", "tpu", "hpu", "auto"``)
as well as custom accelerator instances.
.. code-block:: python
@ -393,9 +393,6 @@ Number of devices to train on (``int``), which devices to train on (``list`` or
# Training with TPU Accelerator using 8 tpu cores
trainer = Trainer(devices="auto", accelerator="tpu")
# Training with IPU Accelerator using 4 ipus
trainer = Trainer(devices="auto", accelerator="ipu")
.. note::
If the ``devices`` flag is not defined, it will assume ``devices`` to be ``"auto"`` and fetch the ``auto_device_count``
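For completeness, a short sketch of other valid ``devices`` values (an editor's illustration, not part of this diff):

from lightning.pytorch import Trainer

# Let Lightning pick both the accelerator and the device count
trainer = Trainer(devices="auto", accelerator="auto")

# Explicit choices: an int, a list of device indices, or "auto"
trainer = Trainer(devices=2, accelerator="gpu")       # first two GPUs
trainer = Trainer(devices=[1, 3], accelerator="gpu")  # GPUs with indices 1 and 3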


@ -133,13 +133,6 @@ Customize and extend Lightning for things like custom hardware or distributed st
:button_link: integrations/hpu/index.html
:height: 100
.. displayitem::
:header: Train on single or multiple IPUs
:description: Train models faster with IPUs.
:col_css: col-md-12
:button_link: integrations/ipu/index.html
:height: 100
.. displayitem::
:header: Train on single or multiple TPUs
:description: Train models faster with TPUs.


@ -94,18 +94,6 @@ assist_local.AssistantCLI.pull_docs_files(
target_dir="docs/source-pytorch/integrations/hpu",
checkout="refs/tags/1.3.0",
)
assist_local.AssistantCLI.pull_docs_files(
gh_user_repo="Lightning-AI/lightning-Graphcore",
target_dir="docs/source-pytorch/integrations/ipu",
checkout="refs/tags/v0.1.0",
as_orphan=True, # todo: this can be dropped after new IPU release
)
# the IPU also need one image
URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Graphcore/v0.1.0/docs/source"
for img in ["_static/images/ipu/profiler.png"]:
img_ = os.path.join(_PATH_HERE, "integrations", "ipu", img)
os.makedirs(os.path.dirname(img_), exist_ok=True)
urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_)
# Copy strategies docs as single pages
assist_local.AssistantCLI.pull_docs_files(
@ -340,7 +328,6 @@ intersphinx_mapping = {
"numpy": ("https://numpy.org/doc/stable/", None),
"PIL": ("https://pillow.readthedocs.io/en/stable/", None),
"torchmetrics": ("https://torchmetrics.readthedocs.io/en/stable/", None),
"graphcore": ("https://docs.graphcore.ai/en/latest/", None),
"lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
"tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
# needed for referencing App from lightning scope


@ -190,34 +190,26 @@ Configure all aspects of Lightning for advanced usecases.
:tag: advanced
.. displayitem::
:header: Level 18: Explore IPUs
:description: Explore Intelligence Processing Unit (IPU) for model scaling.
:header: Level 18: Explore HPUs
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
:col_css: col-md-6
:button_link: levels/advanced_level_19.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 19: Explore HPUs
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
:header: Level 19: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: levels/advanced_level_20.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 20: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: levels/advanced_level_21.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 21: Train models with billions of parameters
:header: Level 20: Train models with billions of parameters
:description: Scale GPU training to models with billions of parameters
:col_css: col-md-6
:button_link: levels/advanced_level_22.html
:button_link: levels/advanced_level_21.html
:height: 150
:tag: advanced
@ -240,7 +232,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
.. Add callout items below this line
.. displayitem::
:header: Level 22: Extend the Lightning CLI
:header: Level 21: Extend the Lightning CLI
:description: Extend the functionality of the Lightning CLI.
:col_css: col-md-6
:button_link: levels/expert_level_23.html
@ -248,7 +240,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
:tag: expert
.. displayitem::
:header: Level 23: Integrate a custom cluster
:header: Level 22: Integrate a custom cluster
:description: Integrate a custom cluster into Lightning.
:col_css: col-md-6
:button_link: levels/expert_level_24.html
@ -256,7 +248,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
:tag: expert
.. displayitem::
:header: Level 24: Make your own profiler
:header: Level 23: Make your own profiler
:description: Make your own profiler.
:col_css: col-md-6
:button_link: tuning/profiler_expert.html
@ -264,10 +256,10 @@ Customize and extend Lightning for things like custom hardware or distributed st
:tag: expert
.. displayitem::
:header: Level 25: Add a new accelerator or Strategy
:header: Level 24: Add a new accelerator or Strategy
:description: Integrate a new accelerator or distributed strategy.
:col_css: col-md-6
:button_link: levels/expert_level_27.html
:button_link: levels/expert_level_25.html
:height: 150
:tag: expert


@ -4,13 +4,12 @@
Accelerator
###########
The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, IPUs, MPS, ...).
The Accelerator connects a Lightning Trainer to arbitrary hardware (CPUs, GPUs, TPUs, HPUs, MPS, ...).
Currently there are accelerators for:
- CPU
- :doc:`GPU <../accelerators/gpu>`
- :doc:`TPU <../accelerators/tpu>`
- :doc:`IPU <../integrations/ipu/index>`
- :doc:`HPU <../integrations/hpu/index>`
- :doc:`MPS <../accelerators/mps>`


@ -57,9 +57,6 @@ Here are some examples:
# Training with the DDP Spawn strategy on 8 TPU cores
trainer = Trainer(strategy="ddp_spawn", accelerator="tpu", devices=8)
# Training with the default IPU strategy on 8 IPUs
trainer = Trainer(accelerator="ipu", devices=8)
The below table lists all relevant strategies available in Lightning with their corresponding short-hand name:
.. list-table:: Strategy Classes and Nicknames
@ -87,9 +84,6 @@ The below table lists all relevant strategies available in Lightning with their
* - hpu_single
- ``SingleHPUStrategy``
- Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>`
* - ipu_strategy
- ``IPUStrategy``
- Plugin for training on IPU devices. :doc:`Learn more. <../integrations/ipu/index>`
* - xla
- :class:`~lightning.pytorch.strategies.XLAStrategy`
- Strategy for training on multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` method. :doc:`Learn more. <../accelerators/tpu>`
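As a small usage sketch (an editor's illustration, not part of this diff), a strategy can be selected either by its short-hand name from the table or by passing a configured instance:

from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DDPStrategy

# Select a strategy by its registered short-hand name ...
trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)

# ... or pass a strategy instance to customize its behavior
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=True), accelerator="gpu", devices=4)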


@ -20,7 +20,6 @@
Half precision <../common/precision>
HPU <../integrations/hpu/index>
Inference <../deploy/production_intermediate>
IPU <../integrations/ipu/index>
Lightning CLI <../cli/lightning_cli>
LightningDataModule <../data/datamodule>
LightningModule <../common/lightning_module>
@ -177,13 +176,6 @@ Glossary
:button_link: ../deploy/production_intermediate.html
:height: 100
.. displayitem::
:header: IPU
:description: Graphcore Intelligence Processing Unit for faster training
:col_css: col-md-12
:button_link: ../integrations/ipu/index.html
:height: 100
.. displayitem::
:header: Lightning CLI
:description: A Command-line Interface (CLI) to interact with Lightning code via a terminal


@ -1,48 +0,0 @@
.. _ipu:
Accelerator: IPU training
=========================
.. raw:: html
<div class="display-card-container">
<div class="row">
.. Add callout items below this line
.. displayitem::
:header: Prepare your code (Optional)
:description: Prepare your code to run on any hardware
:col_css: col-md-6
:button_link: accelerator_prepare.html
:height: 150
:tag: basic
.. displayitem::
:header: Basic
:description: Learn the basics of single and multi-IPU training.
:col_css: col-md-6
:button_link: ipu_basic.html
:height: 150
:tag: basic
.. displayitem::
:header: Intermediate
:description: Tune model performance with mixed-precision settings and the performance analyser.
:col_css: col-md-6
:button_link: ipu_intermediate.html
:height: 150
:tag: intermediate
.. displayitem::
:header: Advanced
:description: Learn advanced techniques to customize IPU training for massive models.
:col_css: col-md-6
:button_link: ipu_advanced.html
:height: 150
:tag: advanced
.. raw:: html
</div>
</div>


@ -46,34 +46,26 @@ Configure all aspects of Lightning for advanced usecases.
:tag: advanced
.. displayitem::
:header: Level 18: Explore IPUs
:description: Explore Intelligence Processing Unit (IPU) for model scaling.
:header: Level 18: Explore HPUs
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
:col_css: col-md-6
:button_link: advanced_level_19.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 19: Explore HPUs
:description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
:header: Level 19: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: advanced_level_20.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 20: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: advanced_level_21.html
:height: 150
:tag: advanced
.. displayitem::
:header: Level 21: Train models with billions of parameters
:header: Level 20: Train models with billions of parameters
:description: Scale GPU training to models with billions of parameters
:col_css: col-md-6
:button_link: advanced_level_22.html
:button_link: advanced_level_21.html
:height: 150
:tag: advanced


@ -1,10 +1,10 @@
:orphan:
######################
Level 18: Explore IPUs
Level 18: Explore HPUs
######################
Explore Intelligence Processing Unit (IPU) for model scaling.
Explore Intel Habana Processing Unit (HPU) for model scaling.
----
@ -16,26 +16,18 @@ Explore Intelligence Processing Unit (IPU) for model scaling.
.. Add callout items below this line
.. displayitem::
:header: Prepare your code (Optional)
:description: Prepare your code to run on any hardware.
:col_css: col-md-4
:button_link: ../accelerators/accelerator_prepare.html
:header: Train models on HPUs
:description: Learn the basics of single and multi-HPU core training.
:col_css: col-md-6
:button_link: ../integrations/hpu/basic.html
:height: 150
:tag: basic
.. displayitem::
:header: Train models on IPUs
:description: Learn the basics of single and multi-IPU training.
:col_css: col-md-4
:button_link: ../integrations/ipu/basic.html
:height: 150
:tag: basic
.. displayitem::
:header: Optimize models training on IPUs
:description: Tune model performance with mixed precision and the performance analyser.
:col_css: col-md-4
:button_link: ../integrations/ipu/intermediate.html
:header: Optimize models training on HPUs
:description: Enable state-of-the-art scaling with advanced mixed-precision settings.
:col_css: col-md-6
:button_link: ../integrations/hpu/intermediate.html
:height: 150
:tag: intermediate


@ -1,10 +1,10 @@
:orphan:
######################
Level 19: Explore HPUs
######################
#####################
Level 19: Master TPUs
#####################
Explore Intel Habana Processing Unit (HPU) for model scaling.
Master cloud TPU training with profiling and scaling techniques.
----
@ -16,20 +16,28 @@ Explore Intel Habana Processing Unit (HPU) for model scaling.
.. Add callout items below this line
.. displayitem::
:header: Train models on HPUs
:description: Learn the basics of single and multi-HPU core training.
:col_css: col-md-6
:button_link: ../integrations/hpu/basic.html
:height: 150
:tag: basic
:header: Run on cloud TPUs
:description: Scale massive models using cloud TPUs.
:col_css: col-md-4
:button_link: ../accelerators/tpu_intermediate.html
:height: 180
:tag: intermediate
.. displayitem::
:header: Optimize models training on HPUs
:description: Enable state-of-the-art scaling with advanced mixed-precision settings.
:col_css: col-md-6
:button_link: ../integrations/hpu/intermediate.html
:height: 150
:tag: intermediate
:header: Explore advanced TPU scaling techniques
:description: Dive into XLA and advanced techniques to optimize TPU-powered models.
:col_css: col-md-4
:button_link: ../accelerators/tpu_advanced.html
:height: 180
:tag: advanced
.. displayitem::
:header: Profile TPU code
:description: Learn to profile TPU code.
:col_css: col-md-4
:button_link: ../tuning/profiler_advanced.html
:height: 180
:tag: advanced
.. raw:: html


@ -1,10 +1,10 @@
:orphan:
#####################
Level 20: Master TPUs
#####################
##################################################
Level 20: Train models with billions of parameters
##################################################
Master cloud TPU training with profiling and scaling techniques.
Scale to billions of parameters with multiple distributed strategies.
----
@ -16,27 +16,19 @@ Master cloud TPU training with profiling and scaling techniques.
.. Add callout items below this line
.. displayitem::
:header: Run on cloud TPUs
:description: Scale massive models using cloud TPUs.
:col_css: col-md-4
:button_link: ../accelerators/tpu_intermediate.html
:height: 180
:header: Scale with distributed strategies
:description: Learn about different distributed strategies to reach bigger model parameter sizes.
:col_css: col-md-6
:button_link: ../accelerators/gpu_intermediate.html
:height: 150
:tag: intermediate
.. displayitem::
:header: Explore advanced TPU scaling techniques
:description: Dive into XLA and advanced techniques to optimize TPU-powered models.
:col_css: col-md-4
:button_link: ../accelerators/tpu_advanced.html
:height: 180
:tag: advanced
.. displayitem::
:header: Profile TPU code
:description: Learn to profile TPU code.
:col_css: col-md-4
:button_link: ../tuning/profiler_advanced.html
:height: 180
:header: Train models with billions of parameters
:description: Scale to billions of params on GPUs with FSDP or Deepspeed.
:col_css: col-md-6
:button_link: ../advanced/model_parallel.html
:height: 150
:tag: advanced
.. raw:: html


@ -1,37 +0,0 @@
:orphan:
##################################################
Level 21: Train models with billions of parameters
##################################################
Scale to billions of parameters with multiple distributed strategies.
----
.. raw:: html
<div class="display-card-container">
<div class="row">
.. Add callout items below this line
.. displayitem::
:header: Scale with distributed strategies
:description: Learn about different distributed strategies to reach bigger model parameter sizes.
:col_css: col-md-6
:button_link: ../accelerators/gpu_intermediate.html
:height: 150
:tag: intermediate
.. displayitem::
:header: Train models with billions of parameters
:description: Scale to billions of params on GPUs with FSDP or Deepspeed.
:col_css: col-md-6
:button_link: ../advanced/model_parallel.html
:height: 150
:tag: advanced
.. raw:: html
</div>
</div>


@ -14,23 +14,23 @@ Customize and extend Lightning for things like custom hardware or distributed st
.. Add callout items below this line
.. displayitem::
:header: Level 22: Extend the Lightning CLI
:header: Level 21: Extend the Lightning CLI
:description: Extend the functionality of the Lightning CLI.
:col_css: col-md-6
:button_link: expert_level_22.html
:height: 150
:tag: expert
.. displayitem::
:header: Level 22: Integrate a custom cluster
:description: Integrate a custom cluster into Lightning.
:col_css: col-md-6
:button_link: expert_level_23.html
:height: 150
:tag: expert
.. displayitem::
:header: Level 23: Integrate a custom cluster
:description: Integrate a custom cluster into Lightning.
:col_css: col-md-6
:button_link: expert_level_24.html
:height: 150
:tag: expert
.. displayitem::
:header: Level 24: Make your own profiler
:header: Level 23: Make your own profiler
:description: Make your own profiler.
:col_css: col-md-6
:button_link: ../tuning/profiler_expert.html
@ -38,7 +38,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
:tag: expert
.. displayitem::
:header: Level 25: Add a new accelerator or Strategy
:header: Level 24: Add a new accelerator or Strategy
:description: Integrate a new accelerator or distributed strategy.
:col_css: col-md-6
:button_link: expert_level_27.html


@ -0,0 +1,37 @@
:orphan:
##################################
Level 21: Extend the Lightning CLI
##################################
Extend the functionality of the Lightning CLI.
----
.. raw:: html
<div class="display-card-container">
<div class="row">
.. Add callout items below this line
.. displayitem::
:header: Customize configs for complex projects
:description: Learn how to connect complex projects with each Registry.
:col_css: col-md-6
:button_link: ../cli/lightning_cli_advanced_3.html
:height: 150
:tag: expert
.. displayitem::
:header: Extend the Lightning CLI
:description: Customize the Lightning CLI
:col_css: col-md-6
:button_link: ../cli/lightning_cli_expert.html
:height: 150
:tag: expert
.. raw:: html
</div>
</div>


@ -1,8 +1,8 @@
:orphan:
##################################
Level 22: Extend the Lightning CLI
##################################
####################################
Level 22: Integrate a custom cluster
####################################
Extend the functionality of the Lightning CLI.
@ -16,18 +16,10 @@ Extend the functionality of the Lightning CLI.
.. Add callout items below this line
.. displayitem::
:header: Customize configs for complex projects
:description: Learn how to connect complex projects with each Registry.
:header: Integrate your own cluster
:description: Learn how to integrate your own cluster
:col_css: col-md-6
:button_link: ../cli/lightning_cli_advanced_3.html
:height: 150
:tag: expert
.. displayitem::
:header: Extend the Lightning CLI
:description: Customize the Lightning CLI
:col_css: col-md-6
:button_link: ../cli/lightning_cli_expert.html
:button_link: ../clouds/cluster_expert.html
:height: 150
:tag: expert


@ -1,29 +0,0 @@
:orphan:
####################################
Level 23: Integrate a custom cluster
####################################
Extend the functionality of the Lightning CLI.
----
.. raw:: html
<div class="display-card-container">
<div class="row">
.. Add callout items below this line
.. displayitem::
:header: Integrate your own cluster
:description: Learn how to integrate your own cluster
:col_css: col-md-6
:button_link: ../clouds/cluster_expert.html
:height: 150
:tag: expert
.. raw:: html
</div>
</div>


@ -1,7 +1,7 @@
:orphan:
###########################################
Level 25: Add a new accelerator or Strategy
Level 24: Add a new accelerator or Strategy
###########################################
Integrate a new accelerator or distributed strategy.


@ -345,4 +345,4 @@ Here is an example using a closure function.
opt.step(closure=closure)
.. warning::
The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP, IPUs, or DeepSpeed.
The :class:`~torch.optim.LBFGS` optimizer is not supported for AMP or DeepSpeed.
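For reference, a minimal manual-optimization sketch of the closure pattern mentioned above (an editor's illustration, not part of this diff; the layer and loss are placeholders):

import torch
from lightning.pytorch import LightningModule


class ClosureExample(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)
        self.automatic_optimization = False  # we drive the optimizer manually

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()

        def closure():
            out = self.layer(batch)
            loss = torch.nn.functional.mse_loss(out, torch.zeros_like(out))
            opt.zero_grad()
            self.manual_backward(loss)
            return loss

        # Any optimizer that accepts a closure works here; see the warning above
        # about LBFGS in combination with AMP or DeepSpeed.
        opt.step(closure=closure)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)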


@ -110,7 +110,7 @@ If the profiler report becomes too long, you can stream the report to a file:
*************************
Measure accelerator usage
*************************
Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/IPU/HPU).
Another helpful technique to detect bottlenecks is to ensure that you're using the full capacity of your accelerator (GPU/TPU/HPU).
This can be measured with the :class:`~lightning.pytorch.callbacks.device_stats_monitor.DeviceStatsMonitor`:
.. testcode::
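    # Editor's sketch (the original testcode body is not shown in this hunk):
    # attach the callback so device utilization gets logged to the active logger.
    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import DeviceStatsMonitor

    trainer = Trainer(callbacks=[DeviceStatsMonitor()])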


@ -1,82 +0,0 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.demos.mnist_datamodule import MNISTDataModule
from torch.nn import functional as F
class LitClassifier(LightningModule):
def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
super().__init__()
self.save_hyperparameters()
self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)
self.val_outputs = []
self.test_outputs = []
def forward(self, x):
x = x.view(x.size(0), -1)
x = torch.relu(self.l1(x))
return torch.relu(self.l2(x))
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
return F.cross_entropy(y_hat, y)
def validation_step(self, batch, batch_idx):
x, y = batch
probs = self(x)
acc = self.accuracy(probs, y)
self.val_outputs.append(acc)
return acc
def test_step(self, batch, batch_idx):
x, y = batch
logits = self(x)
acc = self.accuracy(logits, y)
self.test_outputs.append(acc)
return acc
def accuracy(self, logits, y):
# currently IPU poptorch doesn't implicitly convert bools to tensors
# hence we use an explicit calculation for accuracy here. Once fixed in poptorch
# we can use the accuracy metric.
return torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
def on_validation_epoch_end(self) -> None:
# since the training step/validation step and test step are run on the IPU device
# we must log the average loss outside the step functions.
self.log("val_acc", torch.stack(self.val_outptus).mean(), prog_bar=True)
self.val_outptus.clear()
def on_test_epoch_end(self) -> None:
self.log("test_acc", torch.stack(self.test_outputs).mean())
self.test_outputs.clear()
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
if __name__ == "__main__":
dm = MNISTDataModule(batch_size=32)
model = LitClassifier()
trainer = Trainer(max_epochs=2, accelerator="ipu", devices=8)
trainer.fit(model, datamodule=dm)
trainer.test(model, datamodule=dm)


@ -1,3 +1,2 @@
# validation accelerator connectors
lightning-habana >=1.2.0, <1.3.0
lightning-graphcore >=0.1.0, <0.2.0


@ -13,6 +13,5 @@
# limitations under the License.
import lightning.pytorch._graveyard._torchmetrics
import lightning.pytorch._graveyard.hpu
import lightning.pytorch._graveyard.ipu
import lightning.pytorch._graveyard.precision
import lightning.pytorch._graveyard.tpu # noqa: F401


@ -1,52 +0,0 @@
import sys
from typing import Any
import lightning.pytorch as pl
def _patch_sys_modules() -> None:
self = sys.modules[__name__]
sys.modules["lightning.pytorch.accelerators.ipu"] = self
sys.modules["lightning.pytorch.strategies.ipu"] = self
sys.modules["lightning.pytorch.plugins.precision.ipu"] = self
class IPUAccelerator:
def __init__(self, *_: Any, **__: Any) -> None:
raise NotImplementedError(
"The `IPUAccelerator` class has been moved to an external package."
" Install the extension package as `pip install lightning-graphcore`"
" and import with `from lightning_graphcore import IPUAccelerator`."
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
)
class IPUStrategy:
def __init__(self, *_: Any, **__: Any) -> None:
raise NotImplementedError(
"The `IPUStrategy` class has been moved to an external package."
" Install the extension package as `pip install lightning-graphcore`"
" and import with `from lightning_graphcore import IPUStrategy`."
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
)
class IPUPrecisionPlugin:
def __init__(self, *_: Any, **__: Any) -> None:
raise NotImplementedError(
"The `IPUPrecisionPlugin` class has been moved to an external package."
" Install the extension package as `pip install lightning-graphcore`"
" and import with `from lightning_graphcore import IPUPrecisionPlugin`."
" Please see: https://github.com/Lightning-AI/lightning-Graphcore for more details."
)
def _patch_classes() -> None:
setattr(pl.accelerators, "IPUAccelerator", IPUAccelerator)
setattr(pl.strategies, "IPUStrategy", IPUStrategy)
setattr(pl.plugins, "IPUPrecisionPlugin", IPUPrecisionPlugin)
setattr(pl.plugins.precision, "IPUPrecisionPlugin", IPUPrecisionPlugin)
_patch_sys_modules()
_patch_classes()


@ -601,10 +601,6 @@ class DataHooks:
batch = super().transfer_batch_to_device(batch, device, dataloader_idx)
return batch
Raises:
MisconfigurationException:
If using IPUs, ``Trainer(accelerator='ipu')``.
See Also:
- :meth:`move_data_to_device`
- :meth:`apply_to_collection`
@ -661,10 +657,6 @@ class DataHooks:
batch['x'] = gpu_transforms(batch['x'])
return batch
Raises:
MisconfigurationException:
If using IPUs, ``Trainer(accelerator='ipu')``.
See Also:
- :meth:`on_before_batch_transfer`
- :meth:`transfer_batch_to_device`
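The hooks documented here can be overridden along these lines (an editor's sketch, not part of this diff; the transforms are placeholders):

from lightning.pytorch import LightningModule
from lightning.pytorch.utilities import move_data_to_device


class BatchTransferExample(LightningModule):
    def on_before_batch_transfer(self, batch, dataloader_idx):
        # runs on the host, before the batch is moved to the accelerator
        batch["x"] = batch["x"].float()
        return batch

    def transfer_batch_to_device(self, batch, device, dataloader_idx):
        # custom placement logic; here we simply defer to Lightning's helper
        return move_data_to_device(batch, device)

    def on_after_batch_transfer(self, batch, dataloader_idx):
        # runs on the accelerator, e.g. apply device-side transforms
        batch["x"] = batch["x"] * 2  # placeholder for gpu_transforms(batch["x"])
        return batch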


@ -16,7 +16,6 @@ import lightning.pytorch as pl
from lightning.fabric.utilities.warnings import PossibleUserWarning
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable
from lightning.pytorch.utilities.model_helpers import is_overridden
from lightning.pytorch.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn
from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
@ -43,10 +42,7 @@ def _verify_loop_configurations(trainer: "pl.Trainer") -> None:
elif trainer.state.fn == TrainerFn.PREDICTING:
__verify_eval_loop_configuration(model, "predict")
__verify_batch_transfer_support(trainer)
__verify_configure_model_configuration(model)
__warn_dataloader_iter_limitations(model)
@ -120,22 +116,6 @@ def __verify_eval_loop_configuration(model: "pl.LightningModule", stage: str) ->
)
def __verify_batch_transfer_support(trainer: "pl.Trainer") -> None:
batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer")
datahook_selector = trainer._data_connector._datahook_selector
assert datahook_selector is not None
for hook in batch_transfer_hooks:
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
# TODO: This code could be done in a hook in the IPUAccelerator as it's a simple error check
# through the Trainer. It doesn't need to stay in Lightning
if isinstance(trainer.accelerator, IPUAccelerator) and (
is_overridden(hook, datahook_selector.model) or is_overridden(hook, datahook_selector.datamodule)
):
raise MisconfigurationException(f"Overriding `{hook}` is not supported with IPUs.")
def __verify_manual_optimization_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None:
if model.automatic_optimization:
return


@ -64,7 +64,6 @@ from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import (
_LIGHTNING_COLOSSALAI_AVAILABLE,
_graphcore_available_and_importable,
_habana_available_and_importable,
)
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn
@ -338,11 +337,6 @@ class _AcceleratorConnector:
"""Choose the accelerator type (str) based on availability."""
if XLAAccelerator.is_available():
return "tpu"
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
if IPUAccelerator.is_available():
return "ipu"
if _habana_available_and_importable():
from lightning_habana import HPUAccelerator
@ -420,16 +414,6 @@ class _AcceleratorConnector:
return LightningEnvironment()
def _choose_strategy(self) -> Union[Strategy, str]:
if self._accelerator_flag == "ipu":
if not _graphcore_available_and_importable():
raise ImportError(
"You have passed `accelerator='ipu'` but the IPU integration is not installed."
" Please run `pip install lightning-graphcore` or check out"
" https://github.com/Lightning-AI/lightning-Graphcore for instructions"
)
from lightning_graphcore import IPUStrategy
return IPUStrategy.strategy_name
if self._accelerator_flag == "hpu":
if not _habana_available_and_importable():
raise ImportError(
@ -500,16 +484,6 @@ class _AcceleratorConnector:
if isinstance(self._precision_plugin_flag, Precision):
return self._precision_plugin_flag
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator, IPUPrecision
# TODO: For the strategies that have a fixed precision class, we don't really need this logic
# in the accelerator. Since the strategy owns the precision plugin, the strategy.precision_plugin
# could be a no-op and then we wouldn't need this.
if isinstance(self.accelerator, IPUAccelerator):
return IPUPrecision(self._precision_flag)
if _habana_available_and_importable():
from lightning_habana import HPUAccelerator, HPUPrecisionPlugin
@ -691,12 +665,3 @@ def _register_external_accelerators_and_strategies() -> None:
HPUParallelStrategy.register_strategies(StrategyRegistry)
if "hpu_single" not in StrategyRegistry:
SingleHPUStrategy.register_strategies(StrategyRegistry)
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator, IPUStrategy
# TODO: Prevent registering multiple times
if "ipu" not in AcceleratorRegistry:
IPUAccelerator.register_accelerators(AcceleratorRegistry)
if "ipu_strategy" not in StrategyRegistry:
IPUStrategy.register_strategies(StrategyRegistry)


@ -34,7 +34,6 @@ from lightning.pytorch.trainer.states import RunningStage, TrainerFn
from lightning.pytorch.utilities.combined_loader import CombinedLoader
from lightning.pytorch.utilities.data import _is_dataloader_shuffled, _update_dataloader
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable
from lightning.pytorch.utilities.model_helpers import is_overridden
from lightning.pytorch.utilities.rank_zero import WarningCache, rank_zero_warn
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
@ -165,19 +164,11 @@ class _DataConnector:
datamodule.trainer = trainer
def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool:
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
# `DistributedSampler` is never used with `poptorch.DataLoader`
is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator)
else:
is_ipu = False
return (
self.trainer._accelerator_connector.use_distributed_sampler
and self.trainer._accelerator_connector.is_distributed
and not isinstance(dataloader.sampler, DistributedSampler)
and not has_iterable_dataset(dataloader)
and not is_ipu
)
def _prepare_dataloader(self, dataloader: object, shuffle: bool, mode: RunningStage) -> object:
@ -190,18 +181,9 @@ class _DataConnector:
# don't do anything if it's not a dataloader
if not isinstance(dataloader, DataLoader):
return dataloader
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
# IPUs use a custom `poptorch.DataLoader` which we might need to convert to
is_ipu = isinstance(self.trainer.accelerator, IPUAccelerator)
else:
is_ipu = False
if (
self._requires_distributed_sampler(dataloader) # sets the distributed sampler
or mode == RunningStage.PREDICTING # to track indices for the predictions
or is_ipu
):
sampler = self._resolve_sampler(dataloader, shuffle=shuffle, mode=mode)
return _update_dataloader(dataloader, sampler, mode=mode)


@ -28,7 +28,7 @@ from lightning.pytorch.profilers import (
XLAProfiler,
)
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _graphcore_available_and_importable, _habana_available_and_importable
from lightning.pytorch.utilities.imports import _habana_available_and_importable
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn
@ -158,16 +158,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None:
num_tpu_cores = trainer.num_devices if isinstance(trainer.accelerator, XLAAccelerator) else 0
rank_zero_info(f"TPU available: {XLAAccelerator.is_available()}, using: {num_tpu_cores} TPU cores")
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
num_ipus = trainer.num_devices if isinstance(trainer.accelerator, IPUAccelerator) else 0
ipu_available = IPUAccelerator.is_available()
else:
num_ipus = 0
ipu_available = False
rank_zero_info(f"IPU available: {ipu_available}, using: {num_ipus} IPUs")
if _habana_available_and_importable():
from lightning_habana import HPUAccelerator
@ -192,12 +182,6 @@ def _log_device_info(trainer: "pl.Trainer") -> None:
if XLAAccelerator.is_available() and not isinstance(trainer.accelerator, XLAAccelerator):
rank_zero_warn("TPU available but not used. You can set it by doing `Trainer(accelerator='tpu')`.")
if _graphcore_available_and_importable():
from lightning_graphcore import IPUAccelerator
if IPUAccelerator.is_available() and not isinstance(trainer.accelerator, IPUAccelerator):
rank_zero_warn("IPU available but not used. You can set it by doing `Trainer(accelerator='ipu')`.")
if _habana_available_and_importable():
from lightning_habana import HPUAccelerator


@ -136,7 +136,7 @@ class Trainer:
r"""Customize every aspect of training via flags.
Args:
accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "ipu", "hpu", "mps", "auto")
accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto")
as well as custom accelerator instances.
strategy: Supports different training strategies with aliases as well as custom strategies.
@ -151,7 +151,7 @@ class Trainer:
precision: Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'),
16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed').
Can be used on CPU, GPU, TPUs, HPUs or IPUs.
Can be used on CPU, GPU, TPUs, or HPUs.
Default: ``'32-true'``.
logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses


@ -41,15 +41,6 @@ def _try_import_module(module_name: str) -> bool:
return False
_LIGHTNING_GRAPHCORE_AVAILABLE = RequirementCache("lightning-graphcore>=0.1.0")
def _graphcore_available_and_importable() -> bool:
# This is defined as a function instead of a constant to avoid circular imports, because `lightning_graphcore`
# also imports Lightning
return bool(_LIGHTNING_GRAPHCORE_AVAILABLE) and _try_import_module("lightning_graphcore")
_LIGHTNING_HABANA_AVAILABLE = RequirementCache("lightning-habana>=1.2.0")
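For context, the availability guards defined here are typically consumed like this elsewhere in the codebase (an editor's sketch; the guarded body is illustrative):

from lightning.pytorch.utilities.imports import _habana_available_and_importable

if _habana_available_and_importable():
    # only import the optional integration once we know it is installed and importable
    from lightning_habana import HPUAccelerator

    print(HPUAccelerator.is_available())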


@ -62,7 +62,7 @@ Lightning forces the following structure to your code which makes it reusable an
- Non-essential research code (logging, etc... this goes in Callbacks).
- Data (use PyTorch DataLoaders or organize them into a LightningDataModule).
Once you do this, you can train on multiple-GPUs, TPUs, CPUs, IPUs, HPUs and even in 16-bit precision without changing your code!
Once you do this, you can train on multiple-GPUs, TPUs, CPUs, HPUs and even in 16-bit precision without changing your code!
[Get started in just 15 minutes](https://lightning.ai/docs/pytorch/latest/starter/introduction.html)


@ -63,7 +63,6 @@ def restore_env_variables():
"PL_GLOBAL_SEED",
"PL_SEED_WORKERS",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked by PyTorch
"CRC32C_SW_MODE", # set by tensorboardX
"OMP_NUM_THREADS", # set by our launchers


@ -83,7 +83,6 @@ def restore_env_variables():
"WANDB_REQUIRE_SERVICE",
"WANDB_SERVICE",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked by PyTorch
"KMP_INIT_AT_FORK", # leaked by PyTorch
"KMP_DUPLICATE_LIB_OK", # leaked by PyTorch


@ -1,21 +0,0 @@
from importlib import import_module
import pytest
@pytest.mark.parametrize(
("import_path", "name"),
[
("lightning.pytorch.accelerators", "IPUAccelerator"),
("lightning.pytorch.accelerators.ipu", "IPUAccelerator"),
("lightning.pytorch.strategies", "IPUStrategy"),
("lightning.pytorch.strategies.ipu", "IPUStrategy"),
("lightning.pytorch.plugins.precision", "IPUPrecisionPlugin"),
("lightning.pytorch.plugins.precision.ipu", "IPUPrecisionPlugin"),
],
)
def test_extracted_ipu(import_path, name):
module = import_module(import_path)
cls = getattr(module, name)
with pytest.raises(NotImplementedError, match=f"{name}` class has been moved to an external package.*"):
cls()


@ -58,7 +58,6 @@ from lightning.pytorch.trainer.connectors.accelerator_connector import _Accelera
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import (
_LIGHTNING_HABANA_AVAILABLE,
_graphcore_available_and_importable,
)
from lightning_utilities.core.imports import package_available
@ -580,16 +579,6 @@ def test_unsupported_tpu_choice(xla_available, tpu_available):
Trainer(accelerator="tpu", precision="16-true", strategy="ddp")
def mock_ipu_available(monkeypatch, value=True):
# TODO: this isn't really mocking. it should be implemented and used as `mock_hpu_count`
try:
import lightning_graphcore
except ModuleNotFoundError:
return
monkeypatch.setattr(lightning_graphcore.accelerator, "_IPU_AVAILABLE", value)
monkeypatch.setattr(lightning_graphcore.strategy, "_IPU_AVAILABLE", value)
if _LIGHTNING_HABANA_AVAILABLE:
from lightning_habana import HPUAccelerator, HPUParallelStrategy, SingleHPUStrategy
else:
@ -657,7 +646,6 @@ def mock_hpu_count(monkeypatch, n=1):
def test_devices_auto_choice_cpu(monkeypatch, cuda_count_0):
mock_hpu_count(monkeypatch, 0)
mock_ipu_available(monkeypatch, False)
mock_xla_available(monkeypatch, False)
trainer = Trainer(accelerator="auto", devices="auto")
assert trainer.num_devices == 1
@ -915,7 +903,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
trainer = Trainer()
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, SingleDeviceStrategy)
@ -927,7 +914,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 1)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
trainer = Trainer()
assert isinstance(trainer.accelerator, CUDAAccelerator)
assert isinstance(trainer.strategy, SingleDeviceStrategy)
@ -939,7 +925,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 4)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
trainer = Trainer()
assert isinstance(trainer.accelerator, CUDAAccelerator)
assert isinstance(trainer.strategy, (SingleDeviceStrategy if is_interactive else DDPStrategy))
@ -955,7 +940,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 1)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, MPSAccelerator)
assert isinstance(connector.strategy, SingleDeviceStrategy)
@ -965,7 +949,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
with monkeypatch.context():
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
mock_ipu_available(monkeypatch, False)
_mock_tpu_available(True)
monkeypatch.setattr(lightning.pytorch.accelerators.XLAAccelerator, "auto_device_count", lambda *_: 1)
monkeypatch.setattr(torch, "device", DeviceMock())
@ -982,7 +965,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
_mock_tpu_available(True)
mock_ipu_available(monkeypatch, False)
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, XLAAccelerator)
assert isinstance(connector.strategy, XLAStrategy)
@ -991,28 +973,11 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
assert connector.strategy._start_method == "fork"
assert connector.strategy.launcher.is_interactive_compatible
# Single/Multi IPU: strategy is the same
if _graphcore_available_and_importable():
with monkeypatch.context():
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, True)
from lightning_graphcore import IPUAccelerator, IPUStrategy
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, IPUAccelerator)
assert isinstance(connector.strategy, IPUStrategy)
assert connector._devices_flag == 4
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
assert connector.strategy.launcher is None
# Single HPU
with monkeypatch.context():
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
mock_hpu_count(monkeypatch, 1)
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, HPUAccelerator)
@ -1029,7 +994,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 0)
mock_mps_count(monkeypatch, 0)
mock_tpu_available(monkeypatch, False)
mock_ipu_available(monkeypatch, False)
mock_hpu_count(monkeypatch, 8)
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, HPUAccelerator)
@ -1047,7 +1011,6 @@ def test_connector_auto_selection(monkeypatch, is_interactive):
mock_cuda_count(monkeypatch, 2)
mock_mps_count(monkeypatch, 0)
_mock_tpu_available(True)
mock_ipu_available(monkeypatch, False)
connector = _AcceleratorConnector()
assert isinstance(connector.accelerator, XLAAccelerator)
assert isinstance(connector.strategy, XLAStrategy)