Rename LiteMultiNode to FabricMultiNode (#16505)

Adrian Wälchli 2023-01-26 12:36:27 +01:00 committed by GitHub
parent f812cb8339
commit c68cfd686e
11 changed files with 28 additions and 26 deletions

View File

@@ -205,7 +205,7 @@ subprojects:
- "examples/fabric/**"
- "examples/run_fabric_examples.sh"
- "tests/tests_fabric/run_standalone_*.sh"
- "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink
- "tests/tests_pytorch/run_standalone_tests.sh" # used by Fabric through a symlink
- "requirements/fabric/**"
- "src/lightning_fabric/**"
- "tests/tests_fabric/**"

View File

@@ -28,7 +28,7 @@ ___________________
~serve.python_server.PythonServer
~serve.streamlit.ServeStreamlit
~multi_node.base.MultiNode
~multi_node.lite.LiteMultiNode
~multi_node.fabric.FabricMultiNode
~multi_node.pytorch_spawn.PyTorchSpawnMultiNode
~multi_node.trainer.LightningTrainerMultiNode
~serve.auto_scaler.AutoScaler

View File

@@ -102,10 +102,10 @@ use the following utility function to pick GPU indices that are "accessible", wi
# Find two GPUs on the system that are not already occupied
trainer = Trainer(accelerator="cuda", devices=find_usable_cuda_devices(2))
from lightning.lite.accelerators import find_usable_cuda_devices
from lightning.fabric.accelerators import find_usable_cuda_devices
# Works with LightningLite too
lite = LightningLite(accelerator="cuda", devices=find_usable_cuda_devices(2))
# Works with Fabric too
fabric = Fabric(accelerator="cuda", devices=find_usable_cuda_devices(2))
This is especially useful when GPUs are configured to be in "exclusive compute mode", such that only one process at a time is allowed access to the device.
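Combined into one self-contained sketch (assuming ``find_usable_cuda_devices`` raises a ``RuntimeError`` when it cannot find the requested number of free devices):

from lightning.fabric import Fabric
from lightning.fabric.accelerators import find_usable_cuda_devices

# Pick two GPUs that no other process is currently using; this matters when
# the devices run in "exclusive compute mode".
try:
    devices = find_usable_cuda_devices(2)
except RuntimeError:
    # Assumption: raised when fewer than the requested number of free GPUs exist
    devices = 1

fabric = Fabric(accelerator="cuda", devices=devices)
fabric.launch()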

View File

@@ -42,7 +42,7 @@ Launch multi-node training in the cloud
:caption: app.py
import lightning as L
from lightning.app.components import LiteMultiNode
from lightning.app.components import FabricMultiNode
# 1. Put your code inside a LightningWork
class MyTrainingComponent(L.LightningWork):
@@ -58,16 +58,16 @@ Launch multi-node training in the cloud
model, optimizer = fabric.setup(model, optimizer)
...
**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``LiteMultiNode`` component.
**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``FabricMultiNode`` component.
Configure the number of nodes, the number of GPUs per node, and the type of GPU:
.. code-block:: python
:emphasize-lines: 5,7
:caption: app.py
# 2. Create the app with the LiteMultiNode component inside
# 2. Create the app with the FabricMultiNode component inside
app = L.LightningApp(
LiteMultiNode(
FabricMultiNode(
MyTrainingComponent,
# Run with 2 nodes
num_nodes=2,
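Putting both steps together, a minimal sketch of the full ``app.py``; the ``cloud_compute`` value mirrors the example in the next file and is only one possible choice, and ``FabricMultiNode`` fills in ``devices`` and ``num_nodes`` for the ``Fabric`` constructor:

import lightning as L
from lightning.app.components import FabricMultiNode
from lightning.fabric import Fabric


class MyTrainingComponent(L.LightningWork):
    def run(self):
        # devices and num_nodes are injected by FabricMultiNode
        fabric = Fabric()
        fabric.launch()
        # ... build the model and optimizer, then wrap them with fabric.setup(...)


app = L.LightningApp(
    FabricMultiNode(
        MyTrainingComponent,
        num_nodes=2,
        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100 per node
    )
)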

View File

@@ -1,11 +1,11 @@
import torch
import lightning as L
from lightning.app.components import LiteMultiNode
from lightning.app.components import FabricMultiNode
from lightning.fabric import Fabric
class LitePyTorchDistributed(L.LightningWork):
class FabricPyTorchDistributed(L.LightningWork):
def run(self):
# 1. Prepare the model
model = torch.nn.Sequential(
@@ -33,8 +33,8 @@ class LitePyTorchDistributed(L.LightningWork):
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
LiteMultiNode(
LitePyTorchDistributed,
FabricMultiNode(
FabricPyTorchDistributed,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100
num_nodes=2,
)

View File

@@ -99,7 +99,7 @@ warn_no_return = "False"
# the list can be generated with:
# mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
module = [
"lightning_app.components.multi_node.lite",
"lightning_app.components.multi_node.fabric",
"lightning_app.components.multi_node.base",
"lightning_app.components.multi_node.pytorch_spawn",
"lightning_app.components.multi_node.trainer",

View File

@@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Add support for async predict method in PythonServer and remove torch context ([#16453](https://github.com/Lightning-AI/lightning/pull/16453))
- Renamed `lightning.app.components.LiteMultiNode` to `lightning.app.components.FabricMultiNode` ([#16505](https://github.com/Lightning-AI/lightning/pull/16505))
### Deprecated

View File

@@ -1,8 +1,8 @@
from lightning_app.components.database.client import DatabaseClient
from lightning_app.components.database.server import Database
from lightning_app.components.multi_node import (
FabricMultiNode,
LightningTrainerMultiNode,
LiteMultiNode,
MultiNode,
PyTorchSpawnMultiNode,
)
@@ -33,7 +33,7 @@ __all__ = [
"Category",
"Text",
"MultiNode",
"LiteMultiNode",
"FabricMultiNode",
"LightningTrainerScript",
"PyTorchLightningScriptRunner",
"PyTorchSpawnMultiNode",

View File

@@ -1,6 +1,6 @@
from lightning_app.components.multi_node.base import MultiNode
from lightning_app.components.multi_node.lite import LiteMultiNode
from lightning_app.components.multi_node.fabric import FabricMultiNode
from lightning_app.components.multi_node.pytorch_spawn import PyTorchSpawnMultiNode
from lightning_app.components.multi_node.trainer import LightningTrainerMultiNode
__all__ = ["LiteMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"]
__all__ = ["FabricMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"]

View File

@@ -28,7 +28,7 @@ from lightning_app.utilities.tracer import Tracer
@runtime_checkable
class _LiteWorkProtocol(Protocol):
class _FabricWorkProtocol(Protocol):
@staticmethod
def run() -> None:
...
@@ -71,11 +71,11 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor):
os.environ["LOCAL_WORLD_SIZE"] = str(nprocs)
os.environ["TORCHELASTIC_RUN_ID"] = "1"
# Used to force Lite to set up the distributed environment.
# Used to force Fabric to set up the distributed environment.
os.environ["LT_CLI_USED"] = "1"
# Used to pass information to Lite directly.
def pre_fn(lite, *args, **kwargs):
# Used to pass information to Fabric directly.
def pre_fn(fabric, *args, **kwargs):
kwargs["devices"] = nprocs
kwargs["num_nodes"] = num_nodes
@@ -110,7 +110,7 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor):
return ret_val
class LiteMultiNode(MultiNode):
class FabricMultiNode(MultiNode):
def __init__(
self,
work_cls: Type["LightningWork"],
@@ -119,7 +119,7 @@ class LiteMultiNode(MultiNode):
*work_args: Any,
**work_kwargs: Any,
) -> None:
assert issubclass(work_cls, _LiteWorkProtocol)
assert issubclass(work_cls, _FabricWorkProtocol)
# Note: Private way to modify the work run executor
# Probably exposed to the users in the future if needed.
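For illustration, ``_FabricWorkProtocol`` only requires a ``run`` method, so the assertion above accepts any work class that defines one. A hypothetical sketch of the check (``MyWork`` is illustrative, not part of the commit):

from typing import Protocol, runtime_checkable


@runtime_checkable
class _FabricWorkProtocol(Protocol):
    @staticmethod
    def run() -> None:
        ...


class MyWork:
    # Hypothetical user-defined work; no inheritance is needed to satisfy the protocol.
    def run(self) -> None:
        print("training step goes here")


# Mirrors the assertion in FabricMultiNode.__init__
assert issubclass(MyWork, _FabricWorkProtocol)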

View File

@@ -8,7 +8,7 @@ from lightning_utilities.core.imports import module_available
from lightning_utilities.test.warning import no_warning_call
import lightning_fabric as lf
from lightning_app.components.multi_node.lite import _FabricRunExecutor
from lightning_app.components.multi_node.fabric import _FabricRunExecutor
class DummyFabric(lf.Fabric):
@@ -48,7 +48,7 @@ def check_lightning_fabric_mps():
@pytest.mark.skipif(not check_lightning_fabric_mps(), reason="Fabric not available or mps not available")
@pytest.mark.parametrize("accelerator_given,accelerator_expected", [("cpu", "cpu"), ("auto", "cpu"), ("gpu", "cpu")])
def test_lite_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
def test_fabric_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
warning_str = (
r"Forcing accelerator=cpu as other accelerators \(specifically MPS\) are not supported "
+ "by PyTorch for distributed training on mps capable devices"