diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 11a9c556c8..656c12f3d9 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -205,7 +205,7 @@ subprojects: - "examples/fabric/**" - "examples/run_fabric_examples.sh" - "tests/tests_fabric/run_standalone_*.sh" - - "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink + - "tests/tests_pytorch/run_standalone_tests.sh" # used by Fabric through a symlink - "requirements/fabric/**" - "src/lightning_fabric/**" - "tests/tests_fabric/**" diff --git a/docs/source-app/api_reference/components.rst b/docs/source-app/api_reference/components.rst index 7d85abbb36..5a3cb681e1 100644 --- a/docs/source-app/api_reference/components.rst +++ b/docs/source-app/api_reference/components.rst @@ -28,7 +28,7 @@ ___________________ ~serve.python_server.PythonServer ~serve.streamlit.ServeStreamlit ~multi_node.base.MultiNode - ~multi_node.lite.LiteMultiNode + ~multi_node.fabric.FabricMultiNode ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode ~multi_node.trainer.LightningTrainerMultiNode ~serve.auto_scaler.AutoScaler diff --git a/docs/source-pytorch/accelerators/gpu_basic.rst b/docs/source-pytorch/accelerators/gpu_basic.rst index 3bf82d39e6..2d8ad71cb4 100644 --- a/docs/source-pytorch/accelerators/gpu_basic.rst +++ b/docs/source-pytorch/accelerators/gpu_basic.rst @@ -102,10 +102,10 @@ use the following utility function to pick GPU indices that are "accessible", wi # Find two GPUs on the system that are not already occupied trainer = Trainer(accelerator="cuda", devices=find_usable_cuda_devices(2)) - from lightning.lite.accelerators import find_usable_cuda_devices + from lightning.fabric.accelerators import find_usable_cuda_devices - # Works with LightningLite too - lite = LightningLite(accelerator="cuda", devices=find_usable_cuda_devices(2)) + # Works with Fabric too + fabric = Fabric(accelerator="cuda", devices=find_usable_cuda_devices(2)) This is especially useful when GPUs are configured to 
be in "exclusive compute mode", such that only one process at a time is allowed access to the device. diff --git a/docs/source-pytorch/fabric/guide/multi_node/cloud.rst b/docs/source-pytorch/fabric/guide/multi_node/cloud.rst index 4b015992d8..15ed55b29d 100644 --- a/docs/source-pytorch/fabric/guide/multi_node/cloud.rst +++ b/docs/source-pytorch/fabric/guide/multi_node/cloud.rst @@ -42,7 +42,7 @@ Launch multi-node training in the cloud :caption: app.py import lightning as L - from lightning.app.components import LiteMultiNode + from lightning.app.components import FabricMultiNode # 1. Put your code inside a LightningWork class MyTrainingComponent(L.LightningWork): @@ -58,16 +58,16 @@ Launch multi-node training in the cloud model, optimizer = fabric.setup(model, optimizer) ... -**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``LiteMultiNode`` component. +**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``FabricMultiNode`` component. Configure the number of nodes, the number of GPUs per node, and the type of GPU: .. code-block:: python :emphasize-lines: 5,7 :caption: app.py - # 2. Create the app with the LiteMultiNode component inside + # 2. Create the app with the FabricMultiNode component inside app = L.LightningApp( - LiteMultiNode( + FabricMultiNode( MyTrainingComponent, # Run with 2 nodes num_nodes=2, diff --git a/examples/app_multi_node/train_fabric.py b/examples/app_multi_node/train_fabric.py index 5a1751e538..1bb2ecd313 100644 --- a/examples/app_multi_node/train_fabric.py +++ b/examples/app_multi_node/train_fabric.py @@ -1,11 +1,11 @@ import torch import lightning as L -from lightning.app.components import LiteMultiNode +from lightning.app.components import FabricMultiNode from lightning.fabric import Fabric -class LitePyTorchDistributed(L.LightningWork): +class FabricPyTorchDistributed(L.LightningWork): def run(self): # 1. 
Prepare the model model = torch.nn.Sequential( @@ -33,8 +33,8 @@ class LitePyTorchDistributed(L.LightningWork): # 8 GPUs: (2 nodes of 4 x v100) app = L.LightningApp( - LiteMultiNode( - LitePyTorchDistributed, + FabricMultiNode( + FabricPyTorchDistributed, cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100 num_nodes=2, ) diff --git a/pyproject.toml b/pyproject.toml index 1a9ac87d8e..de4cfcf4b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,7 +99,7 @@ warn_no_return = "False" # the list can be generated with: # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ - "lightning_app.components.multi_node.lite", + "lightning_app.components.multi_node.fabric", "lightning_app.components.multi_node.base", "lightning_app.components.multi_node.pytorch_spawn", "lightning_app.components.multi_node.trainer", diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 0e06354d59..e9b294eed6 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Add support for async predict method in PythonServer and remove torch context ([#16453](https://github.com/Lightning-AI/lightning/pull/16453)) +- Renamed `lightning.app.components.LiteMultiNode` to `lightning.app.components.FabricMultiNode` ([#16505](https://github.com/Lightning-AI/lightning/pull/16505)) + ### Deprecated diff --git a/src/lightning_app/components/__init__.py b/src/lightning_app/components/__init__.py index e639a597e8..23505cca13 100644 --- a/src/lightning_app/components/__init__.py +++ b/src/lightning_app/components/__init__.py @@ -1,8 +1,8 @@ from lightning_app.components.database.client import DatabaseClient from lightning_app.components.database.server import Database from lightning_app.components.multi_node import ( + FabricMultiNode, LightningTrainerMultiNode, - LiteMultiNode, MultiNode, PyTorchSpawnMultiNode, ) @@ -33,7 +33,7 @@ __all__ = [ "Category", "Text", "MultiNode", - "LiteMultiNode", + "FabricMultiNode", "LightningTrainerScript", "PyTorchLightningScriptRunner", "PyTorchSpawnMultiNode", diff --git a/src/lightning_app/components/multi_node/__init__.py b/src/lightning_app/components/multi_node/__init__.py index b2d45a2610..464029341e 100644 --- a/src/lightning_app/components/multi_node/__init__.py +++ b/src/lightning_app/components/multi_node/__init__.py @@ -1,6 +1,6 @@ from lightning_app.components.multi_node.base import MultiNode -from lightning_app.components.multi_node.lite import LiteMultiNode +from lightning_app.components.multi_node.fabric import FabricMultiNode from lightning_app.components.multi_node.pytorch_spawn import PyTorchSpawnMultiNode from lightning_app.components.multi_node.trainer import LightningTrainerMultiNode -__all__ = ["LiteMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"] +__all__ = ["FabricMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"] diff --git a/src/lightning_app/components/multi_node/lite.py b/src/lightning_app/components/multi_node/fabric.py 
similarity index 93% rename from src/lightning_app/components/multi_node/lite.py rename to src/lightning_app/components/multi_node/fabric.py index 527bd7352c..d2ec944a03 100644 --- a/src/lightning_app/components/multi_node/lite.py +++ b/src/lightning_app/components/multi_node/fabric.py @@ -28,7 +28,7 @@ from lightning_app.utilities.tracer import Tracer @runtime_checkable -class _LiteWorkProtocol(Protocol): +class _FabricWorkProtocol(Protocol): @staticmethod def run() -> None: ... @@ -71,11 +71,11 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor): os.environ["LOCAL_WORLD_SIZE"] = str(nprocs) os.environ["TORCHELASTIC_RUN_ID"] = "1" - # Used to force Lite to setup the distributed environnement. + # Used to force Fabric to set up the distributed environment. os.environ["LT_CLI_USED"] = "1" - # Used to pass information to Lite directly. - def pre_fn(lite, *args, **kwargs): + # Used to pass information to Fabric directly. + def pre_fn(fabric, *args, **kwargs): kwargs["devices"] = nprocs kwargs["num_nodes"] = num_nodes @@ -110,7 +110,7 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor): return ret_val -class LiteMultiNode(MultiNode): +class FabricMultiNode(MultiNode): def __init__( self, work_cls: Type["LightningWork"], @@ -119,7 +119,7 @@ class LiteMultiNode(MultiNode): *work_args: Any, **work_kwargs: Any, ) -> None: - assert issubclass(work_cls, _LiteWorkProtocol) + assert issubclass(work_cls, _FabricWorkProtocol) # Note: Private way to modify the work run executor # Probably exposed to the users in the future if needed. 
diff --git a/tests/tests_app/components/multi_node/test_fabric.py b/tests/tests_app/components/multi_node/test_fabric.py index 6232ad9e25..e4bb811cd4 100644 --- a/tests/tests_app/components/multi_node/test_fabric.py +++ b/tests/tests_app/components/multi_node/test_fabric.py @@ -8,7 +8,7 @@ from lightning_utilities.core.imports import module_available from lightning_utilities.test.warning import no_warning_call import lightning_fabric as lf -from lightning_app.components.multi_node.lite import _FabricRunExecutor +from lightning_app.components.multi_node.fabric import _FabricRunExecutor class DummyFabric(lf.Fabric): @@ -48,7 +48,7 @@ def check_lightning_fabric_mps(): @pytest.mark.skipif(not check_lightning_fabric_mps(), reason="Fabric not available or mps not available") @pytest.mark.parametrize("accelerator_given,accelerator_expected", [("cpu", "cpu"), ("auto", "cpu"), ("gpu", "cpu")]) -def test_lite_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected): +def test_fabric_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected): warning_str = ( r"Forcing accelerator=cpu as other accelerators \(specifically MPS\) are not supported " + "by PyTorch for distributed training on mps capable devices"