Rename LiteMultiNode to FabricMultiNode (#16505)

This commit is contained in:
parent f812cb8339
commit c68cfd686e
@@ -205,7 +205,7 @@ subprojects:
   - "examples/fabric/**"
   - "examples/run_fabric_examples.sh"
   - "tests/tests_fabric/run_standalone_*.sh"
-  - "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink
+  - "tests/tests_pytorch/run_standalone_tests.sh" # used by Fabric through a symlink
   - "requirements/fabric/**"
   - "src/lightning_fabric/**"
   - "tests/tests_fabric/**"
@@ -28,7 +28,7 @@ ___________________
     ~serve.python_server.PythonServer
     ~serve.streamlit.ServeStreamlit
     ~multi_node.base.MultiNode
-    ~multi_node.lite.LiteMultiNode
+    ~multi_node.fabric.FabricMultiNode
     ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode
     ~multi_node.trainer.LightningTrainerMultiNode
     ~serve.auto_scaler.AutoScaler
@@ -102,10 +102,10 @@ use the following utility function to pick GPU indices that are "accessible", wi
     # Find two GPUs on the system that are not already occupied
     trainer = Trainer(accelerator="cuda", devices=find_usable_cuda_devices(2))

-    from lightning.lite.accelerators import find_usable_cuda_devices
+    from lightning.fabric.accelerators import find_usable_cuda_devices

-    # Works with LightningLite too
-    lite = LightningLite(accelerator="cuda", devices=find_usable_cuda_devices(2))
+    # Works with Fabric too
+    fabric = Fabric(accelerator="cuda", devices=find_usable_cuda_devices(2))


 This is especially useful when GPUs are configured to be in "exclusive compute mode", such that only one process at a time is allowed access to the device.
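As a usage note on the renamed import path, here is a minimal, self-contained sketch of how `find_usable_cuda_devices` is typically combined with Fabric. The fallback branch and the assumption that the helper raises `RuntimeError` when too few free GPUs exist are illustrative, not taken from this diff:

```python
from lightning.fabric import Fabric
from lightning.fabric.accelerators import find_usable_cuda_devices

# Ask for two GPUs that are not already occupied by another process.
# Assumption: the helper raises RuntimeError if it cannot find enough.
try:
    devices = find_usable_cuda_devices(2)  # e.g. [0, 3] on a busy machine
except RuntimeError:
    devices = 1  # illustrative fallback: take a single (possibly shared) device

fabric = Fabric(accelerator="cuda", devices=devices)
fabric.launch()
```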
@@ -42,7 +42,7 @@ Launch multi-node training in the cloud
    :caption: app.py

    import lightning as L
-   from lightning.app.components import LiteMultiNode
+   from lightning.app.components import FabricMultiNode

    # 1. Put your code inside a LightningWork
    class MyTrainingComponent(L.LightningWork):
@@ -58,16 +58,16 @@ Launch multi-node training in the cloud
        model, optimizer = fabric.setup(model, optimizer)
        ...

-**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``LiteMultiNode`` component.
+**Step 2:** Init a :class:`~lightning_app.core.app.LightningApp` with the ``FabricMultiNode`` component.
 Configure the number of nodes, the number of GPUs per node, and the type of GPU:

 .. code-block:: python
    :emphasize-lines: 5,7
    :caption: app.py

-   # 2. Create the app with the LiteMultiNode component inside
+   # 2. Create the app with the FabricMultiNode component inside
    app = L.LightningApp(
-       LiteMultiNode(
+       FabricMultiNode(
            MyTrainingComponent,
            # Run with 2 nodes
            num_nodes=2,
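Taken together, steps 1 and 2 produce an `app.py` along these lines; the training body and the `CloudCompute` size are placeholders, so read this as a sketch rather than the exact snippet from the docs:

```python
import lightning as L
from lightning.app.components import FabricMultiNode
from lightning.fabric import Fabric


class MyTrainingComponent(L.LightningWork):
    def run(self):
        # FabricMultiNode injects devices/num_nodes into Fabric's kwargs,
        # so no hardware details need to be hardcoded here.
        fabric = Fabric(strategy="ddp")
        fabric.launch()
        fabric.print(f"rank {fabric.global_rank} of {fabric.world_size}")


app = L.LightningApp(
    FabricMultiNode(
        MyTrainingComponent,
        num_nodes=2,
        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # placeholder machine type
    )
)
```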
@@ -1,11 +1,11 @@
 import torch

 import lightning as L
-from lightning.app.components import LiteMultiNode
+from lightning.app.components import FabricMultiNode
 from lightning.fabric import Fabric


-class LitePyTorchDistributed(L.LightningWork):
+class FabricPyTorchDistributed(L.LightningWork):
     def run(self):
         # 1. Prepare the model
         model = torch.nn.Sequential(
@@ -33,8 +33,8 @@ class LitePyTorchDistributed(L.LightningWork):

 # 8 GPUs: (2 nodes of 4 x v100)
 app = L.LightningApp(
-    LiteMultiNode(
-        LitePyTorchDistributed,
+    FabricMultiNode(
+        FabricPyTorchDistributed,
         cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
         num_nodes=2,
     )
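For completeness: at the time of this change, an app like the one above was launched through the Lightning CLI, e.g. `lightning run app app.py --cloud`. The exact invocation is recalled from the Lightning Apps docs of this era rather than from this diff, so treat it as an assumption.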
@@ -99,7 +99,7 @@ warn_no_return = "False"
 # the list can be generated with:
 # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
 module = [
-    "lightning_app.components.multi_node.lite",
+    "lightning_app.components.multi_node.fabric",
     "lightning_app.components.multi_node.base",
     "lightning_app.components.multi_node.pytorch_spawn",
     "lightning_app.components.multi_node.trainer",
@@ -18,6 +18,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Add support for async predict method in PythonServer and remove torch context ([#16453](https://github.com/Lightning-AI/lightning/pull/16453))

+- Renamed `lightning.app.components.LiteMultiNode` to `lightning.app.components.FabricMultiNode` ([#16505](https://github.com/Lightning-AI/lightning/pull/16505))
+

 ### Deprecated

@ -1,8 +1,8 @@
|
|||
from lightning_app.components.database.client import DatabaseClient
|
||||
from lightning_app.components.database.server import Database
|
||||
from lightning_app.components.multi_node import (
|
||||
FabricMultiNode,
|
||||
LightningTrainerMultiNode,
|
||||
LiteMultiNode,
|
||||
MultiNode,
|
||||
PyTorchSpawnMultiNode,
|
||||
)
|
||||
|
@@ -33,7 +33,7 @@ __all__ = [
     "Category",
     "Text",
     "MultiNode",
-    "LiteMultiNode",
+    "FabricMultiNode",
     "LightningTrainerScript",
     "PyTorchLightningScriptRunner",
     "PyTorchSpawnMultiNode",
@@ -1,6 +1,6 @@
 from lightning_app.components.multi_node.base import MultiNode
-from lightning_app.components.multi_node.lite import LiteMultiNode
+from lightning_app.components.multi_node.fabric import FabricMultiNode
 from lightning_app.components.multi_node.pytorch_spawn import PyTorchSpawnMultiNode
 from lightning_app.components.multi_node.trainer import LightningTrainerMultiNode

-__all__ = ["LiteMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"]
+__all__ = ["FabricMultiNode", "MultiNode", "PyTorchSpawnMultiNode", "LightningTrainerMultiNode"]
@@ -28,7 +28,7 @@ from lightning_app.utilities.tracer import Tracer


 @runtime_checkable
-class _LiteWorkProtocol(Protocol):
+class _FabricWorkProtocol(Protocol):
     @staticmethod
     def run() -> None:
         ...
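Since the rename touches the protocol used for validation, here is a brief standalone sketch of the `@runtime_checkable` pattern the class relies on; the class names below are illustrative, not part of the codebase:

```python
from typing import Protocol, runtime_checkable


@runtime_checkable
class _WorkProtocol(Protocol):
    @staticmethod
    def run() -> None:
        ...


class GoodWork:
    @staticmethod
    def run() -> None:
        print("training step")


class BadWork:  # no `run` attribute
    pass


# runtime_checkable protocols check only that the members exist,
# not their signatures — enough to fail fast on a malformed work class.
assert issubclass(GoodWork, _WorkProtocol)
assert not issubclass(BadWork, _WorkProtocol)
```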
@@ -71,11 +71,11 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor):
         os.environ["LOCAL_WORLD_SIZE"] = str(nprocs)
         os.environ["TORCHELASTIC_RUN_ID"] = "1"

-        # Used to force Lite to setup the distributed environnement.
+        # Used to force Fabric to set up the distributed environment.
         os.environ["LT_CLI_USED"] = "1"

-        # Used to pass information to Lite directly.
-        def pre_fn(lite, *args, **kwargs):
+        # Used to pass information to Fabric directly.
+        def pre_fn(fabric, *args, **kwargs):
             kwargs["devices"] = nprocs
             kwargs["num_nodes"] = num_nodes
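The `pre_fn` hook is what lets the executor override whatever `devices`/`num_nodes` the user passed when constructing `Fabric`. A simplified, self-contained illustration of that kwarg-injection pattern follows (the wrapper below is a hypothetical stand-in for `lightning_app.utilities.tracer.Tracer`):

```python
import functools


def force_kwargs(init, **forced):
    """Wrap a constructor so selected kwargs are always overridden."""

    @functools.wraps(init)
    def wrapper(self, *args, **kwargs):
        kwargs.update(forced)  # mirrors pre_fn setting devices/num_nodes
        return init(self, *args, **kwargs)

    return wrapper


class FakeFabric:  # hypothetical stand-in for lightning.fabric.Fabric
    def __init__(self, devices=1, num_nodes=1):
        self.devices, self.num_nodes = devices, num_nodes


FakeFabric.__init__ = force_kwargs(FakeFabric.__init__, devices=4, num_nodes=2)
assert FakeFabric(devices=1).devices == 4  # user-supplied value is overridden
assert FakeFabric().num_nodes == 2
```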
@@ -110,7 +110,7 @@ class _FabricRunExecutor(_PyTorchSpawnRunExecutor):
         return ret_val


-class LiteMultiNode(MultiNode):
+class FabricMultiNode(MultiNode):
     def __init__(
         self,
         work_cls: Type["LightningWork"],
@@ -119,7 +119,7 @@ class LiteMultiNode(MultiNode):
         *work_args: Any,
         **work_kwargs: Any,
     ) -> None:
-        assert issubclass(work_cls, _LiteWorkProtocol)
+        assert issubclass(work_cls, _FabricWorkProtocol)

         # Note: Private way to modify the work run executor
         # Probably exposed to the users in the future if needed.
@@ -8,7 +8,7 @@ from lightning_utilities.core.imports import module_available
 from lightning_utilities.test.warning import no_warning_call

 import lightning_fabric as lf
-from lightning_app.components.multi_node.lite import _FabricRunExecutor
+from lightning_app.components.multi_node.fabric import _FabricRunExecutor


 class DummyFabric(lf.Fabric):
@@ -48,7 +48,7 @@ def check_lightning_fabric_mps():

 @pytest.mark.skipif(not check_lightning_fabric_mps(), reason="Fabric not available or mps not available")
 @pytest.mark.parametrize("accelerator_given,accelerator_expected", [("cpu", "cpu"), ("auto", "cpu"), ("gpu", "cpu")])
-def test_lite_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
+def test_fabric_run_executor_mps_forced_cpu(accelerator_given, accelerator_expected):
     warning_str = (
         r"Forcing accelerator=cpu as other accelerators \(specifically MPS\) are not supported "
         + "by PyTorch for distributed training on mps capable devices"