# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import os
import sys
from contextlib import nullcontext
from typing import Any, Dict
from unittest import mock
from unittest.mock import Mock
import lightning.fabric
import pytest
import torch
import torch.distributed
from lightning.fabric import Fabric
from lightning.fabric.accelerators import XLAAccelerator
from lightning.fabric.accelerators.accelerator import Accelerator
from lightning.fabric.accelerators.cpu import CPUAccelerator
from lightning.fabric.accelerators.cuda import CUDAAccelerator
from lightning.fabric.accelerators.mps import MPSAccelerator
from lightning.fabric.connector import _Connector
from lightning.fabric.plugins import (
BitsandbytesPrecision,
DeepSpeedPrecision,
DoublePrecision,
FSDPPrecision,
HalfPrecision,
MixedPrecision,
Precision,
XLAPrecision,
)
from lightning.fabric.plugins.environments import (
KubeflowEnvironment,
LightningEnvironment,
LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
XLAEnvironment,
)
from lightning.fabric.plugins.io import TorchCheckpointIO
from lightning.fabric.strategies import (
DataParallelStrategy,
DDPStrategy,
DeepSpeedStrategy,
FSDPStrategy,
ModelParallelStrategy,
SingleDeviceStrategy,
SingleDeviceXLAStrategy,
XLAFSDPStrategy,
XLAStrategy,
)
from lightning.fabric.strategies.ddp import _DDP_FORK_ALIASES
from lightning.fabric.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
from lightning.fabric.utilities.imports import _IS_WINDOWS
from lightning_utilities.test.warning import no_warning_call
from tests_fabric.conftest import mock_tpu_available
from tests_fabric.helpers.runif import RunIf


class DeviceMock(Mock):
    """Stand-in for ``torch.device`` whose ``__instancecheck__`` always succeeds, so objects created while
    ``torch.device`` is patched with this mock still pass ``isinstance(..., torch.device)`` checks."""

    def __instancecheck__(self, instance):
        return True


@pytest.mark.parametrize(
("accelerator", "devices"), [("tpu", "auto"), ("tpu", 1), ("tpu", [1]), ("tpu", 8), ("auto", 1), ("auto", 8)]
)
@RunIf(min_python="3.9") # mocking issue
def test_accelerator_choice_tpu(accelerator, devices, tpu_available, monkeypatch):
monkeypatch.setattr(torch, "device", DeviceMock())
connector = _Connector(accelerator=accelerator, devices=devices)
assert isinstance(connector.accelerator, XLAAccelerator)
if devices == "auto" or (isinstance(devices, int) and devices > 1):
assert isinstance(connector.strategy, XLAStrategy)
assert isinstance(connector.strategy.cluster_environment, XLAEnvironment)
assert isinstance(connector.cluster_environment, XLAEnvironment)
else:
assert isinstance(connector.strategy, SingleDeviceXLAStrategy)
@RunIf(skip_windows=True, standalone=True)
def test_strategy_choice_ddp_on_cpu():
"""Test that selecting DDPStrategy on CPU works."""
_test_strategy_choice_ddp_and_cpu(ddp_strategy_class=DDPStrategy)
def _test_strategy_choice_ddp_and_cpu(ddp_strategy_class):
connector = _Connector(
strategy=ddp_strategy_class(),
accelerator="cpu",
devices=2,
)
assert isinstance(connector.strategy, ddp_strategy_class)
assert isinstance(connector.accelerator, CPUAccelerator)
assert connector.strategy.num_processes == 2
assert connector.strategy.parallel_devices == [torch.device("cpu")] * 2
@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_custom_cluster_environment_in_slurm_environment(_):
"""Test that we choose the custom cluster even when SLURM or TE flags are around."""
class CustomCluster(LightningEnvironment):
@property
def main_address(self):
return "asdf"
@property
def creates_processes_externally(self) -> bool:
return True
connector = _Connector(
plugins=[CustomCluster()],
accelerator="cpu",
strategy="ddp",
devices=2,
)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, CustomCluster)
# this checks that `strategy._set_world_ranks` was called by the connector
assert connector.strategy.world_size == 2
@RunIf(mps=False)
@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_custom_accelerator(*_):
class Accel(Accelerator):
def setup_device(self, device: torch.device) -> None:
pass
def get_device_stats(self, device: torch.device) -> Dict[str, Any]:
pass
def teardown(self) -> None:
pass
@staticmethod
def parse_devices(devices):
return devices
@staticmethod
def get_parallel_devices(devices):
return [torch.device("cpu")] * devices
@staticmethod
def auto_device_count() -> int:
return 1
@staticmethod
def is_available() -> bool:
return True
@staticmethod
def name() -> str:
return "custom_acc_name"
class Prec(Precision):
pass
class Strat(SingleDeviceStrategy):
pass
strategy = Strat(device=torch.device("cpu"), accelerator=Accel(), precision=Prec())
connector = _Connector(strategy=strategy, devices=2)
assert isinstance(connector.accelerator, Accel)
assert isinstance(connector.strategy, Strat)
assert isinstance(connector.precision, Prec)
assert connector.strategy is strategy
class Strat(DDPStrategy):
pass
strategy = Strat(accelerator=Accel(), precision=Prec())
connector = _Connector(strategy=strategy, devices=2)
assert isinstance(connector.accelerator, Accel)
assert isinstance(connector.strategy, Strat)
assert isinstance(connector.precision, Prec)
assert connector.strategy is strategy
@pytest.mark.parametrize(
("env_vars", "expected_environment"),
[
(
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
SLURMEnvironment,
),
(
{
"LSB_JOBID": "1",
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
"JSM_NAMESPACE_LOCAL_RANK": "1",
"JSM_NAMESPACE_SIZE": "20",
"JSM_NAMESPACE_RANK": "1",
},
LSFEnvironment,
),
],
)
@mock.patch("lightning.fabric.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
@mock.patch("lightning.fabric.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
with mock.patch.dict(os.environ, env_vars, clear=True):
connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, expected_environment)
@RunIf(mps=False)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_interactive_incompatible_backend_error(_, monkeypatch):
monkeypatch.setattr(lightning.fabric.connector, "_IS_INTERACTIVE", True)
with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"):
_Connector(strategy="ddp", accelerator="gpu", devices=2)
with pytest.raises(RuntimeError, match=r"strategy='ddp_spawn'\)`.*is not compatible"):
_Connector(strategy="ddp_spawn", accelerator="gpu", devices=2)
with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"):
# Edge case: _Connector maps dp to ddp if accelerator != gpu
_Connector(strategy="dp", accelerator="cpu")
def test_precision_and_precision_plugin_raises():
with pytest.raises(ValueError, match="both `precision=16-true` and `plugins"):
_Connector(precision="16-true", plugins=Precision())
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_interactive_compatible_dp_strategy_gpu(_, __, monkeypatch):
monkeypatch.setattr(lightning.fabric.utilities.imports, "_IS_INTERACTIVE", True)
connector = _Connector(strategy="dp", accelerator="gpu")
assert connector.strategy.launcher is None
@RunIf(skip_windows=True)
def test_interactive_compatible_strategy_ddp_fork(monkeypatch):
monkeypatch.setattr(lightning.fabric.utilities.imports, "_IS_INTERACTIVE", True)
connector = _Connector(strategy="ddp_fork", accelerator="cpu")
assert connector.strategy.launcher.is_interactive_compatible
@RunIf(mps=True)
@pytest.mark.parametrize(
("strategy", "strategy_class"),
[
("ddp", DDPStrategy),
("dp", DataParallelStrategy),
pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
],
)
@pytest.mark.parametrize("accelerator", ["mps", "auto", "gpu", MPSAccelerator()])
def test_invalid_ddp_strategy_with_mps(accelerator, strategy, strategy_class):
with pytest.raises(ValueError, match="strategies from the DDP family are not supported"):
_Connector(accelerator=accelerator, strategy=strategy)
with pytest.raises(ValueError, match="strategies from the DDP family are not supported"):
_Connector(accelerator="mps", strategy=strategy_class())
@RunIf(mps=False)
@pytest.mark.parametrize(
("strategy", "strategy_class"),
[
("ddp", DDPStrategy),
("ddp_spawn", DDPStrategy),
pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
],
)
@pytest.mark.parametrize("devices", [1, 2])
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_strategy_choice_multi_node_gpu(_, strategy, strategy_class, devices):
connector = _Connector(num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices)
assert isinstance(connector.strategy, strategy_class)
def test_num_nodes_input_validation():
with pytest.raises(ValueError, match="`num_nodes` must be a positive integer"):
_Connector(num_nodes=0)
with pytest.raises(ValueError, match="`num_nodes` must be a positive integer"):
_Connector(num_nodes=-1)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_cuda_accelerator_can_not_run_on_system(_):
connector = _Connector(accelerator="cpu")
assert isinstance(connector.accelerator, CPUAccelerator)
with pytest.raises(
RuntimeError,
match="CUDAAccelerator` can not run on your system since the accelerator is not available.",
):
_Connector(accelerator="cuda", devices=1)
@pytest.mark.skipif(XLAAccelerator.is_available(), reason="test requires missing TPU")
@mock.patch("lightning.fabric.accelerators.xla._XLA_AVAILABLE", True)
@mock.patch("lightning.fabric.accelerators.xla._using_pjrt", return_value=True)
def test_tpu_accelerator_can_not_run_on_system(_):
with pytest.raises(RuntimeError, match="XLAAccelerator` can not run on your system"):
_Connector(accelerator="tpu", devices=8)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@pytest.mark.parametrize("device_count", [["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False]])
def test_accelerator_invalid_type_devices(_, device_count):
with pytest.raises(TypeError, match=r"must be an int, a string, a sequence of ints, but you"):
_ = _Connector(accelerator="gpu", devices=device_count)
@RunIf(min_cuda_gpus=1)
def test_accelerator_gpu():
connector = _Connector(accelerator="gpu", devices=1)
assert isinstance(connector.accelerator, CUDAAccelerator)
connector = _Connector(accelerator="gpu")
assert isinstance(connector.accelerator, CUDAAccelerator)
connector = _Connector(accelerator="auto", devices=1)
assert isinstance(connector.accelerator, CUDAAccelerator)
@pytest.mark.parametrize(("devices", "strategy_class"), [(1, SingleDeviceStrategy), (5, DDPStrategy)])
def test_accelerator_cpu_with_devices(devices, strategy_class):
connector = _Connector(accelerator="cpu", devices=devices)
assert connector._parallel_devices == [torch.device("cpu")] * devices
assert isinstance(connector.strategy, strategy_class)
assert isinstance(connector.accelerator, CPUAccelerator)
@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize(
("devices", "strategy_class"), [(1, SingleDeviceStrategy), ([1], SingleDeviceStrategy), (2, DDPStrategy)]
)
def test_accelerator_gpu_with_devices(devices, strategy_class):
connector = _Connector(accelerator="gpu", devices=devices)
    assert len(connector._parallel_devices) == (len(devices) if isinstance(devices, list) else devices)
assert isinstance(connector.strategy, strategy_class)
assert isinstance(connector.accelerator, CUDAAccelerator)
@RunIf(min_cuda_gpus=1)
def test_accelerator_auto_with_devices_gpu():
connector = _Connector(accelerator="auto", devices=1)
assert isinstance(connector.accelerator, CUDAAccelerator)
assert connector._parallel_devices == [torch.device("cuda", 0)]
def test_set_devices_if_none_cpu():
connector = _Connector(accelerator="cpu", devices=3)
assert connector._parallel_devices == [torch.device("cpu")] * 3
@RunIf(mps=False)
def test_unsupported_strategy_types_on_cpu_and_fallback():
with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"):
connector = _Connector(accelerator="cpu", strategy="dp", devices=2)
assert isinstance(connector.strategy, DDPStrategy)
def test_invalid_accelerator_choice():
with pytest.raises(ValueError, match="You selected an invalid accelerator name: `accelerator='cocofruit'`"):
_Connector(accelerator="cocofruit")
@pytest.mark.parametrize("invalid_strategy", ["cocofruit", object()])
def test_invalid_strategy_choice(invalid_strategy):
with pytest.raises(ValueError, match="You selected an invalid strategy name:"):
_Connector(strategy=invalid_strategy)
@pytest.mark.parametrize(
("strategy", "strategy_class"),
[
("ddp_spawn", DDPStrategy),
("ddp", DDPStrategy),
],
)
def test_strategy_choice_cpu_str(strategy, strategy_class):
connector = _Connector(strategy=strategy, accelerator="cpu", devices=2)
assert isinstance(connector.strategy, strategy_class)
@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize(
("strategy", "strategy_class"),
[
("ddp_spawn", DDPStrategy),
("ddp", DDPStrategy),
("dp", DataParallelStrategy),
pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
],
)
def test_strategy_choice_gpu_str(strategy, strategy_class):
connector = _Connector(strategy=strategy, accelerator="gpu", devices=2)
assert isinstance(connector.strategy, strategy_class)
def test_device_type_when_strategy_instance_cpu_passed():
connector = _Connector(strategy=DDPStrategy(), accelerator="cpu", devices=2)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.accelerator, CPUAccelerator)
@RunIf(min_cuda_gpus=2)
def test_device_type_when_strategy_instance_gpu_passed():
connector = _Connector(strategy=DDPStrategy(), accelerator="gpu", devices=2)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.accelerator, CUDAAccelerator)
@pytest.mark.parametrize("precision", [1, 12, "invalid"])
def test_validate_precision_type(precision):
with pytest.raises(ValueError, match=f"Precision {repr(precision)} is invalid"):
_Connector(precision=precision)
@pytest.mark.parametrize(
("precision", "expected_precision", "should_warn"),
[
(16, "16-mixed", True),
("16", "16-mixed", True),
("16-mixed", "16-mixed", False),
("bf16", "bf16-mixed", True),
("bf16-mixed", "bf16-mixed", False),
(32, "32-true", False),
("32", "32-true", False),
("32-true", "32-true", False),
(64, "64-true", False),
("64", "64-true", False),
("64-true", "64-true", False),
],
)
# mock cuda as available to not be limited by dtype and accelerator compatibility - this is tested elsewhere
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_precision_conversion(patch1, patch2, precision, expected_precision, should_warn):
warn_context = pytest.warns if should_warn else no_warning_call
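    # `no_warning_call` (from lightning_utilities) is the counterpart of `pytest.warns`: it fails the test
    # if a warning matching the pattern *is* emitted.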
with warn_context(
UserWarning,
match=(
f"{precision}` is supported for historical reasons but its usage is discouraged. "
f"Please set your precision to {expected_precision} instead!"
),
):
connector = _Connector(precision=precision, accelerator="cuda")
assert connector._precision_input == expected_precision
def test_multi_device_default_strategy():
"""The default strategy when multiple devices are selected is "ddp" with the subprocess launcher."""
connector = _Connector(strategy="auto", accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert connector.strategy._start_method == "popen"
assert isinstance(connector.strategy.launcher, _SubprocessScriptLauncher)
def test_strategy_choice_ddp_spawn_cpu():
connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
assert connector.strategy._start_method == "spawn"
assert connector.strategy.launcher._start_method == "spawn"
@RunIf(skip_windows=True)
@mock.patch("lightning.fabric.connector._IS_INTERACTIVE", True)
def test_strategy_choice_ddp_fork_in_interactive():
"""Test that when strategy is unspecified, the connector chooses DDP Fork in interactive environments by
default."""
connector = _Connector(accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
assert connector.strategy._start_method == "fork"
assert connector.strategy.launcher._start_method == "fork"
@RunIf(skip_windows=True)
def test_strategy_choice_ddp_fork_cpu():
connector = _Connector(strategy="ddp_fork", accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
assert connector.strategy._start_method == "fork"
assert connector.strategy.launcher._start_method == "fork"
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp(*_):
connector = _Connector(strategy="ddp", accelerator="gpu", devices=1)
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_spawn(*_):
connector = _Connector(strategy="ddp_spawn", accelerator="gpu", devices=1)
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@pytest.mark.parametrize(
("job_name", "expected_env"), [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)]
)
@pytest.mark.parametrize("strategy", ["auto", "ddp", DDPStrategy])
def test_strategy_choice_ddp_slurm(_, strategy, job_name, expected_env):
if strategy and not isinstance(strategy, str):
strategy = strategy()
with mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": job_name,
"SLURM_NODEID": "0",
"SLURM_PROCID": "1",
"SLURM_LOCALID": "1",
},
):
connector = _Connector(strategy=strategy, accelerator="cuda", devices=2)
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, expected_env)
@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"WORLD_SIZE": "2",
"LOCAL_WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_RANK": "1",
"GROUP_RANK": "0",
"TORCHELASTIC_RUN_ID": "1",
},
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_torchelastic(*_):
connector = _Connector(accelerator="gpu", devices=2)
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment)
assert connector.strategy.cluster_environment.local_rank() == 1
assert connector.strategy.local_rank == 1
@mock.patch.dict(
os.environ,
{
"TORCHELASTIC_RUN_ID": "1",
"SLURM_NTASKS": "2",
"WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_RANK": "1",
},
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_torchelastic_priority_over_slurm(*_):
"""Test that the TorchElastic cluster environment is chosen over SLURM when both are detected."""
assert TorchElasticEnvironment.detect()
assert SLURMEnvironment.detect()
connector = _Connector(strategy="ddp")
assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment)
@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0",
"KUBERNETES_PORT": "tcp://127.0.0.1:443",
"MASTER_ADDR": "1.2.3.4",
"MASTER_PORT": "500",
"WORLD_SIZE": "20",
"RANK": "1",
},
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_kubeflow(*_):
connector = _Connector(accelerator="gpu", devices=2, plugins=KubeflowEnvironment())
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
assert connector.strategy.cluster_environment.local_rank() == 0
assert connector.strategy.local_rank == 0
@mock.patch.dict(
os.environ,
{
"KUBERNETES_PORT": "tcp://127.0.0.1:443",
"MASTER_ADDR": "1.2.3.4",
"MASTER_PORT": "500",
"WORLD_SIZE": "20",
"RANK": "1",
},
)
def test_strategy_choice_ddp_cpu_kubeflow():
connector = _Connector(accelerator="cpu", devices=2, plugins=KubeflowEnvironment())
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
assert connector.strategy.cluster_environment.local_rank() == 0
assert connector.strategy.local_rank == 0
@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_NTASKS_PER_NODE": "1",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
)
@pytest.mark.parametrize("strategy", ["auto", "ddp", DDPStrategy()])
def test_strategy_choice_ddp_cpu_slurm(strategy):
connector = _Connector(strategy=strategy, accelerator="cpu", devices=2)
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert isinstance(connector.strategy.cluster_environment, SLURMEnvironment)
assert connector.strategy.local_rank == 0
@mock.patch.dict(os.environ, {}, clear=True)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_unsupported_tpu_choice(_, tpu_available):
# if user didn't set strategy, _Connector will choose the SingleDeviceXLAStrategy or XLAStrategy
with pytest.raises(ValueError, match="XLAAccelerator` can only be used with a `SingleDeviceXLAStrategy`"):
_Connector(accelerator="tpu", precision="16-true", strategy="ddp")
# wrong precision plugin type
with pytest.raises(TypeError, match="can only work with the `XLAPrecision` plugin"):
XLAStrategy(accelerator=XLAAccelerator(), precision=Precision())
# wrong strategy type
strategy = DDPStrategy(accelerator=XLAAccelerator(), precision=XLAPrecision(precision="16-true"))
with pytest.raises(ValueError, match="XLAAccelerator` can only be used with a `SingleDeviceXLAStrategy`"):
_Connector(strategy=strategy)
@RunIf(skip_windows=True)
def test_connector_with_tpu_accelerator_instance(tpu_available, monkeypatch):
monkeypatch.setattr(torch, "device", DeviceMock())
accelerator = XLAAccelerator()
connector = _Connector(accelerator=accelerator, devices=1)
assert connector.accelerator is accelerator
assert isinstance(connector.strategy, SingleDeviceXLAStrategy)
connector = _Connector(accelerator=accelerator)
assert connector.accelerator is accelerator
assert isinstance(connector.strategy, XLAStrategy)
@RunIf(mps=True)
def test_devices_auto_choice_mps():
connector = _Connector(accelerator="auto", devices="auto")
assert isinstance(connector.accelerator, MPSAccelerator)
assert isinstance(connector.strategy, SingleDeviceStrategy)
assert connector.strategy.root_device == torch.device("mps", 0)
assert connector._parallel_devices == [torch.device("mps", 0)]
@pytest.mark.parametrize(
("parallel_devices", "accelerator"),
[([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], "tpu")],
)
def test_parallel_devices_in_strategy_conflict_with_accelerator(parallel_devices, accelerator):
with pytest.raises(ValueError, match=r"parallel_devices set through"):
_Connector(strategy=DDPStrategy(parallel_devices=parallel_devices), accelerator=accelerator)
@pytest.mark.parametrize(
("plugins", "expected"),
[
([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
(
[Precision(), DoublePrecision(), LightningEnvironment(), SLURMEnvironment()],
"Precision, ClusterEnvironment",
),
],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
with pytest.raises(ValueError, match=f"Received multiple values for {expected}"):
_Connector(plugins=plugins)
@pytest.mark.parametrize("accelerator", ["cpu", "cuda", "mps", "tpu"])
@pytest.mark.parametrize("devices", ["0", 0, []])
def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices):
with pytest.raises(ValueError, match="value is not a valid input using"):
_Connector(accelerator=accelerator, devices=devices)
@pytest.mark.parametrize(
("expected_accelerator_flag", "expected_accelerator_class"),
[
pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)),
pytest.param("mps", MPSAccelerator, marks=RunIf(mps=True)),
],
)
def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class):
connector = _Connector(accelerator="gpu")
assert connector._accelerator_flag == expected_accelerator_flag
assert isinstance(connector.accelerator, expected_accelerator_class)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1)
def test_gpu_accelerator_backend_choice_cuda(*_):
connector = _Connector(accelerator="gpu")
assert connector._accelerator_flag == "cuda"
assert isinstance(connector.accelerator, CUDAAccelerator)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=True)
@mock.patch("lightning.fabric.accelerators.mps._get_all_available_mps_gpus", return_value=[0])
@mock.patch("torch.device", DeviceMock)
def test_gpu_accelerator_backend_choice_mps(*_):
connector = _Connector(accelerator="gpu")
assert connector._accelerator_flag == "mps"
assert isinstance(connector.accelerator, MPSAccelerator)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
@mock.patch("lightning.fabric.accelerators.cuda.CUDAAccelerator.is_available", return_value=False)
def test_gpu_accelerator_no_gpu_backend_found_error(*_):
with pytest.raises(RuntimeError, match="No supported gpu backend found!"):
_Connector(accelerator="gpu")
@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
@mock.patch(
"lightning.fabric.connector.torch.multiprocessing.get_all_start_methods",
return_value=[],
)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_ddp_fork_on_unsupported_platform(_, __, strategy):
with pytest.raises(ValueError, match="process forking is not supported on this platform"):
_Connector(strategy=strategy)
@pytest.mark.parametrize(
("precision_str", "strategy_str", "expected_precision_cls"),
[
("64-true", "auto", DoublePrecision),
("32-true", "auto", Precision),
("16-true", "auto", HalfPrecision),
("bf16-true", "auto", HalfPrecision),
("16-mixed", "auto", MixedPrecision),
("bf16-mixed", "auto", MixedPrecision),
pytest.param("32-true", "fsdp", FSDPPrecision, marks=RunIf(min_cuda_gpus=1)),
pytest.param("16-true", "fsdp", FSDPPrecision, marks=RunIf(min_cuda_gpus=1)),
pytest.param("bf16-true", "fsdp", FSDPPrecision, marks=RunIf(min_cuda_gpus=1)),
pytest.param("16-mixed", "fsdp", FSDPPrecision, marks=RunIf(min_cuda_gpus=1)),
pytest.param("bf16-mixed", "fsdp", FSDPPrecision, marks=RunIf(min_cuda_gpus=1)),
pytest.param("32-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
pytest.param("16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
pytest.param("bf16-true", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
pytest.param("16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
pytest.param("bf16-mixed", "deepspeed", DeepSpeedPrecision, marks=RunIf(deepspeed=True, mps=False)),
],
)
def test_precision_selection(precision_str, strategy_str, expected_precision_cls):
connector = _Connector(precision=precision_str, strategy=strategy_str)
assert isinstance(connector.precision, expected_precision_cls)
def test_precision_selection_16_on_cpu_warns():
with pytest.warns(
UserWarning,
match=r"precision='16-mixed'\)` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'",
):
_Connector(accelerator="cpu", precision="16-mixed")
class MyAMP(MixedPrecision):
pass
@RunIf(mps=False)
@pytest.mark.parametrize(("strategy", "devices"), [("ddp", 2), ("ddp_spawn", 2)])
@pytest.mark.parametrize(
("is_custom_plugin", "plugin_cls"),
[(False, MixedPrecision), (True, MyAMP)],
)
def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin_cls):
plugin = None
precision = None
if is_custom_plugin:
plugin = plugin_cls("16-mixed", "cpu")
else:
precision = "16-mixed"
connector = _Connector(
accelerator="cpu",
precision=precision,
devices=devices,
strategy=strategy,
plugins=plugin,
)
assert isinstance(connector.precision, plugin_cls)
@RunIf(min_torch="2.4")
@pytest.mark.parametrize(
("precision", "raises"),
[("32-true", False), ("16-true", False), ("bf16-true", False), ("16-mixed", True), ("bf16-mixed", False)],
)
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_precision_selection_model_parallel(_, precision, raises):
error_context = pytest.raises(ValueError, match=f"does not support .*{precision}") if raises else nullcontext()
with error_context:
_Connector(precision=precision, strategy=ModelParallelStrategy(lambda x, _: x))
def test_bitsandbytes_precision_cuda_required(monkeypatch):
monkeypatch.setattr(lightning.fabric.plugins.precision.bitsandbytes, "_BITSANDBYTES_AVAILABLE", True)
monkeypatch.setitem(sys.modules, "bitsandbytes", Mock())
with pytest.raises(RuntimeError, match="Bitsandbytes is only supported on CUDA GPUs"):
_Connector(accelerator="cpu", plugins=BitsandbytesPrecision(mode="int8"))
@pytest.mark.parametrize(("strategy", "strategy_cls"), [("DDP", DDPStrategy), ("Ddp", DDPStrategy)])
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_str_passed_being_case_insensitive(_, strategy, strategy_cls):
connector = _Connector(strategy=strategy)
assert isinstance(connector.strategy, strategy_cls)
@pytest.mark.parametrize(
("precision", "expected"),
[
(None, Precision),
("64-true", DoublePrecision),
("32-true", Precision),
("16-true", HalfPrecision),
("16-mixed", MixedPrecision),
],
)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1)
def test_precision_from_environment(_, precision, expected):
"""Test that the precision input can be set through the environment variable."""
env_vars = {"LT_CLI_USED": "1"}
if precision is not None:
env_vars["LT_PRECISION"] = precision
with mock.patch.dict(os.environ, env_vars):
connector = _Connector(accelerator="cuda") # need to use cuda, because AMP not available on CPU
assert isinstance(connector.precision, expected)
@pytest.mark.parametrize(
("accelerator", "strategy", "expected_accelerator", "expected_strategy"),
[
(None, None, CPUAccelerator, SingleDeviceStrategy),
("cpu", None, CPUAccelerator, SingleDeviceStrategy),
("cpu", "ddp", CPUAccelerator, DDPStrategy),
pytest.param("mps", None, MPSAccelerator, SingleDeviceStrategy, marks=RunIf(mps=True)),
pytest.param("cuda", "dp", CUDAAccelerator, DataParallelStrategy, marks=RunIf(min_cuda_gpus=1)),
pytest.param(
"cuda", "deepspeed", CUDAAccelerator, DeepSpeedStrategy, marks=RunIf(min_cuda_gpus=1, deepspeed=True)
),
],
)
def test_accelerator_strategy_from_environment(accelerator, strategy, expected_accelerator, expected_strategy):
"""Test that the accelerator and strategy input can be set through the environment variables."""
env_vars = {"LT_CLI_USED": "1"}
if accelerator is not None:
env_vars["LT_ACCELERATOR"] = accelerator
if strategy is not None:
env_vars["LT_STRATEGY"] = strategy
with mock.patch.dict(os.environ, env_vars):
connector = _Connector(accelerator="cpu" if accelerator is None else "auto")
assert isinstance(connector.accelerator, expected_accelerator)
assert isinstance(connector.strategy, expected_strategy)
@mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=8)
def test_devices_from_environment(*_):
"""Test that the devices and number of nodes can be set through the environment variables."""
with mock.patch.dict(os.environ, {"LT_DEVICES": "2", "LT_NUM_NODES": "3", "LT_CLI_USED": "1"}):
connector = _Connector(accelerator="cuda")
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, DDPStrategy)
assert len(connector._parallel_devices) == 2
assert connector._num_nodes_flag == 3
def test_arguments_from_environment_collision():
"""Test that the connector raises an error when the CLI settings conflict with settings in the code."""
# Do not raise an error about collisions unless the CLI was used
with mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}):
_Connector(accelerator="cuda")
with mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu", "LT_CLI_USED": "1"}), pytest.raises(
ValueError, match="`Fabric\\(accelerator='cuda', ...\\)` but .* `--accelerator=cpu`"
):
_Connector(accelerator="cuda")
with mock.patch.dict(os.environ, {"LT_STRATEGY": "ddp", "LT_CLI_USED": "1"}), pytest.raises(
ValueError, match="`Fabric\\(strategy='ddp_spawn', ...\\)` but .* `--strategy=ddp`"
):
_Connector(strategy="ddp_spawn")
with mock.patch.dict(os.environ, {"LT_DEVICES": "2", "LT_CLI_USED": "1"}), pytest.raises(
ValueError, match="`Fabric\\(devices=3, ...\\)` but .* `--devices=2`"
):
_Connector(devices=3)
with mock.patch.dict(os.environ, {"LT_NUM_NODES": "3", "LT_CLI_USED": "1"}), pytest.raises(
ValueError, match="`Fabric\\(num_nodes=2, ...\\)` but .* `--num_nodes=3`"
):
_Connector(num_nodes=2)
with mock.patch.dict(os.environ, {"LT_PRECISION": "16-mixed", "LT_CLI_USED": "1"}), pytest.raises(
ValueError, match="`Fabric\\(precision='64-true', ...\\)` but .* `--precision=16-mixed`"
):
_Connector(precision="64-true")
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_fsdp_unsupported_on_cpu(_):
"""Test that we raise an error if attempting to run FSDP without GPU."""
with pytest.raises(ValueError, match="You selected the FSDP strategy but FSDP is only available on GPU"):
_Connector(accelerator="cpu", strategy="fsdp")
class FSDPStrategySubclass(FSDPStrategy):
pass
class AcceleratorSubclass(CPUAccelerator):
pass
# we allow subclasses of FSDPStrategy to be used with other accelerators
_Connector(accelerator="cpu", strategy=FSDPStrategySubclass())
_Connector(accelerator=AcceleratorSubclass(), strategy=FSDPStrategySubclass())
def test_connector_defaults_match_fabric_defaults():
"""Test that the default values for the init arguments of Connector match the ones in Fabric."""
def get_defaults(cls):
init_signature = inspect.signature(cls)
return {k: v.default for k, v in init_signature.parameters.items()}
fabric_defaults = get_defaults(Fabric)
connector_defaults = get_defaults(_Connector)
# defaults should match on the intersection of argument names
for name, connector_default in connector_defaults.items():
assert connector_default == fabric_defaults[name]
@pytest.mark.parametrize("is_interactive", [False, True])
@RunIf(min_python="3.9") # mocking issue
def test_connector_auto_selection(monkeypatch, is_interactive):
no_cuda = mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=0)
single_cuda = mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=1)
multi_cuda = mock.patch("lightning.fabric.accelerators.cuda.num_cuda_devices", return_value=4)
no_mps = mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
single_mps = mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=True)
def _mock_interactive():
monkeypatch.setattr(lightning.fabric.utilities.imports, "_IS_INTERACTIVE", is_interactive)
monkeypatch.setattr(lightning.fabric.connector, "_IS_INTERACTIVE", is_interactive)
if _IS_WINDOWS:
# simulate fork support on windows
monkeypatch.setattr(torch.multiprocessing, "get_all_start_methods", lambda: ["fork", "spawn"])
_mock_interactive()
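
    # Each patcher defined above fakes one hardware configuration; the blocks below combine them to cover
    # CPU-only, single/multi CUDA, MPS, and (via `mock_tpu_available`) TPU auto-selection.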
# CPU
with no_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, False)
connector = _Connector()
assert isinstance(connector.accelerator, CPUAccelerator)
assert isinstance(connector.strategy, SingleDeviceStrategy)
assert connector._devices_flag == 1
# single CUDA
with single_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, False)
connector = _Connector()
assert isinstance(connector.accelerator, CUDAAccelerator)
assert isinstance(connector.strategy, SingleDeviceStrategy)
assert connector._devices_flag == [0]
# multi CUDA
with multi_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, False)
connector = _Connector()
assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, (SingleDeviceStrategy if is_interactive else DDPStrategy))
    assert connector._devices_flag == ([0] if is_interactive else list(range(4)))
    if not is_interactive:
        assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
        assert connector.strategy._start_method == ("fork" if is_interactive else "popen")
        assert connector.strategy.launcher.is_interactive_compatible == is_interactive
# MPS (there's no distributed)
with no_cuda, single_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, False)
connector = _Connector()
assert isinstance(connector.accelerator, MPSAccelerator)
assert isinstance(connector.strategy, SingleDeviceStrategy)
assert connector._devices_flag == [0]
# single TPU
with no_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, True)
monkeypatch.setattr(lightning.fabric.accelerators.XLAAccelerator, "auto_device_count", lambda *_: 1)
monkeypatch.setattr(torch, "device", DeviceMock())
connector = _Connector()
assert isinstance(connector.accelerator, XLAAccelerator)
assert isinstance(connector.strategy, SingleDeviceXLAStrategy)
assert connector._devices_flag == 1
monkeypatch.undo() # for some reason `.context()` is not working properly
_mock_interactive()
# Multi TPU
with no_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, True)
connector = _Connector()
assert isinstance(connector.accelerator, XLAAccelerator)
assert isinstance(connector.strategy, XLAStrategy)
assert connector._devices_flag == 8
assert isinstance(connector.strategy.cluster_environment, XLAEnvironment)
assert connector.strategy.launcher._start_method == "fork"
assert connector.strategy.launcher.is_interactive_compatible
# TPU and CUDA: prefers TPU
with multi_cuda, no_mps, monkeypatch.context():
mock_tpu_available(monkeypatch, True)
connector = _Connector()
assert isinstance(connector.accelerator, XLAAccelerator)
assert isinstance(connector.strategy, XLAStrategy)
assert connector._devices_flag == 8
assert isinstance(connector.strategy.cluster_environment, XLAEnvironment)
assert connector.strategy.launcher._start_method == "fork"
assert connector.strategy.launcher.is_interactive_compatible
@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_xla_fsdp_automatic_strategy_selection(_, tpu_available):
import lightning.fabric.strategies as strategies
added_fsdp = False
# manually register fsdp for when torch.distributed.is_initialized() != True
if "fsdp" not in strategies.STRATEGY_REGISTRY.available_strategies():
strategies.STRATEGY_REGISTRY.register("fsdp", FSDPStrategy)
added_fsdp = True
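
    # With an XLA accelerator, both the generic "fsdp" alias and the explicit "xla_fsdp" name should resolve
    # to `XLAFSDPStrategy`.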
connector = _Connector(accelerator="tpu", strategy="fsdp")
assert isinstance(connector.strategy, XLAFSDPStrategy)
connector = _Connector(accelerator="tpu", strategy="xla_fsdp")
assert isinstance(connector.strategy, XLAFSDPStrategy)
connector = _Connector(accelerator="auto", strategy="fsdp")
assert isinstance(connector.strategy, XLAFSDPStrategy)
connector = _Connector(accelerator="auto", strategy="xla_fsdp")
assert isinstance(connector.strategy, XLAFSDPStrategy)
if added_fsdp:
strategies.STRATEGY_REGISTRY.pop("fsdp")