# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import os
from re import escape
from typing import Any, Dict
from unittest import mock

import pytest
import torch
import torch.distributed
from tests_fabric.helpers.runif import RunIf

import lightning_fabric
from lightning_fabric.accelerators import TPUAccelerator
from lightning_fabric.accelerators.accelerator import Accelerator
from lightning_fabric.accelerators.cpu import CPUAccelerator
from lightning_fabric.accelerators.cuda import CUDAAccelerator
from lightning_fabric.accelerators.mps import MPSAccelerator
from lightning_fabric.connector import _Connector
from lightning_fabric.plugins import DoublePrecision, MixedPrecision, Precision, TPUPrecision
from lightning_fabric.plugins.environments import (
    KubeflowEnvironment,
    LightningEnvironment,
    LSFEnvironment,
    SLURMEnvironment,
    TorchElasticEnvironment,
)
from lightning_fabric.plugins.io import TorchCheckpointIO
from lightning_fabric.strategies import (
    DataParallelStrategy,
    DDPShardedStrategy,
    DDPStrategy,
    DeepSpeedStrategy,
    SingleDeviceStrategy,
    SingleTPUStrategy,
    XLAStrategy,
)
from lightning_fabric.strategies.ddp import _DDP_FORK_ALIASES
from lightning_fabric.utilities.exceptions import MisconfigurationException


def test_accelerator_choice_cpu():
    """Test that a connector built without arguments uses the CPU accelerator and a single-device strategy."""
    connector = _Connector()
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, SingleDeviceStrategy)


@RunIf(tpu=True, standalone=True)
@pytest.mark.parametrize(
    ["accelerator", "devices"], [("tpu", None), ("tpu", 1), ("tpu", [1]), ("tpu", 8), ("auto", 1), ("auto", 8)]
)
@mock.patch.dict(os.environ, os.environ.copy(), clear=True)
def test_accelerator_choice_tpu(accelerator, devices):
    """Test which TPU strategy the connector picks for the parametrized accelerator/devices combinations."""
    connector = _Connector(accelerator=accelerator, devices=devices)
    assert isinstance(connector.accelerator, TPUAccelerator)
    if devices is None or (isinstance(devices, int) and devices > 1):
        # accelerator=tpu, devices=None (default) maps to devices=auto (8) and then chooses XLAStrategy
        # This behavior may change in the future: https://github.com/Lightning-AI/lightning/issues/10606
        assert isinstance(connector.strategy, XLAStrategy)
    else:
        assert isinstance(connector.strategy, SingleTPUStrategy)


@RunIf(skip_windows=True, standalone=True)
def test_strategy_choice_ddp_on_cpu():
    """Test that selecting DDPStrategy on CPU works."""
    _test_strategy_choice_ddp_and_cpu(ddp_strategy_class=DDPStrategy)


def _test_strategy_choice_ddp_and_cpu(ddp_strategy_class):
    """Build a connector from an instance of ``ddp_strategy_class`` on two CPU devices and check the result.

    Args:
        ddp_strategy_class: The strategy class to instantiate (without arguments) and pass to the connector.
    """
    connector = _Connector(
        strategy=ddp_strategy_class(),
        accelerator="cpu",
        devices=2,
    )
    assert isinstance(connector.strategy, ddp_strategy_class)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert connector.strategy.num_processes == 2
    assert connector.strategy.parallel_devices == [torch.device("cpu")] * 2


@mock.patch.dict(
    os.environ,
    {
        "SLURM_NTASKS": "2",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "LOCAL_RANK": "0",
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
    },
)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_custom_cluster_environment_in_slurm_environment(_):
    """Test that we choose the custom cluster even when SLURM or TE flags are around."""

    class CustomCluster(LightningEnvironment):
        @property
        def main_address(self):
            return "asdf"

        @property
        def creates_processes_externally(self) -> bool:
            return True

    connector = _Connector(
        plugins=[CustomCluster()],
        accelerator="cpu",
        strategy="ddp",
        devices=2,
    )
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, CustomCluster)


@mock.patch.dict(
    os.environ,
    {
        "SLURM_NTASKS": "2",
        "SLURM_NTASKS_PER_NODE": "1",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "LOCAL_RANK": "0",
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
    },
)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_custom_accelerator(*_):
    """Test that accelerator/precision objects attached to a strategy instance are the ones the connector exposes."""

    class Accel(Accelerator):
        def setup_device(self, device: torch.device) -> None:
            pass

        def get_device_stats(self, device: torch.device) -> Dict[str, Any]:
            pass

        def teardown(self) -> None:
            pass

        @staticmethod
        def parse_devices(devices):
            return devices

        @staticmethod
        def get_parallel_devices(devices):
            return [torch.device("cpu")] * devices

        @staticmethod
        def auto_device_count() -> int:
            return 1

        @staticmethod
        def is_available() -> bool:
            return True

        @staticmethod
        def name() -> str:
            return "custom_acc_name"

    class Prec(Precision):
        pass

    # First scenario: a custom single-device strategy carrying the custom accelerator and precision
    class Strat(SingleDeviceStrategy):
        pass

    strategy = Strat(device=torch.device("cpu"), accelerator=Accel(), precision=Prec())
    connector = _Connector(strategy=strategy, devices=2)
    assert isinstance(connector.accelerator, Accel)
    assert isinstance(connector.strategy, Strat)
    assert isinstance(connector.precision, Prec)
    assert connector.strategy is strategy

    # Second scenario: the same checks with a custom DDP strategy (the name `Strat` is deliberately rebound)
    class Strat(DDPStrategy):
        pass

    strategy = Strat(accelerator=Accel(), precision=Prec())
    connector = _Connector(strategy=strategy, devices=2)
    assert isinstance(connector.accelerator, Accel)
    assert isinstance(connector.strategy, Strat)
    assert isinstance(connector.precision, Prec)
    assert connector.strategy is strategy


@pytest.mark.parametrize(
    "env_vars,expected_environment",
    [
        (
            {
                "SLURM_NTASKS": "2",
                "SLURM_NTASKS_PER_NODE": "1",
                "SLURM_JOB_NAME": "SOME_NAME",
                "SLURM_NODEID": "0",
                "LOCAL_RANK": "0",
                "SLURM_PROCID": "0",
                "SLURM_LOCALID": "0",
            },
            SLURMEnvironment,
        ),
        (
            {
                "LSB_JOBID": "1",
                "LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
                "JSM_NAMESPACE_LOCAL_RANK": "1",
                "JSM_NAMESPACE_SIZE": "20",
                "JSM_NAMESPACE_RANK": "1",
            },
            LSFEnvironment,
        ),
    ],
)
@mock.patch("lightning_fabric.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
@mock.patch("lightning_fabric.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
    """Test that ``strategy="ddp_spawn"`` yields a ``DDPStrategy`` with the expected cluster environment."""
    with mock.patch.dict(os.environ, env_vars, clear=True):
        connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
        assert isinstance(connector.accelerator, CPUAccelerator)
        assert isinstance(connector.strategy, DDPStrategy)
        assert isinstance(connector.strategy.cluster_environment, expected_environment)


@RunIf(mps=False)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_interactive_incompatible_backend_error(_, monkeypatch):
    """Test that strategies which are not interactive-compatible raise when the interactive flag is set."""
    monkeypatch.setattr(lightning_fabric.connector, "_IS_INTERACTIVE", True)
    with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"):
        _Connector(strategy="ddp", accelerator="gpu", devices=2)

    with pytest.raises(RuntimeError, match=r"strategy='ddp_spawn'\)`.*is not compatible"):
        _Connector(strategy="ddp_spawn", accelerator="gpu", devices=2)

    with pytest.raises(RuntimeError, match=r"strategy='ddp_sharded_spawn'\)`.*is not compatible"):
        _Connector(strategy="ddp_sharded_spawn", accelerator="gpu", devices=2)

    with pytest.raises(RuntimeError, match=r"strategy='ddp'\)`.*is not compatible"):
        # Edge case: _Connector maps dp to ddp if accelerator != gpu
        _Connector(strategy="dp")


@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_interactive_compatible_dp_strategy_gpu(_, monkeypatch):
    """Test that the ``dp`` strategy on GPU builds without error and has no launcher."""
    # NOTE(review): other tests in this file patch `lightning_fabric.connector._IS_INTERACTIVE`; confirm that
    # patching `lightning_fabric.utilities.imports` here actually reaches the flag the connector reads.
    monkeypatch.setattr(lightning_fabric.utilities.imports, "_IS_INTERACTIVE", True)
    connector = _Connector(strategy="dp", accelerator="gpu")
    assert connector.strategy.launcher is None


@RunIf(skip_windows=True)
def test_interactive_compatible_strategy_tpu(tpu_available, monkeypatch):
    """Test that the strategy chosen for ``accelerator="tpu"`` has an interactive-compatible launcher."""
    # NOTE(review): other tests in this file patch `lightning_fabric.connector._IS_INTERACTIVE`; confirm that
    # patching `lightning_fabric.utilities.imports` here actually reaches the flag the connector reads.
    monkeypatch.setattr(lightning_fabric.utilities.imports, "_IS_INTERACTIVE", True)
    connector = _Connector(accelerator="tpu")
    assert connector.strategy.launcher.is_interactive_compatible


@RunIf(skip_windows=True)
def test_interactive_compatible_strategy_ddp_fork(monkeypatch):
    """Test that the ``ddp_fork`` strategy on CPU has an interactive-compatible launcher."""
    # NOTE(review): other tests in this file patch `lightning_fabric.connector._IS_INTERACTIVE`; confirm that
    # patching `lightning_fabric.utilities.imports` here actually reaches the flag the connector reads.
    monkeypatch.setattr(lightning_fabric.utilities.imports, "_IS_INTERACTIVE", True)
    connector = _Connector(strategy="ddp_fork", accelerator="cpu")
    assert connector.strategy.launcher.is_interactive_compatible


@RunIf(mps=False)
@pytest.mark.parametrize(
    ["strategy", "strategy_class"],
    [
        ("ddp", DDPStrategy),
        ("ddp_spawn", DDPStrategy),
        ("ddp_sharded", DDPShardedStrategy),
        ("ddp_sharded_spawn", DDPShardedStrategy),
        pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
    ],
)
@pytest.mark.parametrize("devices", [1, 2])
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_strategy_choice_multi_node_gpu(_, strategy, strategy_class, devices):
    """Test that each strategy name maps to the expected strategy class when ``num_nodes=2`` on GPU."""
    connector = _Connector(num_nodes=2, accelerator="gpu", strategy=strategy, devices=devices)
    assert isinstance(connector.strategy, strategy_class)


@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=0)
def test_cuda_accelerator_can_not_run_on_system(_):
    """Test that requesting CUDA raises when zero CUDA devices are reported, while CPU still works."""
    connector = _Connector(accelerator="cpu")
    assert isinstance(connector.accelerator, CPUAccelerator)

    with pytest.raises(
        RuntimeError,
        match="CUDAAccelerator` can not run on your system since the accelerator is not available.",
    ):
        _Connector(accelerator="cuda", devices=1)


@pytest.mark.skipif(TPUAccelerator.is_available(), reason="test requires missing TPU")
@mock.patch("lightning_fabric.accelerators.tpu._XLA_AVAILABLE", True)
def test_tpu_accelerator_can_not_run_on_system():
    """Test that requesting the TPU accelerator raises when no TPU is available."""
    with pytest.raises(RuntimeError, match="TPUAccelerator` can not run on your system"):
        _Connector(accelerator="tpu", devices=8)


@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@pytest.mark.parametrize("device_count", (["0"], [0, "1"], ["GPU"], [["0", "1"], [0, 1]], [False]))
def test_accelererator_invalid_type_devices(_, device_count):
    """Test that malformed ``devices`` sequences are rejected with a ``MisconfigurationException``."""
    with pytest.raises(
        MisconfigurationException, match=r"must be an int, a string, a sequence of ints or None, but you"
    ):
        _Connector(accelerator="gpu", devices=device_count)


@RunIf(min_cuda_gpus=1)
def test_accelerator_gpu():
    """Test that ``"gpu"`` and ``"auto"`` resolve to the CUDA accelerator on a machine with a CUDA GPU."""
    connector = _Connector(accelerator="gpu", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)

    connector = _Connector(accelerator="gpu")
    assert isinstance(connector.accelerator, CUDAAccelerator)

    connector = _Connector(accelerator="auto", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)


@pytest.mark.parametrize(["devices", "strategy_class"], [(1, SingleDeviceStrategy), (5, DDPStrategy)])
def test_accelerator_cpu_with_devices(devices, strategy_class):
    """Test the parallel devices and strategy class chosen for a given CPU device count."""
    connector = _Connector(accelerator="cpu", devices=devices)
    assert connector._parallel_devices == [torch.device("cpu")] * devices
    assert isinstance(connector.strategy, strategy_class)
    assert isinstance(connector.accelerator, CPUAccelerator)


@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize(
    ["devices", "strategy_class"], [(1, SingleDeviceStrategy), ([1], SingleDeviceStrategy), (2, DDPStrategy)]
)
def test_accelerator_gpu_with_devices(devices, strategy_class):
    """Test the number of parallel devices and the strategy class chosen for a given GPU ``devices`` value."""
    connector = _Connector(accelerator="gpu", devices=devices)
    # Compute the expected count first. Writing `assert a == b if cond else c` parses as
    # `assert ((a == b) if cond else c)`, which for integer `devices` only asserts that `devices` is truthy.
    expected_count = len(devices) if isinstance(devices, list) else devices
    assert len(connector._parallel_devices) == expected_count
    assert isinstance(connector.strategy, strategy_class)
    assert isinstance(connector.accelerator, CUDAAccelerator)


@RunIf(min_cuda_gpus=1)
def test_accelerator_auto_with_devices_gpu():
    """Test that ``accelerator="auto"`` with one device selects CUDA device 0."""
    connector = _Connector(accelerator="auto", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert connector._parallel_devices == [torch.device("cuda", 0)]


def test_set_devices_if_none_cpu():
    """Test that ``devices=3`` on CPU produces three CPU parallel devices."""
    connector = _Connector(accelerator="cpu", devices=3)
    assert connector._parallel_devices == [torch.device("cpu")] * 3


def test_unsupported_strategy_types_on_cpu_and_fallback():
    """Test that ``strategy="dp"`` without an accelerator argument warns and results in a ``DDPStrategy``."""
    with pytest.warns(UserWarning, match="is not supported on CPUs, hence setting `strategy='ddp"):
        connector = _Connector(strategy="dp", devices=2)
    assert isinstance(connector.strategy, DDPStrategy)


def test_invalid_accelerator_choice():
    """Test that an unknown accelerator name raises a ``ValueError``."""
    with pytest.raises(ValueError, match="You selected an invalid accelerator name: `accelerator='cocofruit'`"):
        _Connector(accelerator="cocofruit")


def test_invalid_strategy_choice():
    """Test that an unknown strategy name raises a ``ValueError``."""
    with pytest.raises(ValueError, match="You selected an invalid strategy name: `strategy='cocofruit'`"):
        _Connector(strategy="cocofruit")


@pytest.mark.parametrize(
    ["strategy", "strategy_class"],
    [
        ("ddp_spawn", DDPStrategy),
        ("ddp", DDPStrategy),
    ],
)
def test_strategy_choice_cpu_str(strategy, strategy_class):
    """Test that strategy names map to the expected strategy class on CPU."""
    connector = _Connector(strategy=strategy, accelerator="cpu", devices=2)
    assert isinstance(connector.strategy, strategy_class)


@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize(
    ["strategy", "strategy_class"],
    [
        ("ddp_spawn", DDPStrategy),
        ("ddp", DDPStrategy),
        ("dp", DataParallelStrategy),
        ("ddp_sharded", DDPShardedStrategy),
        ("ddp_sharded_spawn", DDPShardedStrategy),
        pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
    ],
)
def test_strategy_choice_gpu_str(strategy, strategy_class):
    """Test that strategy names map to the expected strategy class on GPU."""
    connector = _Connector(strategy=strategy, accelerator="gpu", devices=2)
    assert isinstance(connector.strategy, strategy_class)


@RunIf(fairscale=True)
@pytest.mark.parametrize(
    "strategy,expected_strategy", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPShardedStrategy)]
)
@pytest.mark.parametrize(
    "precision,expected_precision", [(16, MixedPrecision), (32, Precision), ("bf16", MixedPrecision)]
)
def test_strategy_choice_sharded(strategy, expected_strategy, precision, expected_precision):
    """Test the strategy and precision plugin classes chosen for the sharded strategy names."""
    connector = _Connector(strategy=strategy, devices=1, precision=precision)
    assert isinstance(connector.strategy, expected_strategy)
    assert isinstance(connector.precision, expected_precision)


def test_device_type_when_strategy_instance_cpu_passed():
    """Test that a ``DDPStrategy`` instance combined with ``accelerator="cpu"`` yields a CPU accelerator."""
    connector = _Connector(strategy=DDPStrategy(), accelerator="cpu", devices=2)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.accelerator, CPUAccelerator)


@RunIf(min_cuda_gpus=2)
def test_device_type_when_strategy_instance_gpu_passed():
    """Test that a ``DDPStrategy`` instance combined with ``accelerator="gpu"`` yields a CUDA accelerator."""
    connector = _Connector(strategy=DDPStrategy(), accelerator="gpu", devices=2)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.accelerator, CUDAAccelerator)


@pytest.mark.parametrize("precision", [1, 12, "invalid"])
def test_validate_precision_type(precision):
    """Test that unsupported precision values raise a ``ValueError`` naming the offending value."""
    with pytest.raises(ValueError, match=f"Precision {precision!r} is invalid"):
        _Connector(precision=precision)


def test_strategy_choice_ddp_spawn_cpu():
    """Test that ``ddp_spawn`` on CPU yields a ``DDPStrategy`` whose start method is ``"spawn"``."""
    connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
    assert connector.strategy._start_method == "spawn"
    assert connector.strategy.launcher._start_method == "spawn"


@RunIf(skip_windows=True)
@mock.patch("lightning_fabric.connector._IS_INTERACTIVE", True)
def test_strategy_choice_ddp_fork_in_interactive():
    """Test that when accelerator and strategy are unspecified, the connector chooses DDP Fork in interactive
    environments by default."""
    connector = _Connector(devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
    assert connector.strategy._start_method == "fork"
    assert connector.strategy.launcher._start_method == "fork"


@RunIf(skip_windows=True)
def test_strategy_choice_ddp_fork_cpu():
    """Test that ``ddp_fork`` on CPU yields a ``DDPStrategy`` whose start method is ``"fork"``."""
    connector = _Connector(strategy="ddp_fork", accelerator="cpu", devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)
    assert connector.strategy._start_method == "fork"
    assert connector.strategy.launcher._start_method == "fork"


@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp(*_):
    """Test that ``ddp`` on GPU yields a ``DDPStrategy`` with a ``LightningEnvironment``."""
    connector = _Connector(strategy="ddp", accelerator="gpu", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)


@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_spawn(*_):
    """Test that ``ddp_spawn`` on GPU yields a ``DDPStrategy`` with a ``LightningEnvironment``."""
    connector = _Connector(strategy="ddp_spawn", accelerator="gpu", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, LightningEnvironment)


@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@pytest.mark.parametrize("job_name,expected_env", [("some_name", SLURMEnvironment), ("bash", LightningEnvironment)])
@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy])
def test_strategy_choice_ddp_slurm(_, strategy, job_name, expected_env):
    """Test the cluster environment chosen for DDP on GPU under SLURM variables with different job names."""
    if not isinstance(strategy, str):
        # The parametrization passes the class; instantiate it so each test case gets a fresh strategy object
        strategy = strategy()

    with mock.patch.dict(
        os.environ,
        {
            "CUDA_VISIBLE_DEVICES": "0,1",
            "SLURM_NTASKS": "2",
            "SLURM_NTASKS_PER_NODE": "1",
            "SLURM_JOB_NAME": job_name,
            "SLURM_NODEID": "0",
            "SLURM_PROCID": "1",
            "SLURM_LOCALID": "1",
        },
    ):
        connector = _Connector(strategy=strategy, accelerator="cuda", devices=2)
        assert isinstance(connector.accelerator, CUDAAccelerator)
        assert isinstance(connector.strategy, DDPStrategy)
        assert isinstance(connector.strategy.cluster_environment, expected_env)


@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "WORLD_SIZE": "2",
        "LOCAL_WORLD_SIZE": "2",
        "RANK": "1",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "0",
        "TORCHELASTIC_RUN_ID": "1",
    },
)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_te(*_):
    """Test that DDP on GPU under TorchElastic variables uses ``TorchElasticEnvironment`` with local rank 1."""
    connector = _Connector(strategy="ddp", accelerator="gpu", devices=2)
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment)
    assert connector.strategy.cluster_environment.local_rank() == 1
    assert connector.strategy.local_rank == 1


@mock.patch.dict(
    os.environ,
    {
        "WORLD_SIZE": "2",
        "LOCAL_WORLD_SIZE": "2",
        "RANK": "1",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "0",
        "TORCHELASTIC_RUN_ID": "1",
    },
)
def test_strategy_choice_ddp_cpu_te():
    """Test that ``ddp_spawn`` on CPU under TorchElastic variables uses ``TorchElasticEnvironment``."""
    connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, TorchElasticEnvironment)
    assert connector.strategy.cluster_environment.local_rank() == 1
    assert connector.strategy.local_rank == 1


@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
        "MASTER_ADDR": "1.2.3.4",
        "MASTER_PORT": "500",
        "WORLD_SIZE": "20",
        "RANK": "1",
    },
)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=1)
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_strategy_choice_ddp_kubeflow(*_):
    """Test that DDP on GPU under Kubeflow variables uses ``KubeflowEnvironment`` with local rank 0."""
    connector = _Connector(strategy="ddp", accelerator="gpu", devices=1)
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
    assert connector.strategy.cluster_environment.local_rank() == 0
    assert connector.strategy.local_rank == 0


@mock.patch.dict(
    os.environ,
    {
        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
        "MASTER_ADDR": "1.2.3.4",
        "MASTER_PORT": "500",
        "WORLD_SIZE": "20",
        "RANK": "1",
    },
)
def test_strategy_choice_ddp_cpu_kubeflow():
    """Test that ``ddp_spawn`` on CPU under Kubeflow variables uses ``KubeflowEnvironment`` with local rank 0."""
    connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, KubeflowEnvironment)
    assert connector.strategy.cluster_environment.local_rank() == 0
    assert connector.strategy.local_rank == 0


@mock.patch.dict(
    os.environ,
    {
        "SLURM_NTASKS": "2",
        "SLURM_NTASKS_PER_NODE": "1",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "LOCAL_RANK": "0",
        "SLURM_PROCID": "0",
        "SLURM_LOCALID": "0",
    },
)
@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()])
def test_strategy_choice_ddp_cpu_slurm(strategy):
    """Test that DDP on CPU under SLURM variables uses ``SLURMEnvironment`` with local rank 0."""
    connector = _Connector(strategy=strategy, accelerator="cpu", devices=2)
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert isinstance(connector.strategy.cluster_environment, SLURMEnvironment)
    assert connector.strategy.local_rank == 0


@mock.patch.dict(os.environ, {}, clear=True)
def test_unsupported_tpu_choice(tpu_available):
    """Test that unsupported precision, strategy and plugin combinations raise for the TPU accelerator."""
    with pytest.raises(NotImplementedError, match=r"accelerator='tpu', precision=64\)` is not implemented"):
        _Connector(accelerator="tpu", precision=64)

    # if user didn't set strategy, _Connector will choose the SingleTPUStrategy or XLAStrategy
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"), pytest.warns(
        UserWarning, match=r"accelerator='tpu', precision=16\)` but native AMP is not supported"
    ):
        _Connector(accelerator="tpu", precision=16, strategy="ddp")

    # wrong precision plugin type
    strategy = XLAStrategy(accelerator=TPUAccelerator(), precision=Precision())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecision` plugin"):
        _Connector(strategy=strategy, devices=8)

    # wrong strategy type
    strategy = DDPStrategy(accelerator=TPUAccelerator(), precision=TPUPrecision())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"):
        _Connector(strategy=strategy, devices=8)


@mock.patch("lightning_fabric.accelerators.cuda.CUDAAccelerator.is_available", return_value=False)
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
def test_devices_auto_choice_cpu(tpu_available, *_):
    """Test that ``accelerator="auto", devices="auto"`` selects a single CPU device when CUDA and MPS are
    reported unavailable."""
    connector = _Connector(accelerator="auto", devices="auto")
    assert isinstance(connector.accelerator, CPUAccelerator)
    assert isinstance(connector.strategy, SingleDeviceStrategy)
    assert connector.strategy.root_device == torch.device("cpu")


@RunIf(mps=False)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2)
def test_devices_auto_choice_gpu(*_):
    """Test that ``accelerator="auto", devices="auto"`` selects DDP over both devices when two CUDA devices
    are reported."""
    connector = _Connector(accelerator="auto", devices="auto")
    assert isinstance(connector.accelerator, CUDAAccelerator)
    assert isinstance(connector.strategy, DDPStrategy)
    assert len(connector._parallel_devices) == 2


@RunIf(mps=True)
def test_devices_auto_choice_mps():
    """Test that ``accelerator="auto", devices="auto"`` selects the single MPS device on an MPS machine."""
    connector = _Connector(accelerator="auto", devices="auto")
    assert isinstance(connector.accelerator, MPSAccelerator)
    assert isinstance(connector.strategy, SingleDeviceStrategy)
    assert connector.strategy.root_device == torch.device("mps", 0)
    assert connector._parallel_devices == [torch.device("mps", 0)]


@pytest.mark.parametrize(
    ["parallel_devices", "accelerator"],
    [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], "tpu")],
)
def test_parallel_devices_in_strategy_conflict_with_accelerator(parallel_devices, accelerator):
    """Test that parallel devices set on a strategy instance that mismatch the accelerator raise an error."""
    with pytest.raises(ValueError, match=r"parallel_devices set through"):
        _Connector(strategy=DDPStrategy(parallel_devices=parallel_devices), accelerator=accelerator)


@pytest.mark.parametrize(
    ["plugins", "expected"],
    [
        ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
        ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
        (
            [Precision(), DoublePrecision(), LightningEnvironment(), SLURMEnvironment()],
            "Precision, ClusterEnvironment",
        ),
    ],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
    """Test that passing more than one plugin of the same type raises an error naming the duplicated types."""
    with pytest.raises(ValueError, match=f"Received multiple values for {expected}"):
        _Connector(plugins=plugins)


@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "mps", "tpu"))
@pytest.mark.parametrize("devices", ("0", 0, []))
def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices):
    """Test that zero-like ``devices`` values are rejected for every accelerator name."""
    with pytest.raises(ValueError, match="value is not a valid input using"):
        _Connector(accelerator=accelerator, devices=devices)


@pytest.mark.parametrize(
    "expected_accelerator_flag,expected_accelerator_class",
    [
        pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)),
        pytest.param("mps", MPSAccelerator, marks=RunIf(mps=True)),
    ],
)
def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class):
    """Test which backend ``accelerator="gpu"`` resolves to on real CUDA or MPS hardware."""
    connector = _Connector(accelerator="gpu")
    assert connector._accelerator_flag == expected_accelerator_flag
    assert isinstance(connector.accelerator, expected_accelerator_class)


@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=1)
def test_gpu_accelerator_backend_choice_cuda(*_):
    """Test that ``accelerator="gpu"`` resolves to CUDA when MPS is unavailable and one CUDA device is reported."""
    connector = _Connector(accelerator="gpu")
    assert connector._accelerator_flag == "cuda"
    assert isinstance(connector.accelerator, CUDAAccelerator)


@RunIf(min_torch="1.12")
@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=True)
@mock.patch("lightning_fabric.accelerators.mps._get_all_available_mps_gpus", return_value=[0])
def test_gpu_accelerator_backend_choice_mps(*_):
    """Test that ``accelerator="gpu"`` resolves to MPS when MPS is reported available."""
    connector = _Connector(accelerator="gpu")
    assert connector._accelerator_flag == "mps"
    assert isinstance(connector.accelerator, MPSAccelerator)


@mock.patch("lightning_fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False)
@mock.patch("lightning_fabric.accelerators.cuda.CUDAAccelerator.is_available", return_value=False)
def test_gpu_accelerator_no_gpu_backend_found_error(*_):
    """Test that ``accelerator="gpu"`` raises when neither CUDA nor MPS is reported available."""
    with pytest.raises(RuntimeError, match="No supported gpu backend found!"):
        _Connector(accelerator="gpu")


@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
@mock.patch(
    "lightning_fabric.connector.torch.multiprocessing.get_all_start_methods",
    return_value=[],
)
def test_ddp_fork_on_unsupported_platform(_, strategy):
    """Test that the fork-based DDP aliases raise when no multiprocessing start methods are reported."""
    with pytest.raises(ValueError, match="process forking is not supported on this platform"):
        _Connector(strategy=strategy)


def test_precision_selection_16_on_cpu_warns():
    """Test that ``precision=16`` without an accelerator argument warns about falling back to bf16."""
    with pytest.warns(
        UserWarning, match=r"precision=16\)` but native AMP is not supported on CPU. Using `precision='bf16"
    ):
        _Connector(precision=16)


class MyNativeAMP(MixedPrecision):
    """Trivial ``MixedPrecision`` subclass used to check that custom precision plugins are kept."""

    pass


@RunIf(mps=False)
@pytest.mark.parametrize("strategy,devices", [("ddp", 2), ("ddp_spawn", 2)])
@pytest.mark.parametrize(
    "is_custom_plugin,plugin_cls",
    [(False, MixedPrecision), (True, MyNativeAMP)],
)
def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin_cls):
    """Test the precision plugin class selected for ``precision=16`` with and without a custom plugin."""
    plugin = None
    if is_custom_plugin:
        plugin = plugin_cls(16, "cpu")
    connector = _Connector(
        precision=16,
        devices=devices,
        strategy=strategy,
        plugins=plugin,
    )
    assert isinstance(connector.precision, plugin_cls)


@pytest.mark.parametrize(["strategy", "strategy_cls"], [("DDP", DDPStrategy), ("Ddp", DDPStrategy)])
def test_strategy_str_passed_being_case_insensitive(strategy, strategy_cls):
    """Test that strategy names are accepted regardless of letter case."""
    connector = _Connector(strategy=strategy)
    assert isinstance(connector.strategy, strategy_cls)


@pytest.mark.parametrize("precision", ["64", "32", "16", pytest.param("bf16", marks=RunIf(min_torch="1.10"))])
@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=1)
def test_precision_from_environment(_, precision):
    """Test that the precision input can be set through the environment variable."""
    with mock.patch.dict(os.environ, {"LT_PRECISION": precision}):
        connector = _Connector(accelerator="cuda")  # need to use cuda, because AMP not available on CPU
        # NOTE(review): this only checks against the base `Precision` class, so it presumably cannot tell the
        # parametrized values apart - confirm whether a per-value expected class should be asserted instead.
        assert isinstance(connector.precision, Precision)


@pytest.mark.parametrize(
    "accelerator, strategy, expected_accelerator, expected_strategy",
    [
        ("cpu", None, CPUAccelerator, SingleDeviceStrategy),
        ("cpu", "ddp", CPUAccelerator, DDPStrategy),
        pytest.param("mps", None, MPSAccelerator, SingleDeviceStrategy, marks=RunIf(mps=True)),
        pytest.param("cuda", "dp", CUDAAccelerator, DataParallelStrategy, marks=RunIf(min_cuda_gpus=1)),
        pytest.param(
            "cuda", "deepspeed", CUDAAccelerator, DeepSpeedStrategy, marks=RunIf(min_cuda_gpus=1, deepspeed=True)
        ),
    ],
)
def test_accelerator_strategy_from_environment(accelerator, strategy, expected_accelerator, expected_strategy):
    """Test that the accelerator and strategy input can be set through the environment variables."""
    env_vars = {"LT_ACCELERATOR": accelerator}
    if strategy is not None:
        env_vars["LT_STRATEGY"] = strategy

    with mock.patch.dict(os.environ, env_vars):
        connector = _Connector()
        assert isinstance(connector.accelerator, expected_accelerator)
        assert isinstance(connector.strategy, expected_strategy)


@mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=8)
def test_devices_from_environment(*_):
    """Test that the devices and number of nodes can be set through the environment variables."""
    with mock.patch.dict(os.environ, {"LT_DEVICES": "2", "LT_NUM_NODES": "3"}):
        connector = _Connector(accelerator="cuda")
        assert isinstance(connector.accelerator, CUDAAccelerator)
        assert isinstance(connector.strategy, DDPStrategy)
        assert len(connector._parallel_devices) == 2
        assert connector._num_nodes_flag == 3


def test_arguments_from_environment_collision():
    """Test that the connector raises an error when the CLI settings conflict with settings in the code."""
    with mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}):
        with pytest.raises(
            ValueError, match=escape("Your code has `Fabric(accelerator='cuda', ...)` but it conflicts")
        ):
            _Connector(accelerator="cuda")

    with mock.patch.dict(os.environ, {"LT_STRATEGY": "ddp"}):
        with pytest.raises(
            ValueError, match=escape("Your code has `Fabric(strategy='ddp_spawn', ...)` but it conflicts")
        ):
            _Connector(strategy="ddp_spawn")

    with mock.patch.dict(os.environ, {"LT_DEVICES": "2"}):
        with pytest.raises(ValueError, match=escape("Your code has `Fabric(devices=3, ...)` but it conflicts")):
            _Connector(devices=3)

    with mock.patch.dict(os.environ, {"LT_NUM_NODES": "3"}):
        with pytest.raises(ValueError, match=escape("Your code has `Fabric(num_nodes=2, ...)` but it conflicts")):
            _Connector(num_nodes=2)

    with mock.patch.dict(os.environ, {"LT_PRECISION": "16"}):
        with pytest.raises(ValueError, match=escape("Your code has `Fabric(precision=64, ...)` but it conflicts")):
            _Connector(precision=64)


@RunIf(min_torch="1.12")
def test_fsdp_unsupported_on_cpu():
    """Test that we raise an error if attempting to run FSDP without GPU."""
    with pytest.raises(ValueError, match="You selected the FSDP strategy but FSDP is only available on GPU"):
        _Connector(strategy="fsdp")