# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from contextlib import nullcontext
from re import escape
from unittest import mock
from unittest.mock import ANY, MagicMock, Mock, PropertyMock, call

import lightning.fabric
import pytest
import torch
import torch.distributed
import torch.nn.functional
from lightning.fabric.fabric import Fabric
from lightning.fabric.strategies import (
    DataParallelStrategy,
    DDPStrategy,
    DeepSpeedStrategy,
    ParallelStrategy,
    SingleDeviceStrategy,
    Strategy,
    XLAStrategy,
)
from lightning.fabric.strategies.strategy import _Sharded
from lightning.fabric.utilities.exceptions import MisconfigurationException
from lightning.fabric.utilities.seed import pl_worker_init_function, seed_everything
from lightning.fabric.utilities.warnings import PossibleUserWarning
from lightning.fabric.wrappers import _FabricDataLoader, _FabricModule, _FabricOptimizer
from lightning_utilities.test.warning import no_warning_call
from torch import nn
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler, Sampler, SequentialSampler, TensorDataset

from tests_fabric.helpers.runif import RunIf


class BoringModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2, bias=False)

    def forward(self, x):
        x = self.layer(x)
        return torch.nn.functional.mse_loss(x, torch.ones_like(x))


def test_run_input_output():
    """Test that the dynamically patched run() method receives the input arguments and returns the result."""

    class RunFabric(Fabric):
        run_args = ()
        run_kwargs = {}

        def run(self, *args, **kwargs):
            self.run_args = args
            self.run_kwargs = kwargs
            return "result"

    fabric = RunFabric()
    result = fabric.run(1, 2, three=3)
    assert result == "result"
    assert fabric.run_args == (1, 2)
    assert fabric.run_kwargs == {"three": 3}


@mock.patch("lightning.fabric.strategies.ddp.DistributedDataParallel")
@pytest.mark.parametrize("setup_method", ["setup", "setup_module"])
def test_setup_module(ddp_mock, setup_method):
    """Test that the setup method lets the strategy wrap the model, but keeps a reference to the original model."""
    fabric = Fabric(accelerator="cpu", strategy="ddp", devices=2)
    fabric._launched = True  # pretend we have launched multiple processes
    model = nn.Linear(1, 2)
    setup_method = getattr(fabric, setup_method)
    fabric_model = setup_method(model)
    ddp_mock.assert_called_with(module=model, device_ids=ANY)
    assert fabric_model.module == model
    assert fabric_model.weight is model.weight
    assert fabric_model.forward != model.forward


@RunIf(skip_windows=True, dynamo=True)
@pytest.mark.parametrize("setup_method", ["setup", "setup_module"])
@pytest.mark.parametrize("reapply_compile", [True, False, None])
def test_setup_compiled_module(reapply_compile, setup_method):
    """Test that an `OptimizedModule` can be passed to the setup method."""
    from torch._dynamo.eval_frame import OptimizedModule

    fabric = Fabric(devices=1)
    model = nn.Linear(1, 2)
    compiled_model = torch.compile(model)
    assert compiled_model._compile_kwargs is not None
    assert isinstance(compiled_model, OptimizedModule)

    setup_method = getattr(fabric, setup_method)
    fabric_model = setup_method(compiled_model, _reapply_compile=reapply_compile)

    assert isinstance(fabric_model._forward_module, OptimizedModule)
    if reapply_compile:
        # The forward_module got rewrapped into a new OptimizedModule
        assert fabric_model._forward_module != fabric_model._original_module
        # The original_module points to the pure module
        assert fabric_model._original_module is model
        assert fabric_model._forward_module._orig_mod is model
    else:
        assert fabric_model._forward_module is fabric_model._original_module
    # Attributes get passed through
    assert fabric_model.weight is model.weight


@pytest.mark.parametrize(
    ("accelerator", "initial_device", "target_device"),
    [
        ("cpu", "cpu", "cpu"),
        pytest.param("cpu", "cuda:0", "cpu", marks=RunIf(min_cuda_gpus=1)),
        pytest.param("cpu", "mps:0", "cpu", marks=RunIf(mps=True)),
        pytest.param("cuda", "cpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)),
        pytest.param("cuda", "cuda:1", "cuda:0", marks=RunIf(min_cuda_gpus=2)),
        pytest.param("mps", "cpu", "mps:0", marks=RunIf(mps=True)),
    ],
)
@pytest.mark.parametrize("move_to_device", [True, False])
@pytest.mark.parametrize("setup_method", ["setup", "setup_module"])
def test_setup_module_move_to_device(setup_method, move_to_device, accelerator, initial_device, target_device):
    """Test that `move_to_device` leads to parameters being moved to the correct device and that the device
    attributes on the wrapper are updated."""
    initial_device = torch.device(initial_device)
    target_device = torch.device(target_device)
    expected_device = target_device if move_to_device else initial_device

    fabric = Fabric(accelerator=accelerator, devices=1)
    model = nn.Linear(1, 2)
    model.to(initial_device)
    setup_method = getattr(fabric, setup_method)
    fabric_model = setup_method(model, move_to_device=move_to_device)

    # all parameters on the expected device
    assert all(param.device == expected_device for param in model.parameters())
    assert all(param.device == expected_device for param in fabric_model.parameters())

    assert fabric_model.device == expected_device
    assert fabric.device == target_device

    # edge case: model has no parameters
    model = nn.Sequential()
    fabric_model = setup_method(model, move_to_device=move_to_device)
    assert fabric_model.device == target_device if move_to_device else torch.device("cpu")


@RunIf(min_cuda_gpus=1)
@pytest.mark.parametrize("move_to_device", [True, False])
@pytest.mark.parametrize("setup_method", ["setup", "setup_module"])
def test_setup_module_parameters_on_different_devices(setup_method, move_to_device):
    """Test that a warning is emitted when model parameters are on a different device prior to calling `setup()`."""
    device0 = torch.device("cpu")
    device1 = torch.device("cuda", 0)

    fabric = Fabric(accelerator="cuda", devices=1)

    module0 = nn.Linear(1, 2, device=device0)
    module1 = nn.Linear(1, 2, device=device1)
    model = nn.Sequential(module0, module1)

    setup_method = getattr(fabric, setup_method)

    match = r"has 2 parameters on different devices \(for example '1.weight' on cuda:0 and '0.weight' on cpu\)"
    if move_to_device:
        with pytest.warns(PossibleUserWarning, match=match):
            fabric_model = setup_method(model, move_to_device=move_to_device)

        # both have the same device now
        assert fabric_model.device == device1
        assert module0.weight.device == module0.bias.device == device1
        assert module1.weight.device == module1.bias.device == device1
    else:
        with no_warning_call(expected_warning=PossibleUserWarning, match=match):
            fabric_model = setup_method(model, move_to_device=move_to_device)

        # the first device is set at the root
        assert fabric_model.device == device0
        assert fabric_model._device == device0
        # the weights were not moved
        assert module0.weight.device == module0.bias.device == device0
        assert module1.weight.device == module1.bias.device == device1


def test_setup_module_and_optimizers():
    """Test that `setup()` can handle no optimizers, one optimizer, or multiple optimizers."""
    fabric = Fabric(devices=1)
    model = nn.Linear(1, 2)
    optimizer0 = torch.optim.SGD(model.parameters(), lr=0.1)
    optimizer1 = torch.optim.Adam(model.parameters(), lr=0.1)

    # no optimizer
    fabric_model = fabric.setup(model)
    assert isinstance(fabric_model, _FabricModule)
    assert fabric_model.module is model

    # single optimizer
    fabric_model, fabric_optimizer = fabric.setup(model, optimizer0)
    assert isinstance(fabric_model, _FabricModule)
    assert isinstance(fabric_optimizer, _FabricOptimizer)
    assert fabric_model.module is model
    assert fabric_optimizer.optimizer is optimizer0

    # multiple optimizers
    fabric_model, fabric_optimizer0, fabric_optimizer1 = fabric.setup(model, optimizer0, optimizer1)
    assert isinstance(fabric_model, _FabricModule)
    assert isinstance(fabric_optimizer0, _FabricOptimizer)
    assert isinstance(fabric_optimizer1, _FabricOptimizer)
    assert fabric_model.module is model
    assert fabric_optimizer0.optimizer is optimizer0
    assert fabric_optimizer1.optimizer is optimizer1


def test_setup_optimizers():
    """Test that `setup_optimizers()` can handle one or more optimizers."""
    fabric = Fabric()
    model = nn.Linear(1, 2)
    optimizer0 = torch.optim.SGD(model.parameters(), lr=0.1)
    optimizer1 = torch.optim.Adam(model.parameters(), lr=0.1)

    # single optimizer
    fabric_optimizer = fabric.setup_optimizers(optimizer0)
    assert isinstance(fabric_optimizer, _FabricOptimizer)
    assert fabric_optimizer.optimizer is optimizer0

    # multiple optimizers
    fabric_optimizer0, fabric_optimizer1 = fabric.setup_optimizers(optimizer0, optimizer1)
    assert isinstance(fabric_optimizer0, _FabricOptimizer)
    assert isinstance(fabric_optimizer1, _FabricOptimizer)
    assert fabric_optimizer0.optimizer is optimizer0
    assert fabric_optimizer1.optimizer is optimizer1


def test_setup_twice_fails():
    """Test that calling `setup` with a model or optimizer that is already wrapped fails."""
    fabric = Fabric(devices=1)
    model = nn.Linear(1, 2)
    optimizer = torch.optim.Adam(model.parameters())

    fabric_model, fabric_optimizer = fabric.setup(model, optimizer)
    with pytest.raises(ValueError, match="A model should be passed only once to the"):
        fabric.setup(fabric_model, optimizer)

    fabric_model, fabric_optimizer = fabric.setup(model, optimizer)
    with pytest.raises(ValueError, match="An optimizer should be passed only once to the"):
        fabric.setup(model, fabric_optimizer)


def test_setup_module_twice_fails():
    """Test that calling `setup_module` with a model that is already wrapped fails."""
    fabric = Fabric(devices=1)
    model = nn.Linear(1, 2)

    fabric_model = fabric.setup_module(model)
    with pytest.raises(ValueError, match="A model should be passed only once to the"):
        fabric.setup_module(fabric_model)


def test_setup_optimizers_twice_fails():
    """Test that calling `setup_optimizers` with an optimizer that is already wrapped fails."""
    fabric = Fabric()
    model = nn.Linear(1, 2)
    optimizer = torch.optim.Adam(model.parameters())

    fabric_optimizer = fabric.setup_optimizers(optimizer)
    with pytest.raises(ValueError, match="An optimizer should be passed only once to"):
        fabric.setup_optimizers(fabric_optimizer)


@pytest.mark.parametrize("strategy_cls", [DeepSpeedStrategy, XLAStrategy])
def test_setup_optimizers_not_supported(strategy_cls):
    """Test that `setup_optimizers` validates the strategy supports setting up model and optimizers independently."""
    fabric = Fabric()
    fabric._launched = True  # pretend we have launched multiple processes
    model = nn.Linear(1, 2)
    optimizer = torch.optim.Adam(model.parameters())
    fabric._strategy = Mock(spec=strategy_cls)
    with pytest.raises(RuntimeError, match=escape("requires the model and optimizer(s) to be set up jointly through")):
        fabric.setup_optimizers(optimizer)


@RunIf(min_cuda_gpus=1)
def test_setup_optimizer_on_meta_device():
    """Test that the setup-methods validate that the optimizer doesn't have references to meta-device parameters."""
    fabric = Fabric(strategy="fsdp", devices=1)
    fabric._launched = True  # pretend we have launched multiple processes
    with fabric.init_module(empty_init=True):
        model = nn.Linear(1, 2)
    assert model.weight.is_meta
    optimizer = torch.optim.Adam(model.parameters())  # optimizer references meta device params
    with pytest.raises(RuntimeError, match="The optimizer has references to the model's meta-device parameters"):
        fabric.setup(model, optimizer)
    with pytest.raises(RuntimeError, match="The optimizer has references to the model's meta-device parameters"):
        fabric.setup_optimizers(optimizer)


def test_setup_tracks_num_models():
    """Test that setup() tracks how many times it has setup a model."""
    fabric = Fabric(devices=1)
    model = nn.Linear(1, 2)
    optimizer = torch.optim.Adam(model.parameters())

    assert fabric._models_setup == 0
    fabric.setup(model, optimizer)
    assert fabric._models_setup == 1

    fabric.setup(model, optimizer)
    assert fabric._models_setup == 2

    fabric.setup_module(model)
    assert fabric._models_setup == 3


def test_setup_dataloaders_unsupported_input():
    """Test that the setup_dataloaders method fails when provided with non-DataLoader objects."""
    fabric = Fabric()
    with pytest.raises(ValueError, match="`setup_dataloaders` requires at least one dataloader"):
        fabric.setup_dataloaders()
    with pytest.raises(TypeError, match="Only PyTorch DataLoader are currently supported"):
        fabric.setup_dataloaders(range(2))  # type: ignore


def test_setup_dataloaders_return_type():
    """Test that the setup method returns the dataloaders wrapped as FabricDataLoader and in the right order."""
    fabric = Fabric(devices=1)

    # single dataloader
    fabric_dataloader = fabric.setup_dataloaders(DataLoader(range(2)))
    assert isinstance(fabric_dataloader, _FabricDataLoader)

    # multiple dataloaders
    dataset0 = Mock()
    dataset1 = Mock()
    dataloader0 = DataLoader(dataset0)
    dataloader1 = DataLoader(dataset1)
    fabric_dataloader0, fabric_dataloader1 = fabric.setup_dataloaders(dataloader0, dataloader1)
    assert isinstance(fabric_dataloader0, _FabricDataLoader)
    assert isinstance(fabric_dataloader1, _FabricDataLoader)
    assert fabric_dataloader0.dataset is dataset0
    assert fabric_dataloader1.dataset is dataset1


@mock.patch("lightning.fabric.fabric._replace_dunder_methods")
def test_setup_dataloaders_captures_dataloader_arguments(ctx_manager):
    """Test that Fabric intercepts the DataLoader constructor arguments with a context manager when launching a
    function."""

    def run(_):
        # One for BatchSampler, another for DataLoader
        assert ctx_manager().__enter__.call_count == 2

    fabric = Fabric()
    fabric.launch(run)
    assert ctx_manager().__exit__.call_count == 2


def test_setup_dataloaders_raises_for_unknown_custom_args():
    """Test that an error is raised when custom dataloaders with unknown arguments are created from outside Fabric's
    run method."""
    class CustomDataLoader(DataLoader):
        def __init__(self, new_arg, *args, **kwargs):
            super().__init__(range(5), *args, **kwargs)

    dataloader = CustomDataLoader(2, batch_size=2)

    # If no distributed sampler is required, reinstantiation is not necessary
    fabric = Fabric(devices=1)
    fabric_dataloader = fabric.setup_dataloaders(dataloader)
    assert fabric_dataloader._dataloader is dataloader

    # If a distributed sampler is required, the sampler needs to be reinstantiated
    fabric = Fabric(devices=2, accelerator="cpu")
    fabric._launched = True
    with pytest.raises(
        MisconfigurationException,
        match=(
            r"Trying to inject custom `Sampler` into the `CustomDataLoader` instance.*"
            r"The missing attributes are \['new_arg'\]"
        ),
    ):
        fabric.setup_dataloaders(dataloader)


def test_setup_dataloaders_twice_fails():
    """Test that calling setup_dataloaders with a dataloader that is already wrapped fails."""
    fabric = Fabric()
    dataloader = DataLoader(range(2))
    fabric_dataloader = fabric.setup_dataloaders(dataloader)

    with pytest.raises(ValueError, match="A dataloader should be passed only once to the"):
        fabric.setup_dataloaders(fabric_dataloader)


@mock.patch(
    "lightning.fabric.fabric.Fabric.device",
    new_callable=PropertyMock,
    return_value=torch.device("cuda", 1),
)
def test_setup_dataloaders_move_to_device(fabric_device_mock):
    """Test that the setup configures FabricDataLoader to move the data to the device automatically."""
    fabric = Fabric(devices=1)
    fabric_dataloaders = fabric.setup_dataloaders(DataLoader(Mock()), DataLoader(Mock()), move_to_device=False)
    assert all(dl.device is None for dl in fabric_dataloaders)
    fabric_device_mock.assert_not_called()

    fabric = Fabric(devices=1)
    fabric_dataloaders = fabric.setup_dataloaders(DataLoader(Mock()), DataLoader(Mock()), move_to_device=True)
    assert all(dl.device == torch.device("cuda", 1) for dl in fabric_dataloaders)
    fabric_device_mock.assert_called()


def test_setup_dataloaders_distributed_sampler_not_needed():
    """Test that `use_distributed_sampler` option has no effect when no distributed sampler is needed."""
    custom_sampler = Mock(spec=Sampler)
    dataloader = DataLoader(Mock(), sampler=custom_sampler)

    # if no distributed sampler is required, dataloader reinstantiation is not necessary
    fabric = Fabric(devices=1)
    fabric_dataloader = fabric.setup_dataloaders(dataloader, use_distributed_sampler=True)
    assert fabric_dataloader._dataloader is dataloader
    assert fabric_dataloader.sampler is custom_sampler


def test_setup_dataloaders_distributed_sampler_shuffle():
    """Test that the DataLoader(shuffle=True|False) setting gets carried over correctly into the distributed
    sampler."""
    fabric = Fabric(accelerator="cpu", strategy="ddp_spawn", devices=2)
    # no fabric.launch(): pretend we are on rank 0 now
    fabric._launched = True

    dataset = TensorDataset(torch.arange(8))

    # shuffling turned off
    no_shuffle_dataloaders = [
        DataLoader(dataset),
        DataLoader(dataset, shuffle=False),
        DataLoader(dataset, sampler=SequentialSampler(dataset)),
    ]
    for dataloader in no_shuffle_dataloaders:
        dataloader = fabric.setup_dataloaders(dataloader)
        assert [t[0].item() for t in iter(dataloader)] == [0, 2, 4, 6]

    # shuffling turned on
    shuffle_dataloaders = [DataLoader(dataset, shuffle=True), DataLoader(dataset, sampler=RandomSampler(dataset))]
    for dataloader in shuffle_dataloaders:
        seed_everything(1)
        dataloader = fabric.setup_dataloaders(dataloader)
        assert [t[0].item() for t in iter(dataloader)] == [5, 2, 7, 1]


@pytest.mark.parametrize("shuffle", [True, False])
@pytest.mark.parametrize("batch_size", [1, 2, 3])
def test_setup_dataloaders_distributed_sampler_parity(shuffle, batch_size):
    """Test that the distributed sampler setup in Fabric leads to the same sequence of data as in raw PyTorch."""
    torch.manual_seed(1)
    fabric = Fabric(accelerator="cpu", strategy="ddp", devices=2)
    # no fabric.launch(): pretend we are on rank 0 now
    fabric._launched = True

    dataset = torch.arange(10)
    torch_dataloader = DataLoader(
        dataset,
        sampler=DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=shuffle),
        batch_size=batch_size,
    )
    fabric_dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)
    fabric_dataloader = fabric.setup_dataloaders(fabric_dataloader)

    def fetch_epoch(loader):
        iterator = iter(loader)
        # we fetch 2 batches per epoch
        return torch.cat((next(iterator), next(iterator)))

    # 1st epoch
    # PyTorch users need to set the epoch, while in Fabric it gets handled automatically
    torch_dataloader.sampler.set_epoch(0)
    torch_data = fetch_epoch(torch_dataloader)
    fabric_data = fetch_epoch(fabric_dataloader)
    assert torch.equal(torch_data, fabric_data)

    # 2nd epoch
    # PyTorch users need to set the epoch, while in Fabric it gets handled automatically
    torch_dataloader.sampler.set_epoch(1)
    torch_data = fetch_epoch(torch_dataloader)
    fabric_data = fetch_epoch(fabric_dataloader)
    assert torch.equal(torch_data, fabric_data)
    assert torch_dataloader.sampler.epoch == 1
    assert fabric_dataloader._dataloader.sampler.epoch == 1


@mock.patch.dict(os.environ, {}, clear=True)
def test_seed_everything():
    """Test that seed everything is static and sets the worker init function on the dataloader."""
    Fabric.seed_everything(3)

    fabric = Fabric(devices=1)
    fabric_dataloader = fabric.setup_dataloaders(DataLoader(Mock()))

    assert fabric_dataloader.worker_init_fn.func is pl_worker_init_function
    assert os.environ == {"PL_GLOBAL_SEED": "3", "PL_SEED_WORKERS": "1"}


@pytest.mark.parametrize(
    "strategy",
    [
        "dp",
        "ddp",
        "ddp_spawn",
        pytest.param("ddp_fork", marks=RunIf(skip_windows=True)),
        pytest.param("deepspeed", marks=RunIf(deepspeed=True)),
    ],
)
def test_setup_dataloaders_replace_custom_sampler(strategy):
    """Test that asking to replace a custom sampler results in an error when a distributed sampler would be needed."""
    custom_sampler = Mock(spec=Sampler)
    dataloader = DataLoader(Mock(), sampler=custom_sampler)

    # explicitly asking to replace when a custom sampler is already configured raises an exception
    fabric = Fabric(accelerator="cpu", strategy=strategy, devices=2)
    fabric._launched = True  # pretend we have launched multiple processes
    if hasattr(fabric.strategy, "distributed_sampler_kwargs"):
        with pytest.raises(TypeError, match="You seem to have configured a sampler in your DataLoader"):
            fabric.setup_dataloaders(dataloader, use_distributed_sampler=True)

    # setting `use_distributed_sampler=False` leaves the sampler untouched
    fabric_dataloader = fabric.setup_dataloaders(dataloader, use_distributed_sampler=False)
    assert fabric_dataloader.sampler is custom_sampler


@pytest.mark.parametrize(
    "strategy",
    [
        "dp",
        "ddp",
        "ddp_spawn",
        pytest.param("ddp_fork", marks=RunIf(skip_windows=True)),
        pytest.param("deepspeed", marks=RunIf(deepspeed=True)),
    ],
)
@pytest.mark.parametrize("shuffle", [True, False])
def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy):
    """Test that Fabric replaces the default samplers with DistributedSampler automatically."""
    fabric = Fabric(accelerator="cpu", strategy=strategy, devices=2)
    fabric._launched = True  # pretend we have launched multiple processes
    is_distributed = hasattr(fabric.strategy, "distributed_sampler_kwargs")
"distributed_sampler_kwargs") fabric_dataloader = fabric.setup_dataloaders(DataLoader(range(3), shuffle=shuffle)) assert not is_distributed or isinstance(fabric_dataloader.sampler, DistributedSampler) @pytest.mark.parametrize( ("accelerator", "expected"), [ ("cpu", "cpu"), pytest.param("cuda", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("tpu", "xla:0", marks=RunIf(tpu=True, standalone=True)), pytest.param("mps", "mps:0", marks=RunIf(mps=True)), pytest.param("gpu", "mps:0", marks=RunIf(mps=True)), ], ) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_to_device(accelerator, expected): """Test that the to_device method can move various objects to the device determined by the accelerator.""" fabric = Fabric(accelerator=accelerator, devices=1) fabric.launch() expected_device = torch.device(expected) # module module = torch.nn.Linear(2, 3) module = fabric.to_device(module) assert all(param.device == expected_device for param in module.parameters()) # tensor tensor = torch.rand(2, 2) tensor = fabric.to_device(tensor) assert tensor.device == expected_device # collection collection = {"data": torch.rand(2, 2), "int": 1} collection = fabric.to_device(collection) assert collection["data"].device == expected_device def test_rank_properties(): """Test that the rank properties are determined by the strategy.""" fabric = Fabric() fabric._strategy = Mock(spec=Strategy) fabric._strategy.world_size = 1000 assert fabric.world_size == 1000 fabric._strategy.global_rank = 100 assert fabric.global_rank == 100 fabric._strategy.local_rank = 10 assert fabric.local_rank == 10 fabric._strategy.node_rank = 1 assert fabric.node_rank == 1 def test_backward(): """Test that backward() calls into the precision plugin.""" fabric = Fabric() fabric._strategy = Mock(spec=Strategy) loss = Mock() fabric.backward(loss, "arg", keyword="kwarg") fabric._strategy.backward.assert_called_with(loss, None, "arg", keyword="kwarg") @pytest.mark.parametrize( ("strategy", "precision", "error_expected"), [ ("auto", "32-true", False), ("auto", "bf16-true", False), ("auto", "bf16-mixed", True), pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1)), ], ) @pytest.mark.parametrize("setup_method", ["setup", "setup_module"]) @mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) def test_backward_required(_, strategy, precision, error_expected, setup_method): """Test under which strategy and precision configurations the `fabric.backward()` call is required.""" fabric = Fabric( accelerator=("cuda" if strategy == "fsdp" else "cpu"), strategy=strategy, precision=precision, devices=1 ) fabric._launched = True fabric.strategy.setup_module = lambda module: module error_context = ( pytest.raises(RuntimeError, match=escape("requires you to call `fabric.backward(loss)`")) if error_expected else nullcontext() ) batch = torch.rand(2, 2) # One model model1 = nn.Linear(2, 2) model1 = getattr(fabric, setup_method)(model1) output = model1(batch) assert output._backward_hooks is not None loss = output.sum() with error_context: loss.backward() loss = model1(batch).sum() assert not lightning.fabric.wrappers._in_fabric_backward fabric.backward(loss) # no error assert not lightning.fabric.wrappers._in_fabric_backward # Two models chained model2 = torch.nn.Linear(2, 2) model2 = getattr(fabric, setup_method)(model2) output = model2(model1(batch)) assert output._backward_hooks is not None loss = output.sum() with error_context: 
        loss.backward()
    loss = model2(model1(batch)).sum()
    fabric.backward(loss)  # no error

    # Two independent models
    loss1 = model1(batch).sum()
    loss2 = model2(batch).sum()
    with error_context:
        loss1.backward()
    with error_context:
        loss2.backward()
    loss1 = model1(batch).sum()
    loss2 = model2(batch).sum()
    fabric.backward(loss1)  # no error
    fabric.backward(loss2)  # no error

    # Model that returns a datastructure of tensors
    class DictReturnModel(nn.Linear):
        def forward(self, x):
            return {
                "loss": super().forward(x).sum(),
                "other": torch.rand(2, 2),  # does not require grad
            }

    model3 = DictReturnModel(2, 2)
    model3 = getattr(fabric, setup_method)(model3)
    output = model3(batch)
    loss = output["loss"]
    other = output["other"]
    assert loss._backward_hooks is not None
    assert other._backward_hooks is None
    with error_context:
        (loss * 2).backward()
    loss = model3(batch)["loss"]
    fabric.backward(loss * 2)  # no error


@RunIf(deepspeed=True, mps=False)
def test_backward_model_input_required():
    """Test that when using deepspeed and multiple models, backward() requires the model as input."""
    fabric = Fabric(strategy="deepspeed", devices=1)
    fabric._launched = True  # pretend we have launched

    model0 = nn.Linear(1, 2)
    model1 = nn.Linear(1, 2)

    optimizer0 = torch.optim.Adam(model0.parameters())
    optimizer1 = torch.optim.Adam(model1.parameters())

    fabric._strategy.setup_module_and_optimizers = lambda *args: args

    fabric.setup(model0, optimizer0)
    fabric.setup(model1, optimizer1)

    loss = model0(torch.randn(1, 1, device=fabric.device)).sum()

    with pytest.raises(ValueError, match="please provide the model used to perform"):
        fabric.backward(loss)


def test_autocast():
    """Test that the Fabric autocast context manager lets the precision plugin handle casting."""
    fabric = Fabric()
    fabric._precision.forward_context = MagicMock()

    fabric._precision.forward_context().__enter__.assert_not_called()
    with fabric.autocast():
        fabric._precision.forward_context().__enter__.assert_called()
    fabric._precision.forward_context().__exit__.assert_called()


def test_no_backward_sync():
    """Test that `Fabric.no_backward_sync()` validates the strategy and model is compatible."""
    fabric = Fabric(devices=1)
    model = nn.Linear(3, 3)
    with pytest.raises(TypeError, match="You need to set up the model first"), fabric.no_backward_sync(model):
        pass

    model = fabric.setup(model)

    # pretend that the strategy does not support skipping backward sync
    fabric._strategy = Mock(spec=ParallelStrategy, _backward_sync_control=None)
    with pytest.warns(
        PossibleUserWarning, match="The `ParallelStrategy` does not support skipping the"
    ), fabric.no_backward_sync(model):
        pass

    # for single-device strategies, it becomes a no-op without warning
    fabric._strategy = Mock(spec=SingleDeviceStrategy, _backward_sync_control=MagicMock())
    with fabric.no_backward_sync(model):
        pass
    fabric._strategy._backward_sync_control.no_backward_sync.assert_not_called()

    # same for XLA
    fabric._strategy = Mock(spec=XLAStrategy, _backward_sync_control=MagicMock())
    with fabric.no_backward_sync(model):
        pass
    fabric._strategy._backward_sync_control.no_backward_sync.assert_not_called()

    # pretend that the strategy supports skipping backward sync
    fabric._strategy = Mock(_backward_sync_control=MagicMock())
    # disabling the context manager makes it a no-op
    with fabric.no_backward_sync(model, enabled=False):
        pass
    fabric._strategy._backward_sync_control.no_backward_sync.assert_called_once_with(model._forward_module, False)
    fabric._strategy._backward_sync_control.reset_mock()
    with fabric.no_backward_sync(model):
        pass
    fabric._strategy._backward_sync_control.no_backward_sync.assert_called_once_with(model._forward_module, True)


def test_launch_without_function():
    """Test the various ways `Fabric.launch()` can be called."""
    # default: no launcher, single process
    fabric = Fabric()
    nothing = Mock()
    fabric.launch(nothing)
    nothing.assert_called()

    # with a launcher on the strategy
    fabric = Fabric()
    fabric._strategy._launcher = Mock()
    fabric.launch()
    fabric._strategy._launcher.launch.assert_called()


def test_launch_with_function():
    """Test the various ways `Fabric.launch(function)` can be called."""

    def fn_without_args():
        pass

    fabric = Fabric()
    with pytest.raises(TypeError, match="needs to take at least one argument"):
        fabric.launch(fn_without_args)

    def fn_with_one_arg(arg):
        assert isinstance(arg, Fabric)
        fn_with_one_arg.called = True

    fabric = Fabric()
    fabric.launch(fn_with_one_arg)
    assert fn_with_one_arg.called

    # common user mistake
    fabric = Fabric()
    with pytest.raises(TypeError, match="needs to be a callable"):
        fabric.launch(fn_with_one_arg(fabric))


@mock.patch.dict(os.environ, {"LT_CLI_USED": "1"})  # pretend we are using the CLI
def test_launch_and_cli_not_allowed():
    """Test that calling `.launch()` again is not allowed when Fabric was launched through the CLI."""
    fabric = Fabric(devices=1)
    with pytest.raises(RuntimeError, match=escape("Calling `.launch()` again is not allowed")):
        fabric.launch()


@RunIf(mps=False)
@pytest.mark.parametrize("strategy", ["xla", "ddp_spawn"])
def test_launch_and_strategies_unsupported_combinations(strategy, xla_available):
    """Test that strategies requiring a launcher reject `.launch()` without a function."""
    fabric = Fabric(strategy=strategy)
    with pytest.raises(TypeError, match=r"launch\(\)` needs to be called with a function"):
        fabric.launch()


@mock.patch.dict(os.environ, {"LT_CLI_USED": "1"})  # pretend we are using the CLI
def test_overridden_run_and_cli_not_allowed():
    """Test that overriding `Fabric.run()` is not allowed when launching from the CLI."""

    class FabricWithRun(Fabric):
        def run(self):
            pass

    with pytest.raises(TypeError, match=escape("Overriding `Fabric.run()` and launching from the CLI is not allowed")):
        FabricWithRun()


def test_module_sharding_context():
    """Test that the sharding context manager gets applied when the strategy supports it and is a no-op otherwise."""
    fabric = Fabric()
    fabric._launched = True
    fabric._strategy = MagicMock(spec=DDPStrategy, module_sharded_context=Mock())
    with pytest.warns(DeprecationWarning, match="sharded_model"), fabric.sharded_model():
        pass
    fabric._strategy.module_sharded_context.assert_not_called()

    fabric._strategy = MagicMock(spec=_Sharded)
    with pytest.warns(DeprecationWarning, match="sharded_model"), fabric.sharded_model():
        pass
    fabric._strategy.module_sharded_context.assert_called_once()


def test_init_module_context(monkeypatch):
    """Test that the strategy returns the context manager for initializing the module."""
    fabric = Fabric(accelerator="cpu")
    strategy = SingleDeviceStrategy(device=torch.device("cuda"))
    strategy.module_init_context = Mock(wraps=strategy.module_init_context)
    fabric._strategy = strategy
    with fabric.init_module():
        pass
    strategy.module_init_context.assert_called_once_with(empty_init=None)
    strategy.module_init_context.reset_mock()


def test_init_tensor_context(monkeypatch):
    """Test that the strategy returns the context manager for initializing tensors."""
    fabric = Fabric(accelerator="cpu")
    strategy = SingleDeviceStrategy(device=torch.device("cuda"))
    strategy.tensor_init_context = Mock(wraps=strategy.tensor_init_context)
    fabric._strategy = strategy
    with fabric.init_tensor():
        pass
    strategy.tensor_init_context.assert_called_once()
    strategy.tensor_init_context.reset_mock()


def test_callbacks_input():
    """Test the various ways in which callbacks can be registered with Fabric."""
    callback0 = Mock()
    callback1 = Mock()

    # single callback
    fabric = Fabric(callbacks=callback0)
    assert fabric._callbacks == [callback0]

    # multiple callbacks
    fabric = Fabric(callbacks=[callback0, callback1])
    assert fabric._callbacks == [callback0, callback1]


def test_call():
    """Test that `fabric.call` triggers the callback implementations."""
    callback0 = Mock()
    callback1 = Mock()
    fabric = Fabric(callbacks=[callback0, callback1])

    # No arguments
    fabric.call("on_train_end")
    callback0.on_train_end.assert_called_once()
    callback1.on_train_end.assert_called_once()

    # Optional arguments
    fabric.call("on_train_end", "positional", keyword="keyword")
    callback0.on_train_end.assert_called_with("positional", keyword="keyword")
    callback1.on_train_end.assert_called_with("positional", keyword="keyword")

    # Some callbacks don't implement the requested hook
    callback0 = Mock()
    callback1 = Mock(spec_set={})  # `on_train_end` not defined for this callback
    fabric = Fabric(callbacks=[callback0, callback1])
    fabric.call("on_train_end")
    callback0.on_train_end.assert_called_once()
    assert not callback1.mock_calls  # no methods were called on callback1

    # Skip callback attributes that are not callable
    callback = Mock(not_a_method=1)
    fabric = Fabric(callbacks=[callback])
    with pytest.warns(UserWarning, match="Skipping the callback `Mock.not_a_method`"):
        fabric.call("not_a_method")
    assert not callback1.mock_calls


def test_special_callbacks():
    """Tests special callbacks that have hooks for internal Fabric events."""

    class SpecialCallback:
        def on_after_optimizer_step(self, strategy, optimizer):
            pass

        def on_after_setup(self, fabric, module):
            pass

    callback = Mock(wraps=SpecialCallback())
    fabric = Fabric(accelerator="cpu", callbacks=[callback])

    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    fabric_model, fabric_optimizer = fabric.setup(model, optimizer)
    callback.on_after_setup.assert_called_once_with(fabric=fabric, module=fabric_model)

    model(torch.randn(2, 2)).sum().backward()
    fabric_optimizer.step()
    callback.on_after_optimizer_step.assert_called_once_with(strategy=fabric._strategy, optimizer=optimizer)


def test_loggers_input():
    """Test the various ways in which loggers can be registered with Fabric."""
    logger0 = Mock()
    logger1 = Mock()

    # no logger
    fabric = Fabric(loggers=None)
    assert fabric._loggers == []
    fabric = Fabric(loggers=[])
    assert fabric._loggers == []

    # single logger
    fabric = Fabric(loggers=logger0)
    assert fabric._loggers == [logger0]

    # multiple loggers
    fabric = Fabric(loggers=[logger0, logger1])
    assert fabric._loggers == [logger0, logger1]


def test_log():
    """Test that `fabric.log` sends the metrics to each logger."""
    logger0 = Mock()
    logger1 = Mock()
    fabric = Fabric(loggers=[logger0, logger1])

    fabric.log("test", 1)
    logger0.log_metrics.assert_called_with(metrics={"test": 1}, step=None)
    logger1.log_metrics.assert_called_with(metrics={"test": 1}, step=None)

    fabric.log("test", 2, step=15)
    logger0.log_metrics.assert_called_with(metrics={"test": 2}, step=15)
    logger1.log_metrics.assert_called_with(metrics={"test": 2}, step=15)


def test_log_dict():
    """Test that `fabric.log_dict` sends the metrics dict to each logger."""
    logger0 = Mock()
    logger1 = Mock()
    fabric = Fabric(loggers=[logger0, logger1])

    fabric.log_dict({"foo": 1, "bar": 2}, step=None)
    logger0.log_metrics.assert_called_with(metrics={"foo": 1, "bar": 2}, step=None)
    logger1.log_metrics.assert_called_with(metrics={"foo": 1, "bar": 2}, step=None)

    fabric.log_dict({"foo": 3, "bar": 4}, step=15)
    logger0.log_metrics.assert_called_with(metrics={"foo": 3, "bar": 4}, step=15)
    logger1.log_metrics.assert_called_with(metrics={"foo": 3, "bar": 4}, step=15)


def test_log_dict_input_parsing():
    """Test validation of input data types and preprocessing."""
    logger = Mock()
    fabric = Fabric(loggers=[logger])

    # Tensor scalar, 0 dims
    fabric.log("log", torch.tensor(1))
    logger.log_metrics.assert_called_with(metrics={"log": 1}, step=None)
    fabric.log_dict({"log_dict": torch.tensor(1)})
    logger.log_metrics.assert_called_with(metrics={"log_dict": 1}, step=None)

    # Tensor scalar, 1 dims
    fabric.log("log", torch.tensor([2]))
    logger.log_metrics.assert_called_with(metrics={"log": 2}, step=None)
    fabric.log_dict({"log_dict": torch.tensor([2])})
    logger.log_metrics.assert_called_with(metrics={"log_dict": 2}, step=None)

    # Tensor, multiple dims
    with pytest.raises(ValueError, match="it cannot be converted to a scalar."):
        fabric.log("log", torch.tensor([3, 4]))
    with pytest.raises(ValueError, match="it cannot be converted to a scalar."):
        fabric.log_dict({"log_dict": torch.tensor([3, 4])})


@pytest.mark.parametrize("setup", [True, False])
def test_save_wrapped_objects(setup, tmp_path):
    """Test that when modules and optimizers are in the state, they get unwrapped properly."""
    fabric = Fabric(devices=1)
    save_checkpoint_mock = Mock()
    fabric.strategy.save_checkpoint = save_checkpoint_mock

    unwrapped_model = BoringModel()
    unwrapped_optimizer = torch.optim.Adam(unwrapped_model.parameters())

    if setup:
        model, optimizer = fabric.setup(unwrapped_model, unwrapped_optimizer)
        assert isinstance(model, _FabricModule)
        assert isinstance(optimizer, _FabricOptimizer)
    else:
        model, optimizer = unwrapped_model, unwrapped_optimizer

    anything = {"cocofruit": 1}
    state = {"model": model, "optimizer": optimizer, "anything": anything}
    expected = {"model": unwrapped_model, "optimizer": unwrapped_optimizer, "anything": anything}
    fabric.save(tmp_path, state)
    save_checkpoint_mock.assert_called_with(state=expected, path=tmp_path, filter=None)


def test_save_filter(tmp_path):
    """Test that the `filter` argument to `fabric.save()` is validated and applied to the state."""
    fabric = Fabric(devices=1)
    checkpoint_io_mock = Mock()
    fabric.strategy.checkpoint_io = checkpoint_io_mock

    model = BoringModel()
    optimizer = torch.optim.Adam(model.parameters())
    anything = {"cocofruit": 1}
    state = {"model": model, "optimizer": optimizer, "anything": anything, "foo": 1}
    save_path = tmp_path / "foo.pth"

    # filter all dicts
    filter = {k: lambda k, v: False for k in state}
    fabric.save(save_path, state, filter=filter)
    checkpoint_io_mock.save_checkpoint.assert_called_with(checkpoint={"foo": 1}, path=save_path, storage_options=None)

    # bad filters
    with pytest.raises(TypeError, match="should be a dict"):
        fabric.save(save_path, state, filter="foo")
    with pytest.raises(TypeError, match="callable, given 'foo"):
        fabric.save(save_path, state, filter={"model": "foo"})
    with pytest.raises(ValueError, match="keys {'asd'} are not present in the state keys"):
        fabric.save(save_path, state, filter={"asd": lambda k, v: True})

    # subset
    checkpoint_io_mock.reset_mock()
    filter = {
        "model": lambda k, v: "weight" in k,
        "anything": lambda k, v: isinstance(v, int),
        "optimizer": lambda k, v: "param_groups" in k,
    }
    fabric.save(save_path, state, filter=filter)
    checkpoint_io_mock.save_checkpoint.assert_called_with(
        checkpoint={"model": {"layer.weight": ANY}, "optimizer": {"param_groups": ANY}, "anything": anything, "foo": 1},
        path=save_path,
        storage_options=None,
    )


@pytest.mark.parametrize("setup", [True, False])
def test_load_wrapped_objects(setup, tmp_path):
    """Test that loading happens in-place for model, optimizer, and other user data."""
    fabric = Fabric(accelerator="cpu")
    expected_remainder = {"extra": "data"}

    def mocked_load_checkpoint(path, state, strict):
        assert not isinstance(state["model"], _FabricModule)
        assert not isinstance(state["optimizer"], _FabricOptimizer)
        state.update({"int": 5, "dict": {"x": 1}})
        return expected_remainder

    fabric.strategy.load_checkpoint = mocked_load_checkpoint

    unwrapped_model = BoringModel()
    unwrapped_optimizer = torch.optim.Adam(unwrapped_model.parameters())

    if setup:
        model, optimizer = fabric.setup(unwrapped_model, unwrapped_optimizer)
        assert isinstance(model, _FabricModule)
        assert isinstance(optimizer, _FabricOptimizer)
    else:
        model, optimizer = unwrapped_model, unwrapped_optimizer

    state = {"model": model, "optimizer": optimizer, "int": 0, "dict": {"x": 0}}
    expected = {"model": model, "optimizer": optimizer, "int": 5, "dict": {"x": 1}}
    remainder = fabric.load(tmp_path, state)
    assert state == expected
    assert remainder == expected_remainder


def test_load_raw():
    """Test that `Fabric.load_raw()` unwraps the object to load and calls into the strategy."""
    fabric = Fabric(accelerator="cpu")
    fabric.strategy.load_checkpoint = Mock()

    model = torch.nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    wrapped_model, wrapped_optimizer = fabric.setup(model, optimizer)

    fabric.load_raw(path="path0", obj=model)
    fabric.strategy.load_checkpoint.assert_called_with(path="path0", state=model, strict=True)
    fabric.load_raw(path="path1", obj=wrapped_model, strict=False)
    fabric.strategy.load_checkpoint.assert_called_with(path="path1", state=model, strict=False)
    fabric.load_raw(path="path2", obj=wrapped_optimizer)
    fabric.strategy.load_checkpoint.assert_called_with(path="path2", state=optimizer, strict=True)


def test_barrier():
    """Test that `Fabric.barrier()` calls into the strategy."""
    fabric = Fabric()
    fabric._strategy = Mock()
    fabric._launched = True
    fabric.barrier("test")
    fabric._strategy.barrier.assert_called_once_with(name="test")


def test_broadcast():
    """Test that `Fabric.broadcast()` calls into the strategy."""
    fabric = Fabric()
    fabric._strategy = Mock()
    fabric._launched = True
    fabric.broadcast(torch.tensor(1), src=2)
    fabric._strategy.broadcast.assert_called_once_with(torch.tensor(1), src=2)


def test_all_gather():
    """Test that `Fabric.all_gather()` applies itself to collections and calls into the strategy."""
    fabric = Fabric()
    fabric._strategy = Mock(root_device=torch.device("cpu"))
    fabric._launched = True
    defaults = {"group": None, "sync_grads": False}

    # single tensor
    fabric.all_gather(torch.tensor(1))
    fabric._strategy.all_gather.assert_called_once_with(torch.tensor(1), **defaults)
    fabric._strategy.reset_mock()

    # list
    fabric.all_gather([torch.tensor(2), torch.tensor(3), "string"])
    fabric._strategy.all_gather.assert_has_calls([call(torch.tensor(2), **defaults), call(torch.tensor(3), **defaults)])
    fabric._strategy.reset_mock()

    # dict
    fabric.all_gather({"a": torch.tensor(4), "b": [torch.tensor(5)], "c": "string"})
    fabric._strategy.all_gather.assert_has_calls([call(torch.tensor(4), **defaults), call(torch.tensor(5), **defaults)])


def test_all_reduce():
    """Test that `Fabric.all_reduce()` applies itself to collections and calls into the strategy."""
    fabric = Fabric()
    fabric._strategy = Mock(root_device=torch.device("cpu"))
    fabric._launched = True
    defaults = {"group": None, "reduce_op": "mean"}

    # single tensor
    fabric.all_reduce(torch.tensor(1))
    fabric._strategy.all_reduce.assert_called_once_with(torch.tensor(1), **defaults)
    fabric._strategy.reset_mock()

    # list
    fabric.all_reduce([torch.tensor(2), torch.tensor(3), "string"])
    fabric._strategy.all_reduce.assert_has_calls([call(torch.tensor(2), **defaults), call(torch.tensor(3), **defaults)])
    fabric._strategy.reset_mock()

    # dict
    fabric.all_reduce({"a": torch.tensor(4), "b": [torch.tensor(5)], "c": "string"})
    fabric._strategy.all_reduce.assert_has_calls([call(torch.tensor(4), **defaults), call(torch.tensor(5), **defaults)])


def test_rank_zero_first(monkeypatch):
    """Test that rank 0 completes first before all other processes can execute under `.rank_zero_first()`."""

    def record_calls_for_rank(rank):
        call_order = []

        fabric = Fabric()
        fabric._strategy = Mock(global_rank=rank)
        barrier_mock = MagicMock(side_effect=lambda *_: call_order.append("barrier"))
        monkeypatch.setattr(lightning.fabric.utilities.distributed._InfiniteBarrier, "__call__", barrier_mock)
        target = Mock(run=Mock(side_effect=lambda *_: call_order.append("run")))

        with fabric.rank_zero_first():
            target.run()

        return call_order

    assert record_calls_for_rank(0) == ["run", "barrier"]
    assert record_calls_for_rank(1) == ["barrier", "run"]


@pytest.mark.parametrize(("clip_val", "max_norm"), [(1e-3, None), (None, 1)])
def test_grad_clipping(clip_val, max_norm):
    """Test that `fabric.clip_gradients()` dispatches to the strategy's value- or norm-clipping method."""
    fabric = Fabric(devices=1)

    fabric.strategy.clip_gradients_norm = Mock()
    fabric.strategy.clip_gradients_value = Mock()

    torch_model = nn.Linear(1, 1)
    torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1e-3)
    model, optimizer = fabric.setup(torch_model, torch_optimizer)

    loss = model(torch.rand(1, 1).to(fabric.device))
    fabric.backward(loss)

    fabric.strategy.clip_gradients_value.assert_not_called()
    fabric.strategy.clip_gradients_norm.assert_not_called()

    fabric.clip_gradients(model, optimizer, max_norm=max_norm, clip_val=clip_val)

    if clip_val is not None:
        fabric.strategy.clip_gradients_value.assert_called_once_with(torch_model, torch_optimizer, clip_val=clip_val)
        fabric.strategy.clip_gradients_norm.assert_not_called()
    else:
        fabric.strategy.clip_gradients_value.assert_not_called()
        fabric.strategy.clip_gradients_norm.assert_called_once_with(
            torch_model, torch_optimizer, max_norm=max_norm, norm_type=2.0, error_if_nonfinite=True
        )


def test_verify_launch_called():
    """Test that the user gets an error message if they forgot to call `.launch()`."""
    fabric = Fabric(accelerator="cpu")
    assert not fabric._launched
    fabric._strategy = Mock(spec=SingleDeviceStrategy)
    fabric._validate_launched()
    fabric._strategy = Mock(spec=DataParallelStrategy)
    fabric._validate_launched()
    fabric._strategy = Mock(spec=DDPStrategy)
    with pytest.raises(RuntimeError, match=r"you must call `.launch\(\)`"):
        fabric._validate_launched()

    # Methods
    method_names = ("setup", "setup_module", "setup_dataloaders", "broadcast", "barrier", "all_reduce", "all_gather")
    for method_name in method_names:
        method = getattr(fabric, method_name)
        with pytest.raises(RuntimeError, match=r"you must call `.launch\(\)`"):
            method(Mock())

    # Context managers
    ctx_manager_names = ("init_module",)
    for ctx_manager_name in ctx_manager_names:
        ctx_manager = getattr(fabric, ctx_manager_name)
        with pytest.raises(RuntimeError, match=r"you must call `.launch\(\)`"), ctx_manager():
            pass  # the error is raised in the context manager and caught by `pytest.raises`

    fabric.launch()
    assert fabric._launched
    fabric._validate_launched()