# Copyright The Lightning AI team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import contextlib import json import logging import os from re import escape from typing import Any, Dict from unittest import mock from unittest.mock import ANY import pytest import torch import torch.nn.functional as F from lightning.pytorch import LightningDataModule, LightningModule, Trainer from lightning.pytorch.accelerators import CUDAAccelerator from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from lightning.pytorch.loggers import CSVLogger from lightning.pytorch.plugins import DeepSpeedPrecision from lightning.pytorch.strategies.deepspeed import _DEEPSPEED_AVAILABLE, DeepSpeedStrategy from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_11 as _TM_GE_0_11 from torch import Tensor, nn from torch.utils.data import DataLoader from torchmetrics import Accuracy from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf if _DEEPSPEED_AVAILABLE: import deepspeed from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict class ModelParallelBoringModel(BoringModel): def __init__(self): super().__init__() self.layer = None def configure_model(self) -> None: if self.layer is None: self.layer = torch.nn.Linear(32, 2) def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: self.configure_model() class ModelParallelBoringModelNoSchedulers(ModelParallelBoringModel): def configure_optimizers(self): return torch.optim.SGD(self.layer.parameters(), lr=0.1) class ModelParallelBoringModelManualOptim(BoringModel): def __init__(self): super().__init__() self.layer = None def training_step(self, batch, batch_idx): opt = self.optimizers() loss = self.step(batch) opt.zero_grad() self.manual_backward(loss) opt.step() def configure_model(self) -> None: if self.layer is None: self.layer = torch.nn.Linear(32, 2) def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: self.configure_model() @property def automatic_optimization(self) -> bool: return False @pytest.fixture() def deepspeed_config(): return { "optimizer": {"type": "SGD", "params": {"lr": 3e-5}}, "scheduler": { "type": "WarmupLR", "params": {"last_batch_iteration": -1, "warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 100}, }, } @pytest.fixture() def deepspeed_zero_config(deepspeed_config): return {**deepspeed_config, "zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}} @RunIf(deepspeed=True) @pytest.mark.parametrize("strategy", ["deepspeed", DeepSpeedStrategy]) def test_deepspeed_strategy_string(tmpdir, strategy): """Test to ensure that the strategy can be passed via string or instance, and parallel devices is correctly set.""" trainer = Trainer( 
accelerator="cpu", fast_dev_run=True, default_root_dir=tmpdir, strategy=strategy if isinstance(strategy, str) else strategy(), ) assert isinstance(trainer.strategy, DeepSpeedStrategy) assert trainer.strategy.parallel_devices == [torch.device("cpu")] @RunIf(deepspeed=True) def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): """Test to ensure that the strategy can be passed via a string with an environment variable.""" config_path = os.path.join(tmpdir, "temp.json") with open(config_path, "w") as f: f.write(json.dumps(deepspeed_config)) monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path) trainer = Trainer(accelerator="cpu", fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed") strategy = trainer.strategy assert isinstance(strategy, DeepSpeedStrategy) assert strategy.parallel_devices == [torch.device("cpu")] assert strategy.config == deepspeed_config @RunIf(deepspeed=True, mps=False) def test_deepspeed_precision_choice(cuda_count_1, tmpdir): """Test to ensure precision plugin is also correctly chosen. DeepSpeed handles precision via Custom DeepSpeedPrecision """ trainer = Trainer( fast_dev_run=True, default_root_dir=tmpdir, accelerator="gpu", strategy="deepspeed", precision="16-mixed", ) assert isinstance(trainer.strategy, DeepSpeedStrategy) assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecision) assert trainer.strategy.precision_plugin.precision == "16-mixed" @RunIf(deepspeed=True) def test_deepspeed_with_invalid_config_path(): """Test to ensure if we pass an invalid config path we throw an exception.""" with pytest.raises( MisconfigurationException, match="You passed in a path to a DeepSpeed config but the path does not exist" ): DeepSpeedStrategy(config="invalid_path.json") @RunIf(deepspeed=True) def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config): """Test to ensure if we pass an env variable, we load the config from the path.""" config_path = os.path.join(tmpdir, "temp.json") with open(config_path, "w") as f: f.write(json.dumps(deepspeed_config)) monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path) strategy = DeepSpeedStrategy() assert strategy.config == deepspeed_config @RunIf(deepspeed=True) def test_deepspeed_defaults(): """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed.""" strategy = DeepSpeedStrategy() assert strategy.config is not None assert isinstance(strategy.config["zero_optimization"], dict) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_warn_deepspeed_ignored(tmpdir): class TestModel(BoringModel): def backward(self, loss: Tensor, *args, **kwargs) -> None: return loss.backward() model = TestModel() trainer = Trainer( fast_dev_run=True, default_root_dir=tmpdir, strategy=DeepSpeedStrategy(), accelerator="gpu", devices=1, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) with pytest.warns(UserWarning, match="will be ignored since DeepSpeed handles the backward"): trainer.fit(model) @RunIf(min_cuda_gpus=1, deepspeed=True) @pytest.mark.parametrize( ("dataset_cls", "value"), [(RandomDataset, "auto"), (RandomDataset, 10), (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)], ) @mock.patch("deepspeed.init_distributed", autospec=True) @mock.patch("lightning.pytorch.Trainer.log_dir", new_callable=mock.PropertyMock, return_value="abc") def test_deepspeed_auto_batch_size_config_select(mock_deepspeed_distributed, mock_log_dir, tmpdir, dataset_cls, value): """Test to ensure that the batch size is correctly 
set as expected for deepspeed logging purposes.""" class TestModel(BoringModel): def train_dataloader(self): return DataLoader(dataset_cls(32, 64)) class AssertCallback(Callback): def setup(self, trainer, pl_module, stage: str) -> None: assert isinstance(trainer.strategy, DeepSpeedStrategy) config = trainer.strategy.config # int value overrides auto mode expected_value = value if isinstance(value, int) else 1 if dataset_cls == RandomDataset: expected_value = pl_module.train_dataloader().batch_size if value == "auto" else value assert config["train_micro_batch_size_per_gpu"] == expected_value raise SystemExit ck = AssertCallback() model = TestModel() trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, callbacks=ck, accelerator="gpu", devices=1, strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=value, zero_optimization=False), ) with pytest.raises(SystemExit): trainer.fit(model) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_run_configure_optimizers(tmpdir): """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using configure_optimizers for optimizers and schedulers.""" class TestCB(Callback): def on_train_start(self, trainer, pl_module) -> None: assert isinstance(trainer.optimizers[0], DeepSpeedZeroOptimizer) assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD) assert isinstance(trainer.lr_scheduler_configs[0].scheduler, torch.optim.lr_scheduler.StepLR) # check that the lr_scheduler config was preserved assert trainer.lr_scheduler_configs[0].name == "Sean" class TestModel(BoringModel): def configure_optimizers(self): [optimizer], [scheduler] = super().configure_optimizers() return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "name": "Sean"}} model = TestModel() lr_monitor = LearningRateMonitor() trainer = Trainer( strategy=DeepSpeedStrategy(), # disable ZeRO so our optimizers are not wrapped default_root_dir=tmpdir, accelerator="gpu", devices=1, fast_dev_run=True, precision="16-mixed", callbacks=[TestCB(), lr_monitor], logger=CSVLogger(tmpdir), enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) assert lr_monitor.lrs == {"Sean": [0.1]} _assert_save_model_is_equal(model, tmpdir, trainer) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_config(tmpdir, deepspeed_zero_config): """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers and saves the model weights to load correctly.""" class TestCB(Callback): def on_train_start(self, trainer, pl_module) -> None: from deepspeed.runtime.lr_schedules import WarmupLR assert isinstance(trainer.optimizers[0], DeepSpeedZeroOptimizer) assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD) assert isinstance(trainer.lr_scheduler_configs[0].scheduler, WarmupLR) assert trainer.lr_scheduler_configs[0].interval == "step" model = BoringModel() lr_monitor = LearningRateMonitor() trainer = Trainer( strategy=DeepSpeedStrategy(config=deepspeed_zero_config), default_root_dir=tmpdir, accelerator="gpu", devices=1, log_every_n_steps=1, limit_train_batches=4, limit_val_batches=4, limit_test_batches=4, max_epochs=2, precision="16-mixed", callbacks=[TestCB(), lr_monitor], logger=CSVLogger(tmpdir), enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) trainer.test(model) assert list(lr_monitor.lrs) == ["lr-SGD"] assert len(set(lr_monitor.lrs["lr-SGD"])) == 8 @RunIf(min_cuda_gpus=1, standalone=True, 
deepspeed=True) def test_deepspeed_custom_precision_params(tmpdir): """Ensure if we modify the FP16 parameters via the DeepSpeedStrategy, the deepspeed config contains these changes.""" class TestCB(Callback): def on_train_start(self, trainer, pl_module) -> None: assert trainer.strategy.config["fp16"]["loss_scale"] == 10 assert trainer.strategy.config["fp16"]["initial_scale_power"] == 11 assert trainer.strategy.config["fp16"]["loss_scale_window"] == 12 assert trainer.strategy.config["fp16"]["hysteresis"] == 13 assert trainer.strategy.config["fp16"]["min_loss_scale"] == 14 raise SystemExit() model = BoringModel() ds = DeepSpeedStrategy( loss_scale=10, initial_scale_power=11, loss_scale_window=12, hysteresis=13, min_loss_scale=14 ) trainer = Trainer( default_root_dir=tmpdir, strategy=ds, precision="16-mixed", accelerator="gpu", devices=1, callbacks=[TestCB()], enable_progress_bar=False, enable_model_summary=False, ) with pytest.raises(SystemExit): trainer.fit(model) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) @pytest.mark.parametrize("precision", ["fp16", "bf16"]) def test_deepspeed_inference_precision_during_inference(precision, tmpdir): """Ensure if we modify the precision for deepspeed and execute inference-only, the deepspeed config contains these changes.""" class TestCB(Callback): def on_validation_start(self, trainer, pl_module) -> None: assert trainer.strategy.config[precision] raise SystemExit() model = BoringModel() strategy = DeepSpeedStrategy(config={precision: {"enabled": True}}) trainer = Trainer( default_root_dir=tmpdir, strategy=strategy, accelerator="cuda", devices=1, callbacks=[TestCB()], barebones=True, ) with pytest.raises(SystemExit): trainer.validate(model) @RunIf(deepspeed=True) def test_deepspeed_custom_activation_checkpointing_params(tmpdir): """Ensure if we modify the activation checkpointing parameters, the deepspeed config contains these changes.""" ds = DeepSpeedStrategy( partition_activations=True, cpu_checkpointing=True, contiguous_memory_optimization=True, synchronize_checkpoint_boundary=True, ) checkpoint_config = ds.config["activation_checkpointing"] assert checkpoint_config["partition_activations"] assert checkpoint_config["cpu_checkpointing"] assert checkpoint_config["contiguous_memory_optimization"] assert checkpoint_config["synchronize_checkpoint_boundary"] @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): """Ensure if we modify the activation checkpointing parameters, we pass these to deepspeed.checkpointing.configure correctly.""" ds = DeepSpeedStrategy( partition_activations=True, cpu_checkpointing=True, contiguous_memory_optimization=True, synchronize_checkpoint_boundary=True, ) model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=1, strategy=ds, precision="16-mixed", accelerator="gpu", devices=1, enable_progress_bar=False, enable_model_summary=False, ) with mock.patch( "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure ) as deepspeed_checkpointing_configure: trainer.fit(model) deepspeed_checkpointing_configure.assert_called_with( mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None ) @RunIf(min_cuda_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config.""" 
deepspeed_zero_config["zero_optimization"]["offload_optimizer"] = False class TestCallback(Callback): def setup(self, trainer, pl_module, stage=None) -> None: assert trainer.strategy.config["zero_optimization"]["offload_optimizer"] is False raise SystemExit() model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=1, strategy=DeepSpeedStrategy(config=deepspeed_zero_config), precision="16-mixed", accelerator="gpu", devices=1, callbacks=[TestCallback()], ) with pytest.raises(SystemExit): trainer.fit(model) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multigpu(tmpdir): """Test to ensure that DeepSpeed with multiple GPUs works and deepspeed distributed is initialized correctly.""" model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=2, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) with mock.patch.object( model, "configure_optimizers", wraps=model.configure_optimizers ) as mock_configure_optimizers: trainer.test(model) assert mock_configure_optimizers.call_count == 0 with mock.patch("deepspeed.init_distributed", wraps=deepspeed.init_distributed) as mock_deepspeed_distributed: trainer.fit(model) mock_deepspeed_distributed.assert_called_once() _assert_save_model_is_equal(model, tmpdir, trainer) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_fp32_works(tmpdir): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=1, strategy="deepspeed_stage_3", fast_dev_run=True, enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_stage_3_save_warning(tmpdir): """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero.""" model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=2, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) checkpoint_path = os.path.join(tmpdir, "model.pt") # both ranks need to call save checkpoint, however only rank 0 needs to check the warning context_manager = ( pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory.") if trainer.is_global_zero else contextlib.suppress() ) with context_manager: trainer.save_checkpoint(checkpoint_path) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_multigpu_single_file(tmpdir): """Test to ensure that DeepSpeed loads from a single file checkpoint.""" model = BoringModel() checkpoint_path = os.path.join(tmpdir, "model.pt") trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="cpu", devices=1) trainer.fit(model) trainer.save_checkpoint(checkpoint_path) trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=1, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) strategy = trainer.strategy assert isinstance(strategy, DeepSpeedStrategy) assert not strategy.load_full_weights with pytest.raises(FileNotFoundError, match="The provided path is not a valid DeepSpeed checkpoint"): trainer.test(model, ckpt_path=checkpoint_path) trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3, load_full_weights=True), accelerator="gpu", 
devices=1, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) strategy = trainer.strategy assert isinstance(strategy, DeepSpeedStrategy) assert strategy.load_full_weights trainer.test(model, ckpt_path=checkpoint_path) class ModelParallelClassificationModel(LightningModule): def __init__(self, lr: float = 0.01, num_blocks: int = 5): super().__init__() self.lr = lr self.num_blocks = num_blocks self.prepare_data_per_node = True self.train_acc = self.valid_acc = self.test_acc = None self.model = None def make_block(self): return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU()) def configure_model(self) -> None: # As of deepspeed v0.9.3, in ZeRO stage 3 all submodules need to be created within this hook, # including the metrics. Otherwise, modules that aren't affected by `deepspeed.zero.Init()` # won't be moved to the GPU. See https://github.com/microsoft/DeepSpeed/pull/3611 if self.model is None: metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy() self.train_acc = metric.clone() self.valid_acc = metric.clone() self.test_acc = metric.clone() self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) def forward(self, x): x = self.model(x) # Ensure output is in float32 for softmax operation x = x.float() return F.softmax(x, dim=1) def training_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log("train_loss", loss, prog_bar=True) self.log("train_acc", self.train_acc(logits, y), prog_bar=True, sync_dist=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log("val_loss", F.cross_entropy(logits, y), prog_bar=False, sync_dist=True) self.log("val_acc", self.valid_acc(logits, y), prog_bar=True, sync_dist=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log("test_loss", F.cross_entropy(logits, y), prog_bar=False, sync_dist=True) self.log("test_acc", self.test_acc(logits, y), prog_bar=True, sync_dist=True) def predict_step(self, batch, batch_idx, dataloader_idx=0): x, y = batch logits = self.forward(x) self.test_acc(logits, y) return self.test_acc.compute() def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}] def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: if not hasattr(self, "model"): self.configure_model() # Lightning saves the lr schedulers, but DeepSpeed saves the optimizer states separately assert len(checkpoint["lr_schedulers"]) == 1 assert "optimizer_states" not in checkpoint class ManualModelParallelClassificationModel(ModelParallelClassificationModel): @property def automatic_optimization(self) -> bool: return False def training_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) loss = F.cross_entropy(logits, y) opt = self.optimizers() self.log("train_loss", loss, prog_bar=True) self.log("train_acc", self.train_acc(logits, y), prog_bar=True, sync_dist=True) opt.zero_grad() self.manual_backward(loss) opt.step() @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multigpu_stage_3(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), 
accelerator="gpu", devices=2, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) trainer.test(model) trainer.fit(model) _assert_save_model_is_equal(model, tmpdir, trainer) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModelManualOptim() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=2, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) trainer.test(model) trainer.fit(model) _assert_save_model_is_equal(model, tmpdir, trainer) @pytest.mark.xfail(strict=False, reason="skipped due to deepspeed/#2449, keep track @rohitgr7") @pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)]) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, sklearn=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches): model = ModelParallelClassificationModel() if automatic_optimization else ManualModelParallelClassificationModel() dm = ClassifDataModule() ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1) trainer = Trainer( default_root_dir=tmpdir, max_epochs=10, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=2, precision="16-mixed", accumulate_grad_batches=accumulate_grad_batches, callbacks=[ck], enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model, datamodule=dm) results = trainer.test(datamodule=dm) saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm) assert saved_results == results model = ModelParallelClassificationModel() if automatic_optimization else ManualModelParallelClassificationModel() trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy=DeepSpeedStrategy(stage=3), precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, sklearn=True) def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir): """Test to ensure with Stage 3 and multiple GPUs that we can resume from training, throwing a warning that the optimizer state and scheduler states cannot be restored.""" dm = ClassifDataModule() model = BoringModel() checkpoint_path = os.path.join(tmpdir, "model.pt") trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, enable_progress_bar=False, enable_model_summary=False, accelerator="cpu", devices=1, ) trainer.fit(model) trainer.save_checkpoint(checkpoint_path) trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, strategy=DeepSpeedStrategy(stage=3, load_full_weights=True), accelerator="gpu", devices=1, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) with pytest.warns( UserWarning, match="A single checkpoint file has been given. This means optimizer states cannot be restored. 
" "If you'd like to restore these states, you must " "provide a path to the originally saved DeepSpeed checkpoint.", ): trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, sklearn=True) def test_deepspeed_multigpu_stage_3_resume_training(tmpdir): """Test to ensure with Stage 3 and single GPU that we can resume training.""" initial_model = ModelParallelClassificationModel() dm = ClassifDataModule() ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1) initial_trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, limit_train_batches=2, limit_val_batches=2, limit_test_batches=2, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=1, precision="16-mixed", callbacks=[ck], enable_progress_bar=False, enable_model_summary=False, ) initial_trainer.fit(initial_model, datamodule=dm) class TestCallback(Callback): def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None: original_deepspeed_strategy = initial_trainer.strategy current_deepspeed_strategy = trainer.strategy assert isinstance(original_deepspeed_strategy, DeepSpeedStrategy) assert isinstance(current_deepspeed_strategy, DeepSpeedStrategy) # assert optimizer states are the correctly loaded original_optimizer_dict = original_deepspeed_strategy.deepspeed_engine.optimizer.state_dict() current_optimizer_dict = current_deepspeed_strategy.deepspeed_engine.optimizer.state_dict() for orig_tensor, current_tensor in zip( original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"] ): assert torch.all(orig_tensor.eq(current_tensor)) # assert model state is loaded correctly for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()): assert torch.equal(current_param.cpu(), initial_param.cpu()) # assert epoch has correctly been restored assert trainer.current_epoch == 1 # assert lr-scheduler states are loaded correctly original_lr_scheduler = initial_trainer.lr_scheduler_configs[0].scheduler current_lr_scheduler = trainer.lr_scheduler_configs[0].scheduler assert original_lr_scheduler.state_dict() == current_lr_scheduler.state_dict() model = ModelParallelClassificationModel() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=1, max_epochs=2, limit_train_batches=1, limit_val_batches=0, precision="16-mixed", callbacks=TestCallback(), enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path) @pytest.mark.parametrize("offload_optimizer", [False, True]) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, sklearn=True) def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.""" class VerificationCallback(Callback): def __init__(self): self.on_train_batch_start_called = False def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None: deepspeed_engine = trainer.strategy.model assert trainer.global_step == deepspeed_engine.global_steps self.on_train_batch_start_called = True model = ModelParallelClassificationModel() dm = ClassifDataModule() verification_callback = VerificationCallback() strategy = DeepSpeedStrategy(stage=2, offload_optimizer=offload_optimizer) strategy.config["zero_force_ds_cpu_optimizer"] = False trainer = Trainer( default_root_dir=tmpdir, # TODO: this test fails with 
max_epochs >1 as there are leftover batches per epoch.
        # there's divergence in how Lightning handles the last batch of the epoch with how DeepSpeed does it.
        # we step the optimizers on the last batch but DeepSpeed keeps the accumulation for the next epoch
        max_epochs=1,
        strategy=strategy,
        accelerator="gpu",
        devices=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision="16-mixed",
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called


@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_test(tmpdir):
    """Test to ensure we can run a test-only loop with DeepSpeed using ZeRO Stage 3."""
    model = ModelParallelBoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.test(model)


# TODO(Sean): Once partial parameter partitioning is supported this test should be re-enabled
@pytest.mark.xfail(strict=False, reason="Partial parameter partitioning for DeepSpeed is currently broken.")
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_partial_partition_parameters(tmpdir):
    """Test to ensure that a module defining a layer in both ``__init__`` and ``configure_model`` correctly converts
    all parameters to float16 when ``precision=16`` and runs successfully."""

    class TestModel(ModelParallelBoringModel):
        def __init__(self):
            super().__init__()
            self.layer_2 = torch.nn.Linear(32, 32)

        def configure_model(self) -> None:
            if self.layer is None:
                self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            x = self.layer_2(x)
            return self.layer(x)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)


@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_test_rnn(tmpdir):
    """Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when training
    with certain layers which will crash with explicit partitioning."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.rnn = torch.nn.GRU(32, 32)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)


@RunIf(deepspeed=True, mps=False)
@mock.patch("deepspeed.init_distributed", autospec=True)
@pytest.mark.parametrize("platform", ["Linux", "Windows"])
def test_deepspeed_strategy_env_variables(mock_deepspeed_distributed, tmpdir, platform):
    """Test to ensure that we set up distributed communication correctly.

    On Windows, the rank environment variables should not be set, and DeepSpeed should handle this.
""" trainer = Trainer(default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3)) strategy = trainer.strategy assert isinstance(strategy, DeepSpeedStrategy) with mock.patch("platform.system", return_value=platform) as mock_platform: strategy._init_deepspeed_distributed() mock_deepspeed_distributed.assert_called() mock_platform.assert_called() if platform == "Windows": # assert no env variables have been set within the DeepSpeedStrategy assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK")) else: assert os.environ["MASTER_ADDR"] == str(trainer.strategy.cluster_environment.main_address) assert os.environ["MASTER_PORT"] == str(trainer.strategy.cluster_environment.main_port) assert os.environ["RANK"] == str(trainer.strategy.global_rank) assert os.environ["WORLD_SIZE"] == str(trainer.strategy.world_size) assert os.environ["LOCAL_RANK"] == str(trainer.strategy.local_rank) def _assert_save_model_is_equal(model, tmpdir, trainer): checkpoint_path = os.path.join(tmpdir, "model.pt") checkpoint_path = trainer.strategy.broadcast(checkpoint_path) trainer.save_checkpoint(checkpoint_path) # carry out the check only on rank 0 if trainer.is_global_zero: single_ckpt_path = os.path.join(tmpdir, "single_model.pt") convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path) state_dict = torch.load(single_ckpt_path) model = model.cpu() # Assert model parameters are identical after loading for orig_param, saved_model_param in zip(model.parameters(), state_dict.values()): if model.dtype == torch.half: # moved model to float32 for comparison with single fp32 saved weights saved_model_param = saved_model_param.half() assert torch.equal(orig_param, saved_model_param) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multigpu_no_schedulers(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.""" model = ModelParallelBoringModelNoSchedulers() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3), accelerator="gpu", devices=2, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) _assert_save_model_is_equal(model, tmpdir, trainer) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_skip_backward_raises(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): return None model = TestModel() trainer = Trainer( default_root_dir=tmpdir, strategy=DeepSpeedStrategy(), accelerator="gpu", devices=1, fast_dev_run=True, precision="16-mixed", enable_progress_bar=False, enable_model_summary=False, ) with pytest.raises(MisconfigurationException, match="returning `None` .* is not supported"): trainer.fit(model) @RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): super().__init__() self._setup = False def setup(self, stage: str) -> None: self._setup = True def train_dataloader(self): assert self._setup return DataLoader(RandomDataset(32, 64), batch_size=2) def val_dataloader(self): assert self._setup return DataLoader(RandomDataset(32, 64), batch_size=2) def test_dataloader(self): assert self._setup return DataLoader(RandomDataset(32, 64), batch_size=2) model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, 
strategy=DeepSpeedStrategy(logging_level=logging.INFO),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    dm = TestSetupIsCalledDataModule()
    with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object:
        trainer.fit(model, datamodule=dm)
    assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list)


@mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True)
@pytest.mark.parametrize("interval", ["step", "epoch"])
@pytest.mark.parametrize("max_epoch", [2])
@pytest.mark.parametrize("limit_train_batches", [2])
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_scheduler_step_count(mock_step, tmpdir, max_epoch, limit_train_batches, interval):
    """Test to ensure that the scheduler is called the correct number of times during training when the scheduler
    interval is set to step or epoch."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return {
                "optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "interval": interval},
            }

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=0,
        max_epochs=max_epoch,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)
    if interval == "epoch":
        # stepped once at init plus once per epoch
        assert mock_step.call_count == 1 + max_epoch
    else:
        # stepped once at init plus once per training step
        assert mock_step.call_count == 1 + (max_epoch * limit_train_batches)


@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_configure_gradient_clipping(tmpdir):
    """Test to ensure that a warning is raised when `LightningModule.configure_gradient_clipping` is overridden while
    using DeepSpeed."""

    class TestModel(BoringModel):
        def configure_gradient_clipping(self, optimizer, gradient_clip_val, gradient_clip_algorithm):
            self.clip_gradients(optimizer, gradient_clip_val, gradient_clip_algorithm)

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.warns(UserWarning, match="handles gradient clipping internally"):
        trainer.fit(model)


@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_gradient_clip_by_value(tmpdir):
    """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        gradient_clip_algorithm="value",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.raises(MisconfigurationException, match="does not support clipping gradients by value"):
        trainer.fit(model)


@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multi_save_same_filepath(tmpdir):
    """Test that DeepSpeed saves only the latest checkpoint in the specified path and deletes the old sharded
    checkpoints."""

    class CustomModel(BoringModel):
        def training_step(self, *args, **kwargs):
            self.log("grank", self.global_rank)
            return super().training_step(*args, **kwargs)

    model = CustomModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy="deepspeed",
        accelerator="gpu",
        devices=2,
callbacks=[ModelCheckpoint(filename="{epoch}_{step}_{grank}", save_top_k=1)], limit_train_batches=1, limit_val_batches=0, num_sanity_val_steps=0, max_epochs=2, enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) filepath = "epoch=1_step=2_grank=0.0.ckpt" expected = {filepath} assert expected == set(os.listdir(trainer.checkpoint_callback.dirpath)) ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filepath) expected = {"latest", "zero_to_fp32.py", "checkpoint"} assert expected == set(os.listdir(ckpt_path)) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_with_bfloat16_precision(tmpdir): """Test that deepspeed works with bfloat16 precision.""" model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, strategy="deepspeed_stage_3", accelerator="gpu", devices=2, fast_dev_run=True, precision="bf16-mixed", num_sanity_val_steps=0, enable_progress_bar=False, enable_model_summary=False, ) trainer.fit(model) assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecision) assert trainer.strategy.precision_plugin.precision == "bf16-mixed" assert trainer.strategy.config["zero_optimization"]["stage"] == 3 assert trainer.strategy.config["bf16"]["enabled"] assert model.layer.weight.dtype == torch.bfloat16 @RunIf(deepspeed=True) def test_error_with_invalid_accelerator(tmpdir): """Test DeepSpeedStrategy raises an exception if an invalid accelerator is used.""" trainer = Trainer( default_root_dir=tmpdir, accelerator="cpu", strategy="deepspeed", fast_dev_run=True, ) model = BoringModel() with pytest.raises(RuntimeError, match="DeepSpeed strategy is only supported on CUDA"): trainer.fit(model) @RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_configure_optimizer_device_set(tmpdir): """Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, and estimated_stepping_batches works correctly as a result.""" class TestModel(BoringModel): def configure_optimizers(self): assert self.trainer.estimated_stepping_batches == 1 assert self.device.type == "cuda" raise SystemExit model = TestModel() trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, accelerator="gpu", devices=2, strategy=DeepSpeedStrategy(), ) with pytest.raises(SystemExit): trainer.fit(model) @RunIf(deepspeed=True) @pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]]) def test_validate_parallel_devices_indices(device_indices): """Test that the strategy validates that it doesn't support selecting specific devices by index. DeepSpeed doesn't support it and needs the index to match to the local rank of the process. 
""" strategy = DeepSpeedStrategy( accelerator=CUDAAccelerator(), parallel_devices=[torch.device("cuda", i) for i in device_indices] ) with pytest.raises( RuntimeError, match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") ): strategy.setup_environment() @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True) def test_deepspeed_init_module_with_stage_3(): """Tests how `.init_module()` behaves with ZeRO stage 3.""" trainer = Trainer( accelerator="cuda", devices=2, strategy="deepspeed_stage_3", precision="bf16-mixed", fast_dev_run=1 ) model = ModelParallelBoringModel() with mock.patch("deepspeed.zero.Init") as zero_init_mock: trainer.fit(model) zero_init_mock.assert_called_once_with(enabled=True, remote_device=None, config_dict_or_path=ANY) @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True) @pytest.mark.parametrize("stage", [1, 2]) def test_deepspeed_init_module_with_stages_1_2(stage): """Tests how `.init_module()` behaves with ZeRO stages 1 and 2.""" strategy = DeepSpeedStrategy(stage=stage) trainer = Trainer(accelerator="cuda", devices=2, strategy=strategy, precision="bf16-mixed", fast_dev_run=1) model = ModelParallelBoringModel() with mock.patch("deepspeed.zero.Init") as zero_init_mock: trainer.fit(model) zero_init_mock.assert_called_once_with(enabled=False, remote_device=None, config_dict_or_path=ANY) assert model.layer.weight.dtype == torch.bfloat16 @RunIf(deepspeed=True) def test_deepspeed_load_checkpoint_validate_path(tmp_path): """Test that we validate the checkpoint path for a DeepSpeed checkpoint and give suggestions for user error.""" strategy = DeepSpeedStrategy() with pytest.raises(FileNotFoundError, match="The provided path is not a valid DeepSpeed checkpoint"): strategy.load_checkpoint(checkpoint_path=tmp_path) # User tries to pass the subfolder as the path checkpoint_path = tmp_path / "checkpoint" checkpoint_path.mkdir() with pytest.raises(FileNotFoundError, match=f"Try to load using this parent directory instead: {tmp_path}"): strategy.load_checkpoint(checkpoint_path=checkpoint_path) # User tries to pass an individual file inside the checkpoint folder checkpoint_path = checkpoint_path / "zero_pp_rank_0_mp_rank_00_model_states.pt" checkpoint_path.touch() with pytest.raises(FileNotFoundError, match=f"Try to load using this parent directory instead: {tmp_path}"): strategy.load_checkpoint(checkpoint_path=checkpoint_path)