# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import json
import logging
import os
from re import escape
from typing import Any, Dict
from unittest import mock
from unittest.mock import ANY

import pytest
import torch
import torch.nn.functional as F
from lightning.pytorch import LightningDataModule, LightningModule, Trainer
from lightning.pytorch.accelerators import CUDAAccelerator
from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.plugins import DeepSpeedPrecision
from lightning.pytorch.strategies.deepspeed import _DEEPSPEED_AVAILABLE, DeepSpeedStrategy
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_11 as _TM_GE_0_11
from torch import Tensor, nn
from torch.utils.data import DataLoader
from torchmetrics import Accuracy

from tests_pytorch.helpers.datamodules import ClassifDataModule
from tests_pytorch.helpers.runif import RunIf

if _DEEPSPEED_AVAILABLE:
    import deepspeed
    from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
    from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

class ModelParallelBoringModel(BoringModel):
    def __init__(self):
        super().__init__()
        self.layer = None

    def configure_model(self) -> None:
        if self.layer is None:
            self.layer = torch.nn.Linear(32, 2)

    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        self.configure_model()


class ModelParallelBoringModelNoSchedulers(ModelParallelBoringModel):
    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


class ModelParallelBoringModelManualOptim(BoringModel):
    def __init__(self):
        super().__init__()
        self.layer = None

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        loss = self.step(batch)
        opt.zero_grad()
        self.manual_backward(loss)
        opt.step()

    def configure_model(self) -> None:
        if self.layer is None:
            self.layer = torch.nn.Linear(32, 2)

    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        self.configure_model()

    @property
    def automatic_optimization(self) -> bool:
        return False

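
# Note: the models above defer layer creation to `configure_model` so that, under
# ZeRO stage 3, parameters are created inside the `deepspeed.zero.Init` context and
# sharded at construction time instead of being fully materialized in `__init__`
# (see the comment in `ModelParallelClassificationModel.configure_model` below).
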
@pytest.fixture()
def deepspeed_config():
    return {
        "optimizer": {"type": "SGD", "params": {"lr": 3e-5}},
        "scheduler": {
            "type": "WarmupLR",
            "params": {"last_batch_iteration": -1, "warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 100},
        },
    }


@pytest.fixture()
def deepspeed_zero_config(deepspeed_config):
    return {**deepspeed_config, "zero_allow_untested_optimizer": True, "zero_optimization": {"stage": 2}}

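
# For reference (illustrative only, not used by the tests): a config dict like the
# fixtures above can be handed to the strategy directly, or as a path to a JSON
# file with the same contents, e.g.
#   DeepSpeedStrategy(config={"zero_optimization": {"stage": 2}})
#   DeepSpeedStrategy(config="/path/to/deepspeed_config.json")
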
@RunIf(deepspeed=True)
@pytest.mark.parametrize("strategy", ["deepspeed", DeepSpeedStrategy])
def test_deepspeed_strategy_string(tmpdir, strategy):
    """Test to ensure that the strategy can be passed via string or instance, and that the parallel devices are
    correctly set."""
    trainer = Trainer(
        accelerator="cpu",
        fast_dev_run=True,
        default_root_dir=tmpdir,
        strategy=strategy if isinstance(strategy, str) else strategy(),
    )

    assert isinstance(trainer.strategy, DeepSpeedStrategy)
    assert trainer.strategy.parallel_devices == [torch.device("cpu")]

@RunIf(deepspeed=True)
def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config):
    """Test to ensure that the strategy can be configured via a string together with the ``PL_DEEPSPEED_CONFIG_PATH``
    environment variable."""
    config_path = os.path.join(tmpdir, "temp.json")
    with open(config_path, "w") as f:
        f.write(json.dumps(deepspeed_config))
    monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path)

    trainer = Trainer(accelerator="cpu", fast_dev_run=True, default_root_dir=tmpdir, strategy="deepspeed")

    strategy = trainer.strategy
    assert isinstance(strategy, DeepSpeedStrategy)
    assert strategy.parallel_devices == [torch.device("cpu")]
    assert strategy.config == deepspeed_config

@RunIf(deepspeed=True, mps=False)
def test_deepspeed_precision_choice(cuda_count_1, tmpdir):
    """Test to ensure that the precision plugin is correctly chosen.

    DeepSpeed handles precision via its custom ``DeepSpeedPrecision`` plugin.

    """
    trainer = Trainer(
        fast_dev_run=True,
        default_root_dir=tmpdir,
        accelerator="gpu",
        strategy="deepspeed",
        precision="16-mixed",
    )

    assert isinstance(trainer.strategy, DeepSpeedStrategy)
    assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecision)
    assert trainer.strategy.precision_plugin.precision == "16-mixed"

@RunIf(deepspeed=True)
def test_deepspeed_with_invalid_config_path():
    """Test to ensure that an exception is raised when the given config path does not exist."""
    with pytest.raises(
        MisconfigurationException, match="You passed in a path to a DeepSpeed config but the path does not exist"
    ):
        DeepSpeedStrategy(config="invalid_path.json")


@RunIf(deepspeed=True)
def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config):
    """Test to ensure that the config is loaded from the path set in the environment variable."""
    config_path = os.path.join(tmpdir, "temp.json")
    with open(config_path, "w") as f:
        f.write(json.dumps(deepspeed_config))
    monkeypatch.setenv("PL_DEEPSPEED_CONFIG_PATH", config_path)
    strategy = DeepSpeedStrategy()
    assert strategy.config == deepspeed_config

@RunIf(deepspeed=True)
def test_deepspeed_defaults():
    """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed."""
    strategy = DeepSpeedStrategy()
    assert strategy.config is not None
    assert isinstance(strategy.config["zero_optimization"], dict)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_warn_deepspeed_ignored(tmpdir):
    class TestModel(BoringModel):
        def backward(self, loss: Tensor, *args, **kwargs) -> None:
            return loss.backward()

    model = TestModel()
    trainer = Trainer(
        fast_dev_run=True,
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(),
        accelerator="gpu",
        devices=1,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.warns(UserWarning, match="will be ignored since DeepSpeed handles the backward"):
        trainer.fit(model)

@RunIf(min_cuda_gpus=1, deepspeed=True)
@pytest.mark.parametrize(
    ("dataset_cls", "value"),
    [(RandomDataset, "auto"), (RandomDataset, 10), (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)],
)
@mock.patch("deepspeed.init_distributed", autospec=True)
@mock.patch("lightning.pytorch.Trainer.log_dir", new_callable=mock.PropertyMock, return_value="abc")
def test_deepspeed_auto_batch_size_config_select(mock_deepspeed_distributed, mock_log_dir, tmpdir, dataset_cls, value):
    """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""

    class TestModel(BoringModel):
        def train_dataloader(self):
            return DataLoader(dataset_cls(32, 64))

    class AssertCallback(Callback):
        def setup(self, trainer, pl_module, stage: str) -> None:
            assert isinstance(trainer.strategy, DeepSpeedStrategy)
            config = trainer.strategy.config

            # int value overrides auto mode
            expected_value = value if isinstance(value, int) else 1
            if dataset_cls == RandomDataset:
                expected_value = pl_module.train_dataloader().batch_size if value == "auto" else value

            assert config["train_micro_batch_size_per_gpu"] == expected_value
            raise SystemExit

    ck = AssertCallback()
    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        callbacks=ck,
        accelerator="gpu",
        devices=1,
        strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=value, zero_optimization=False),
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

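
# Note: with `logging_batch_size_per_gpu="auto"`, the strategy infers
# `train_micro_batch_size_per_gpu` from the train dataloader's batch size; for an
# iterable dataset where no batch size can be inferred, it falls back to 1, as
# asserted in the callback above.
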
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_run_configure_optimizers(tmpdir):
    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), whilst using
    configure_optimizers for optimizers and schedulers."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            assert isinstance(trainer.optimizers[0], DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_scheduler_configs[0].scheduler, torch.optim.lr_scheduler.StepLR)
            # check that the lr_scheduler config was preserved
            assert trainer.lr_scheduler_configs[0].name == "Sean"

    class TestModel(BoringModel):
        def configure_optimizers(self):
            [optimizer], [scheduler] = super().configure_optimizers()
            return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "name": "Sean"}}

    model = TestModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        strategy=DeepSpeedStrategy(),  # disable ZeRO so our optimizers are not wrapped
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        callbacks=[TestCB(), lr_monitor],
        logger=CSVLogger(tmpdir),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

    assert lr_monitor.lrs == {"Sean": [0.1]}

    _assert_save_model_is_equal(model, tmpdir, trainer)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
    """Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
    and saves the model weights to load correctly."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            from deepspeed.runtime.lr_schedules import WarmupLR

            assert isinstance(trainer.optimizers[0], DeepSpeedZeroOptimizer)
            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
            assert isinstance(trainer.lr_scheduler_configs[0].scheduler, WarmupLR)
            assert trainer.lr_scheduler_configs[0].interval == "step"

    model = BoringModel()
    lr_monitor = LearningRateMonitor()
    trainer = Trainer(
        strategy=DeepSpeedStrategy(config=deepspeed_zero_config),
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        log_every_n_steps=1,
        limit_train_batches=4,
        limit_val_batches=4,
        limit_test_batches=4,
        max_epochs=2,
        precision="16-mixed",
        callbacks=[TestCB(), lr_monitor],
        logger=CSVLogger(tmpdir),
        enable_progress_bar=False,
        enable_model_summary=False,
    )

    trainer.fit(model)
    trainer.test(model)
    assert list(lr_monitor.lrs) == ["lr-SGD"]
    assert len(set(lr_monitor.lrs["lr-SGD"])) == 8

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_custom_precision_params(tmpdir):
    """Ensure if we modify the FP16 parameters via the DeepSpeedStrategy, the deepspeed config contains these
    changes."""

    class TestCB(Callback):
        def on_train_start(self, trainer, pl_module) -> None:
            assert trainer.strategy.config["fp16"]["loss_scale"] == 10
            assert trainer.strategy.config["fp16"]["initial_scale_power"] == 11
            assert trainer.strategy.config["fp16"]["loss_scale_window"] == 12
            assert trainer.strategy.config["fp16"]["hysteresis"] == 13
            assert trainer.strategy.config["fp16"]["min_loss_scale"] == 14
            raise SystemExit()

    model = BoringModel()
    ds = DeepSpeedStrategy(
        loss_scale=10, initial_scale_power=11, loss_scale_window=12, hysteresis=13, min_loss_scale=14
    )
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=ds,
        precision="16-mixed",
        accelerator="gpu",
        devices=1,
        callbacks=[TestCB()],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@pytest.mark.parametrize("precision", ["fp16", "bf16"])
def test_deepspeed_inference_precision_during_inference(precision, tmpdir):
    """Ensure if we modify the precision for deepspeed and execute inference-only, the deepspeed config contains these
    changes."""

    class TestCB(Callback):
        def on_validation_start(self, trainer, pl_module) -> None:
            assert trainer.strategy.config[precision]
            raise SystemExit()

    model = BoringModel()
    strategy = DeepSpeedStrategy(config={precision: {"enabled": True}})

    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=strategy,
        accelerator="cuda",
        devices=1,
        callbacks=[TestCB()],
        barebones=True,
    )
    with pytest.raises(SystemExit):
        trainer.validate(model)

@RunIf(deepspeed=True)
def test_deepspeed_custom_activation_checkpointing_params(tmpdir):
    """Ensure if we modify the activation checkpointing parameters, the deepspeed config contains these changes."""
    ds = DeepSpeedStrategy(
        partition_activations=True,
        cpu_checkpointing=True,
        contiguous_memory_optimization=True,
        synchronize_checkpoint_boundary=True,
    )
    checkpoint_config = ds.config["activation_checkpointing"]
    assert checkpoint_config["partition_activations"]
    assert checkpoint_config["cpu_checkpointing"]
    assert checkpoint_config["contiguous_memory_optimization"]
    assert checkpoint_config["synchronize_checkpoint_boundary"]

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir):
    """Ensure if we modify the activation checkpointing parameters, we pass these to deepspeed.checkpointing.configure
    correctly."""
    ds = DeepSpeedStrategy(
        partition_activations=True,
        cpu_checkpointing=True,
        contiguous_memory_optimization=True,
        synchronize_checkpoint_boundary=True,
    )

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=1,
        strategy=ds,
        precision="16-mixed",
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with mock.patch(
        "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure
    ) as deepspeed_checkpointing_configure:
        trainer.fit(model)

    deepspeed_checkpointing_configure.assert_called_with(
        mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None
    )

@RunIf(min_cuda_gpus=1, deepspeed=True)
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
    """Ensure that if we use a config and turn off offload_optimizer, it is set to False within the config."""
    deepspeed_zero_config["zero_optimization"]["offload_optimizer"] = False

    class TestCallback(Callback):
        def setup(self, trainer, pl_module, stage=None) -> None:
            assert trainer.strategy.config["zero_optimization"]["offload_optimizer"] is False
            raise SystemExit()

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        strategy=DeepSpeedStrategy(config=deepspeed_zero_config),
        precision="16-mixed",
        accelerator="gpu",
        devices=1,
        callbacks=[TestCallback()],
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu(tmpdir):
    """Test to ensure that DeepSpeed with multiple GPUs works and deepspeed distributed is initialized correctly."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )

    with mock.patch.object(
        model, "configure_optimizers", wraps=model.configure_optimizers
    ) as mock_configure_optimizers:
        trainer.test(model)
    assert mock_configure_optimizers.call_count == 0

    with mock.patch("deepspeed.init_distributed", wraps=deepspeed.init_distributed) as mock_deepspeed_distributed:
        trainer.fit(model)
    mock_deepspeed_distributed.assert_called_once()

    _assert_save_model_is_equal(model, tmpdir, trainer)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_fp32_works(tmpdir):
    """Test that DeepSpeed ZeRO Stage 3 trains successfully with the default fp32 precision."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed_stage_3",
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_stage_3_save_warning(tmpdir):
    """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)
    checkpoint_path = os.path.join(tmpdir, "model.pt")

    # both ranks need to call save checkpoint, however only rank 0 needs to check the warning
    context_manager = (
        pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory.")
        if trainer.is_global_zero
        else contextlib.suppress()
    )
    with context_manager:
        trainer.save_checkpoint(checkpoint_path)

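
# Note: under ZeRO stage 3, `trainer.save_checkpoint` writes a directory of
# per-rank shards at the given path rather than a single file, which is what the
# warning asserted above refers to.
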
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_single_file(tmpdir):
    """Test to ensure that DeepSpeed loads from a single file checkpoint."""
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator="cpu", devices=1)
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    strategy = trainer.strategy
    assert isinstance(strategy, DeepSpeedStrategy)
    assert not strategy.load_full_weights
    with pytest.raises(FileNotFoundError, match="The provided path is not a valid DeepSpeed checkpoint"):
        trainer.test(model, ckpt_path=checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3, load_full_weights=True),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    strategy = trainer.strategy
    assert isinstance(strategy, DeepSpeedStrategy)
    assert strategy.load_full_weights
    trainer.test(model, ckpt_path=checkpoint_path)

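
# Note: `load_full_weights=True` is what allows a stage 3 strategy to restore from
# a single consolidated checkpoint file (weights only); without it, only DeepSpeed's
# native sharded checkpoint directories are accepted, as exercised above.
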
class ModelParallelClassificationModel(LightningModule):
    def __init__(self, lr: float = 0.01, num_blocks: int = 5):
        super().__init__()
        self.lr = lr
        self.num_blocks = num_blocks
        self.prepare_data_per_node = True
        self.train_acc = self.valid_acc = self.test_acc = None
        self.model = None

    def make_block(self):
        return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU())

    def configure_model(self) -> None:
        # As of deepspeed v0.9.3, in ZeRO stage 3 all submodules need to be created within this hook,
        # including the metrics. Otherwise, modules that aren't affected by `deepspeed.zero.Init()`
        # won't be moved to the GPU. See https://github.com/microsoft/DeepSpeed/pull/3611
        if self.model is None:
            metric = Accuracy(task="multiclass", num_classes=3) if _TM_GE_0_11 else Accuracy()
            self.train_acc = metric.clone()
            self.valid_acc = metric.clone()
            self.test_acc = metric.clone()
            self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))

    def forward(self, x):
        x = self.model(x)
        # Ensure output is in float32 for softmax operation
        x = x.float()
        return F.softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc(logits, y), prog_bar=True, sync_dist=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        self.log("val_loss", F.cross_entropy(logits, y), prog_bar=False, sync_dist=True)
        self.log("val_acc", self.valid_acc(logits, y), prog_bar=True, sync_dist=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        self.log("test_loss", F.cross_entropy(logits, y), prog_bar=False, sync_dist=True)
        self.log("test_acc", self.test_acc(logits, y), prog_bar=True, sync_dist=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x, y = batch
        logits = self.forward(x)
        self.test_acc(logits, y)
        return self.test_acc.compute()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]

    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        if not hasattr(self, "model"):
            self.configure_model()

        # Lightning saves the lr schedulers, but DeepSpeed saves the optimizer states separately
        assert len(checkpoint["lr_schedulers"]) == 1
        assert "optimizer_states" not in checkpoint

class ManualModelParallelClassificationModel(ModelParallelClassificationModel):
    @property
    def automatic_optimization(self) -> bool:
        return False

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits, y)
        opt = self.optimizers()
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", self.train_acc(logits, y), prog_bar=True, sync_dist=True)
        opt.zero_grad()
        self.manual_backward(loss)
        opt.step()

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_stage_3(tmpdir):
    """Test to ensure ZeRO Stage 3 works with a parallel model."""
    model = ModelParallelBoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.test(model)
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_stage_3_manual_optimization(tmpdir, deepspeed_config):
    """Test to ensure ZeRO Stage 3 works with a parallel model using manual optimization."""
    model = ModelParallelBoringModelManualOptim()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.test(model)
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

@pytest.mark.xfail(strict=False, reason="skipped due to deepspeed/#2449, keep track @rohitgr7")
@pytest.mark.parametrize(("accumulate_grad_batches", "automatic_optimization"), [(1, False), (2, True)])
@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, sklearn=True)
def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, automatic_optimization, accumulate_grad_batches):
    model = ModelParallelClassificationModel() if automatic_optimization else ManualModelParallelClassificationModel()
    dm = ClassifDataModule()
    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=10,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        precision="16-mixed",
        accumulate_grad_batches=accumulate_grad_batches,
        callbacks=[ck],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model, datamodule=dm)

    results = trainer.test(datamodule=dm)
    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
    assert saved_results == results

    model = ModelParallelClassificationModel() if automatic_optimization else ManualModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=2,
        strategy=DeepSpeedStrategy(stage=3),
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.test(model, datamodule=dm, ckpt_path=ck.best_model_path)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, sklearn=True)
def test_deepspeed_multigpu_stage_3_warns_resume_training(tmpdir):
    """Test to ensure that with Stage 3 we can resume training from a single-file checkpoint, with a warning that the
    optimizer and scheduler states cannot be restored."""
    dm = ClassifDataModule()
    model = BoringModel()
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
        accelerator="cpu",
        devices=1,
    )
    trainer.fit(model)
    trainer.save_checkpoint(checkpoint_path)

    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        strategy=DeepSpeedStrategy(stage=3, load_full_weights=True),
        accelerator="gpu",
        devices=1,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.warns(
        UserWarning,
        match="A single checkpoint file has been given. This means optimizer states cannot be restored. "
        "If you'd like to restore these states, you must "
        "provide a path to the originally saved DeepSpeed checkpoint.",
    ):
        trainer.fit(model, datamodule=dm, ckpt_path=checkpoint_path)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, sklearn=True)
def test_deepspeed_multigpu_stage_3_resume_training(tmpdir):
    """Test to ensure that with Stage 3 and a single GPU we can resume training."""
    initial_model = ModelParallelClassificationModel()
    dm = ClassifDataModule()

    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
    initial_trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        precision="16-mixed",
        callbacks=[ck],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    initial_trainer.fit(initial_model, datamodule=dm)

    class TestCallback(Callback):
        def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
            original_deepspeed_strategy = initial_trainer.strategy
            current_deepspeed_strategy = trainer.strategy

            assert isinstance(original_deepspeed_strategy, DeepSpeedStrategy)
            assert isinstance(current_deepspeed_strategy, DeepSpeedStrategy)
            # assert optimizer states are correctly loaded
            original_optimizer_dict = original_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            current_optimizer_dict = current_deepspeed_strategy.deepspeed_engine.optimizer.state_dict()
            for orig_tensor, current_tensor in zip(
                original_optimizer_dict["fp32_flat_groups"], current_optimizer_dict["fp32_flat_groups"]
            ):
                assert torch.all(orig_tensor.eq(current_tensor))
            # assert model state is loaded correctly
            for current_param, initial_param in zip(pl_module.parameters(), initial_model.parameters()):
                assert torch.equal(current_param.cpu(), initial_param.cpu())
            # assert epoch has correctly been restored
            assert trainer.current_epoch == 1

            # assert lr-scheduler states are loaded correctly
            original_lr_scheduler = initial_trainer.lr_scheduler_configs[0].scheduler
            current_lr_scheduler = trainer.lr_scheduler_configs[0].scheduler
            assert original_lr_scheduler.state_dict() == current_lr_scheduler.state_dict()

    model = ModelParallelClassificationModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        max_epochs=2,
        limit_train_batches=1,
        limit_val_batches=0,
        precision="16-mixed",
        callbacks=TestCallback(),
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model, datamodule=dm, ckpt_path=ck.best_model_path)

@pytest.mark.parametrize("offload_optimizer", [False, True])
@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, sklearn=True)
def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
    """Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works."""

    class VerificationCallback(Callback):
        def __init__(self):
            self.on_train_batch_start_called = False

        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int) -> None:
            deepspeed_engine = trainer.strategy.model
            assert trainer.global_step == deepspeed_engine.global_steps
            self.on_train_batch_start_called = True

    model = ModelParallelClassificationModel()
    dm = ClassifDataModule()
    verification_callback = VerificationCallback()
    strategy = DeepSpeedStrategy(stage=2, offload_optimizer=offload_optimizer)
    strategy.config["zero_force_ds_cpu_optimizer"] = False
    trainer = Trainer(
        default_root_dir=tmpdir,
        # TODO: this test fails with max_epochs >1 as there are leftover batches per epoch.
        # there's divergence in how Lightning handles the last batch of the epoch with how DeepSpeed does it.
        # we step the optimizers on the last batch but DeepSpeed keeps the accumulation for the next epoch
        max_epochs=1,
        strategy=strategy,
        accelerator="gpu",
        devices=2,
        limit_train_batches=5,
        limit_val_batches=2,
        precision="16-mixed",
        accumulate_grad_batches=2,
        callbacks=[verification_callback],
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    assert trainer.limit_train_batches % trainer.accumulate_grad_batches != 0, "leftover batches should be tested"
    trainer.fit(model, datamodule=dm)
    assert verification_callback.on_train_batch_start_called

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_test(tmpdir):
    """Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3."""
    model = ModelParallelBoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.test(model)

# TODO(Sean): Once partial parameter partitioning is supported this test should be re-enabled
@pytest.mark.xfail(strict=False, reason="Partial parameter partitioning for DeepSpeed is currently broken.")
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_partial_partition_parameters(tmpdir):
    """Test to ensure that a module that defines layers both inside ``__init__`` and inside ``configure_model``
    correctly converts all parameters to float16 when ``precision="16-mixed"`` and runs successfully."""

    class TestModel(ModelParallelBoringModel):
        def __init__(self):
            super().__init__()
            self.layer_2 = torch.nn.Linear(32, 32)

        def configure_model(self) -> None:
            if self.layer is None:
                self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            x = self.layer_2(x)
            return self.layer(x)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_test_rnn(tmpdir):
    """Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when training
    with certain layers which will crash with explicit partitioning."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.rnn = torch.nn.GRU(32, 32)

        def on_train_epoch_start(self) -> None:
            assert all(x.dtype == torch.float16 for x in self.parameters())

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

@RunIf(deepspeed=True, mps=False)
@mock.patch("deepspeed.init_distributed", autospec=True)
@pytest.mark.parametrize("platform", ["Linux", "Windows"])
def test_deepspeed_strategy_env_variables(mock_deepspeed_distributed, tmpdir, platform):
    """Test to ensure that we set up distributed communication correctly.

    On Windows, the rank environment variables should not be set, and deepspeed should handle this.

    """
    trainer = Trainer(default_root_dir=tmpdir, strategy=DeepSpeedStrategy(stage=3))
    strategy = trainer.strategy
    assert isinstance(strategy, DeepSpeedStrategy)
    with mock.patch("platform.system", return_value=platform) as mock_platform:
        strategy._init_deepspeed_distributed()
    mock_deepspeed_distributed.assert_called()
    mock_platform.assert_called()
    if platform == "Windows":
        # assert no env variables have been set within the DeepSpeedStrategy
        assert all(k not in os.environ for k in ("MASTER_PORT", "MASTER_ADDR", "RANK", "WORLD_SIZE", "LOCAL_RANK"))
    else:
        assert os.environ["MASTER_ADDR"] == str(trainer.strategy.cluster_environment.main_address)
        assert os.environ["MASTER_PORT"] == str(trainer.strategy.cluster_environment.main_port)
        assert os.environ["RANK"] == str(trainer.strategy.global_rank)
        assert os.environ["WORLD_SIZE"] == str(trainer.strategy.world_size)
        assert os.environ["LOCAL_RANK"] == str(trainer.strategy.local_rank)

def _assert_save_model_is_equal(model, tmpdir, trainer):
    checkpoint_path = os.path.join(tmpdir, "model.pt")
    checkpoint_path = trainer.strategy.broadcast(checkpoint_path)
    trainer.save_checkpoint(checkpoint_path)

    # carry out the check only on rank 0
    if trainer.is_global_zero:
        single_ckpt_path = os.path.join(tmpdir, "single_model.pt")
        convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path)
        state_dict = torch.load(single_ckpt_path)

        model = model.cpu()
        # Assert model parameters are identical after loading
        for orig_param, saved_model_param in zip(model.parameters(), state_dict.values()):
            if model.dtype == torch.half:
                # the model is half precision, so cast the fp32 saved weights down for comparison
                saved_model_param = saved_model_param.half()
            assert torch.equal(orig_param, saved_model_param)

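
# Note: `convert_zero_checkpoint_to_fp32_state_dict` consolidates the sharded ZeRO
# checkpoint directory written by `trainer.save_checkpoint` into a single fp32
# state dict file, which the helper above compares parameter-by-parameter against
# the in-memory model.
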
@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multigpu_no_schedulers(tmpdir):
    """Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers."""
    model = ModelParallelBoringModelNoSchedulers()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(stage=3),
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

    _assert_save_model_is_equal(model, tmpdir, trainer)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_skip_backward_raises(tmpdir):
    """Test that returning ``None`` from ``training_step`` raises an error with DeepSpeed, since skipping the backward
    pass is not supported."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            return None

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        precision="16-mixed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.raises(MisconfigurationException, match="returning `None` .* is not supported"):
        trainer.fit(model)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_setup_train_dataloader(tmpdir):
    """Test that DeepSpeed works when the DataModule's ``setup`` hook needs to run before the dataloaders are
    requested."""

    class TestSetupIsCalledDataModule(LightningDataModule):
        def __init__(self):
            super().__init__()
            self._setup = False

        def setup(self, stage: str) -> None:
            self._setup = True

        def train_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def val_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

        def test_dataloader(self):
            assert self._setup
            return DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy=DeepSpeedStrategy(logging_level=logging.INFO),
        accelerator="gpu",
        devices=1,
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    dm = TestSetupIsCalledDataModule()
    with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object:
        trainer.fit(model, datamodule=dm)
    assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list)

@mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True)
@pytest.mark.parametrize("interval", ["step", "epoch"])
@pytest.mark.parametrize("max_epoch", [2])
@pytest.mark.parametrize("limit_train_batches", [2])
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_scheduler_step_count(mock_step, tmpdir, max_epoch, limit_train_batches, interval):
    """Test to ensure that the scheduler is called the correct amount of times during training when the scheduler
    interval is set to step or epoch."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
            return {
                "optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "interval": interval},
            }

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=limit_train_batches,
        limit_val_batches=0,
        max_epochs=max_epoch,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)
    if interval == "epoch":
        # assert called once at init and once per epoch
        assert mock_step.call_count == 1 + max_epoch
    else:
        # assert called once at init and once per training step
        assert mock_step.call_count == 1 + (max_epoch * limit_train_batches)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_configure_gradient_clipping(tmpdir):
    """Test to ensure that a warning is raised when `LightningModule.configure_gradient_clipping` is overridden when
    using deepspeed."""

    class TestModel(BoringModel):
        def configure_gradient_clipping(self, optimizer, gradient_clip_val, gradient_clip_algorithm):
            self.clip_gradients(optimizer, gradient_clip_val, gradient_clip_algorithm)

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        fast_dev_run=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.warns(UserWarning, match="handles gradient clipping internally"):
        trainer.fit(model)

@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
def test_deepspeed_gradient_clip_by_value(tmpdir):
    """Test to ensure that an exception is raised when using `gradient_clip_algorithm='value'`."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="gpu",
        devices=1,
        strategy="deepspeed",
        gradient_clip_algorithm="value",
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    with pytest.raises(MisconfigurationException, match="does not support clipping gradients by value"):
        trainer.fit(model)

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_multi_save_same_filepath(tmpdir):
    """Test that verifies that deepspeed saves only the latest checkpoint in the specified path and deletes the old
    sharded checkpoints."""

    class CustomModel(BoringModel):
        def training_step(self, *args, **kwargs):
            self.log("grank", self.global_rank)
            return super().training_step(*args, **kwargs)

    model = CustomModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy="deepspeed",
        accelerator="gpu",
        devices=2,
        callbacks=[ModelCheckpoint(filename="{epoch}_{step}_{grank}", save_top_k=1)],
        limit_train_batches=1,
        limit_val_batches=0,
        num_sanity_val_steps=0,
        max_epochs=2,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

    filepath = "epoch=1_step=2_grank=0.0.ckpt"
    expected = {filepath}
    assert expected == set(os.listdir(trainer.checkpoint_callback.dirpath))

    ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filepath)
    expected = {"latest", "zero_to_fp32.py", "checkpoint"}
    assert expected == set(os.listdir(ckpt_path))

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
def test_deepspeed_with_bfloat16_precision(tmpdir):
    """Test that deepspeed works with bfloat16 precision."""
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        strategy="deepspeed_stage_3",
        accelerator="gpu",
        devices=2,
        fast_dev_run=True,
        precision="bf16-mixed",
        num_sanity_val_steps=0,
        enable_progress_bar=False,
        enable_model_summary=False,
    )

    trainer.fit(model)
    assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecision)
    assert trainer.strategy.precision_plugin.precision == "bf16-mixed"
    assert trainer.strategy.config["zero_optimization"]["stage"] == 3
    assert trainer.strategy.config["bf16"]["enabled"]
    assert model.layer.weight.dtype == torch.bfloat16

@RunIf(deepspeed=True)
def test_error_with_invalid_accelerator(tmpdir):
    """Test DeepSpeedStrategy raises an exception if an invalid accelerator is used."""
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="cpu",
        strategy="deepspeed",
        fast_dev_run=True,
    )
    model = BoringModel()
    with pytest.raises(RuntimeError, match="DeepSpeed strategy is only supported on CUDA"):
        trainer.fit(model)

@RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True)
def test_deepspeed_configure_optimizer_device_set(tmpdir):
    """Test to ensure that the LightningModule has access to the device within the ``configure_optimizers`` function,
    and that ``estimated_stepping_batches`` works correctly as a result."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            assert self.trainer.estimated_stepping_batches == 1
            assert self.device.type == "cuda"
            raise SystemExit

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=True,
        accelerator="gpu",
        devices=2,
        strategy=DeepSpeedStrategy(),
    )
    with pytest.raises(SystemExit):
        trainer.fit(model)

@RunIf(deepspeed=True)
@pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]])
def test_validate_parallel_devices_indices(device_indices):
    """Test that the strategy validates that it doesn't support selecting specific devices by index.

    DeepSpeed doesn't support it and needs the index to match the local rank of the process.

    """
    strategy = DeepSpeedStrategy(
        accelerator=CUDAAccelerator(), parallel_devices=[torch.device("cuda", i) for i in device_indices]
    )
    with pytest.raises(
        RuntimeError, match=escape(f"device indices {device_indices!r} don't match the local rank values of processes")
    ):
        strategy.setup_environment()

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True)
def test_deepspeed_init_module_with_stage_3():
    """Tests how `.init_module()` behaves with ZeRO stage 3."""
    trainer = Trainer(
        accelerator="cuda", devices=2, strategy="deepspeed_stage_3", precision="bf16-mixed", fast_dev_run=1
    )
    model = ModelParallelBoringModel()
    with mock.patch("deepspeed.zero.Init") as zero_init_mock:
        trainer.fit(model)

    zero_init_mock.assert_called_once_with(enabled=True, remote_device=None, config_dict_or_path=ANY)

@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True)
@pytest.mark.parametrize("stage", [1, 2])
def test_deepspeed_init_module_with_stages_1_2(stage):
    """Tests how `.init_module()` behaves with ZeRO stages 1 and 2."""
    strategy = DeepSpeedStrategy(stage=stage)
    trainer = Trainer(accelerator="cuda", devices=2, strategy=strategy, precision="bf16-mixed", fast_dev_run=1)
    model = ModelParallelBoringModel()
    with mock.patch("deepspeed.zero.Init") as zero_init_mock:
        trainer.fit(model)

    zero_init_mock.assert_called_once_with(enabled=False, remote_device=None, config_dict_or_path=ANY)
    assert model.layer.weight.dtype == torch.bfloat16

@RunIf(deepspeed=True)
def test_deepspeed_load_checkpoint_validate_path(tmp_path):
    """Test that we validate the checkpoint path for a DeepSpeed checkpoint and give suggestions for user error."""
    strategy = DeepSpeedStrategy()
    with pytest.raises(FileNotFoundError, match="The provided path is not a valid DeepSpeed checkpoint"):
        strategy.load_checkpoint(checkpoint_path=tmp_path)

    # User tries to pass the subfolder as the path
    checkpoint_path = tmp_path / "checkpoint"
    checkpoint_path.mkdir()
    with pytest.raises(FileNotFoundError, match=f"Try to load using this parent directory instead: {tmp_path}"):
        strategy.load_checkpoint(checkpoint_path=checkpoint_path)

    # User tries to pass an individual file inside the checkpoint folder
    checkpoint_path = checkpoint_path / "zero_pp_rank_0_mp_rank_00_model_states.pt"
    checkpoint_path.touch()
    with pytest.raises(FileNotFoundError, match=f"Try to load using this parent directory instead: {tmp_path}"):
        strategy.load_checkpoint(checkpoint_path=checkpoint_path)