# lightning/tests/tests_fabric/strategies/test_dp.py

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock
from unittest.mock import MagicMock, Mock

import pytest
import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import DataParallelStrategy

from tests_fabric.helpers.runif import RunIf
from tests_fabric.strategies.test_single_device import _run_test_clip_gradients


def test_data_parallel_root_device():
    """Test that the root device is `cuda:2`, the first entry of the configured parallel devices."""
    devices = [torch.device("cuda", index) for index in (2, 0, 1)]
    dp_strategy = DataParallelStrategy()
    dp_strategy.parallel_devices = devices
    expected_root = torch.device("cuda", 2)
    assert dp_strategy.root_device == expected_root


def test_data_parallel_ranks():
    """Test the rank-related properties of a default-constructed strategy."""
    dp_strategy = DataParallelStrategy()
    # The checks are independent reads: world size of one and rank zero, locally and globally.
    assert dp_strategy.is_global_zero
    assert dp_strategy.global_rank == 0
    assert dp_strategy.local_rank == 0
    assert dp_strategy.world_size == 1


@mock.patch("lightning.fabric.strategies.dp.DataParallel")
def test_data_parallel_setup_module(data_parallel_mock):
    """Test that `setup_module` wraps the module in `DataParallel` using the strategy's parallel devices.

    `DataParallel` is replaced by a mock for the duration of the test, so no real devices are required.

    """
    strategy = DataParallelStrategy()
    strategy.parallel_devices = [0, 2, 1]
    module = torch.nn.Linear(2, 2)
    wrapped_module = strategy.setup_module(module)
    # Inspect the recorded call rather than calling the mock a second time: a mock hands back the same
    # `return_value` for any arguments, so comparing against a fresh call cannot detect wrong arguments.
    data_parallel_mock.assert_called_once_with(module=module, device_ids=[0, 2, 1])
    assert wrapped_module is data_parallel_mock.return_value


def test_data_parallel_module_to_device():
    """Test that `module_to_device` calls the module's `to` with the single configured device."""
    target_device = torch.device("cuda", 2)
    dp_strategy = DataParallelStrategy()
    dp_strategy.parallel_devices = [target_device]
    fake_module = Mock()
    dp_strategy.module_to_device(fake_module)
    fake_module.to.assert_called_with(target_device)


def test_dp_module_state_dict():
    """Test that the state dict keys returned by the strategy match the plain module's keys, with or without DP."""

    class DataParallelMock(MagicMock):
        def __instancecheck__(self, instance):
            # intended to let the strategy's `isinstance(model, DataParallel)` succeed when a mock stands in
            # for the class
            return True

    cpu_devices = [torch.device("cpu"), torch.device("cpu")]
    dp_strategy = DataParallelStrategy(parallel_devices=cpu_devices)
    linear = torch.nn.Linear(2, 3)
    expected_keys = linear.state_dict().keys()

    # Case 1: the module has not been passed through `setup_module`
    unwrapped_keys = dp_strategy.get_module_state_dict(linear).keys()
    assert unwrapped_keys == expected_keys

    # Case 2: the module went through `setup_module` while `DataParallel` is replaced by the mock class
    with mock.patch("lightning.fabric.strategies.dp.DataParallel", DataParallelMock):
        wrapped = dp_strategy.setup_module(linear)
        wrapped_keys = dp_strategy.get_module_state_dict(wrapped).keys()
        assert wrapped_keys == linear.state_dict().keys()


@pytest.mark.parametrize(
    "precision",
    [
        "32-true",
        "16-mixed",
        pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True)),
    ],
)
@pytest.mark.parametrize("clip_type", ["norm", "val"])
@RunIf(min_cuda_gpus=2)
def test_clip_gradients(clip_type, precision):
    """Run the shared gradient-clipping check on a two-GPU Fabric using the "dp" strategy.

    The combination of norm clipping with "16-mixed" precision is skipped.

    """
    norm_with_fp16 = (clip_type, precision) == ("norm", "16-mixed")
    if norm_with_fp16:
        pytest.skip(reason="Clipping by norm with 16-mixed is numerically unstable.")

    dp_fabric = Fabric(strategy="dp", accelerator="cuda", devices=2, precision=precision)
    dp_fabric.launch()
    _run_test_clip_gradients(fabric=dp_fabric, clip_type=clip_type)