# lightning/tests/tests_fabric/strategies/test_ddp.py

# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from datetime import timedelta
from unittest import mock
from unittest.mock import MagicMock, Mock

import pytest
import torch
from torch.nn.parallel import DistributedDataParallel

from lightning.fabric.plugins import DoublePrecision, HalfPrecision, Precision
from lightning.fabric.plugins.environments import LightningEnvironment
from lightning.fabric.strategies import DDPStrategy
from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from tests_fabric.helpers.runif import RunIf
from tests_fabric.strategies.test_single_device import _MyFabricGradNorm, _MyFabricGradVal


@pytest.mark.parametrize(
    ["process_group_backend", "device_str", "expected_process_group_backend"],
    [
        pytest.param("foo", "cpu", "foo"),
        pytest.param("foo", "cuda:0", "foo"),
        pytest.param(None, "cuda:0", "nccl"),
        pytest.param(None, "cpu", "gloo"),
    ],
)
def test_ddp_process_group_backend(process_group_backend, device_str, expected_process_group_backend):
    """Test settings for process group backend."""

    class MockDDPStrategy(DDPStrategy):
        def __init__(self, root_device, process_group_backend):
            self._root_device = root_device
            super().__init__(process_group_backend=process_group_backend)

        @property
        def root_device(self):
            return self._root_device

    strategy = MockDDPStrategy(process_group_backend=process_group_backend, root_device=torch.device(device_str))
    assert strategy._get_process_group_backend() == expected_process_group_backend
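

# Illustrative note (not part of the upstream tests): instead of relying on the device-based
# default exercised above ("nccl" for CUDA, "gloo" for CPU), a user can pin the backend
# explicitly; the value below is just an example of a valid torch.distributed backend name.
def _example_explicit_backend():
    return DDPStrategy(process_group_backend="gloo")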


def test_ddp_no_backward_sync():
    """Test that the backward sync control calls `.no_sync()`, and only on a DDP-wrapped module."""
    strategy = DDPStrategy()
    assert isinstance(strategy._backward_sync_control, _DDPBackwardSyncControl)

    with pytest.raises(
        TypeError, match="is only possible if the module passed to .* is wrapped in `DistributedDataParallel`"
    ), strategy._backward_sync_control.no_backward_sync(Mock()):
        pass

    module = MagicMock(spec=DistributedDataParallel)
    with strategy._backward_sync_control.no_backward_sync(module):
        pass
    module.no_sync.assert_called_once()
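

# Illustrative sketch (not part of the upstream test suite): the backward-sync control tested
# above backs the public `Fabric.no_backward_sync` API used for gradient accumulation.
# `fabric`, `model`, `optimizer`, and `dataloader` are assumed to be set up elsewhere; the
# accumulation factor of 4 is an arbitrary example.
def _example_no_backward_sync(fabric, model, optimizer, dataloader):
    for i, batch in enumerate(dataloader):
        is_accumulating = (i + 1) % 4 != 0
        # On accumulation steps this enters `DistributedDataParallel.no_sync()` via
        # `_DDPBackwardSyncControl`, skipping the gradient all-reduce.
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            loss = model(batch).sum()
            fabric.backward(loss)
        if not is_accumulating:
            optimizer.step()
            optimizer.zero_grad()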


@mock.patch("lightning.fabric.strategies.ddp.DistributedDataParallel")
def test_ddp_extra_kwargs(ddp_mock):
    """Test that additional kwargs passed to the DDPStrategy get passed down to the DistributedDataParallel
    wrapper."""
    module = torch.nn.Linear(1, 1)
    strategy = DDPStrategy(parallel_devices=[torch.device("cpu"), torch.device("cpu")])
    strategy.setup_module(module)
    ddp_mock.assert_called_with(module=module, device_ids=None)

    ddp_mock.reset_mock()

    strategy = DDPStrategy(parallel_devices=[torch.device("cpu"), torch.device("cpu")], find_unused_parameters=True)
    strategy.setup_module(module)
    ddp_mock.assert_called_with(module=module, device_ids=None, find_unused_parameters=True)


def test_ddp_module_state_dict():
    """Test that the module state dict gets retrieved without the prefixed wrapper keys from DDP."""

    class DistributedDataParallelMock(MagicMock):
        def __instancecheck__(self, instance):
            # to make the strategy's `isinstance(model, DistributedDataParallel)` pass with a mock as class
            return True

    strategy = DDPStrategy(parallel_devices=[torch.device("cpu"), torch.device("cpu")])

    # Without DDP applied (no setup call)
    original_module = torch.nn.Linear(2, 3)
    assert strategy.get_module_state_dict(original_module).keys() == original_module.state_dict().keys()

    # With DDP applied (setup called)
    with mock.patch("lightning.fabric.strategies.ddp.DistributedDataParallel", DistributedDataParallelMock):
        wrapped_module = strategy.setup_module(original_module)
        assert strategy.get_module_state_dict(wrapped_module).keys() == original_module.state_dict().keys()


@pytest.mark.parametrize(
    "clip_type,accelerator,precision",
    [
        ("norm", "cpu", "32-true"),
        ("val", "cpu", "32-true"),
        ("norm", "cpu", "bf16-mixed"),
        ("val", "cpu", "bf16-mixed"),
        pytest.param("norm", "cuda", "32-true", marks=RunIf(min_cuda_gpus=2)),
        pytest.param("val", "cuda", "32-true", marks=RunIf(min_cuda_gpus=2)),
        pytest.param("norm", "cuda", "16-mixed", marks=RunIf(min_cuda_gpus=2)),
        pytest.param("val", "cuda", "16-mixed", marks=RunIf(min_cuda_gpus=2)),
        pytest.param("norm", "cuda", "bf16-mixed", marks=RunIf(min_cuda_gpus=2, bf16_cuda=True)),
        pytest.param("val", "cuda", "bf16-mixed", marks=RunIf(min_cuda_gpus=2, bf16_cuda=True)),
    ],
)
@RunIf(standalone=True)
def test_ddp_grad_clipping(clip_type, accelerator, precision):
    clipping_test_cls = _MyFabricGradNorm if clip_type == "norm" else _MyFabricGradVal
    fabric = clipping_test_cls(accelerator=accelerator, devices=2, precision=precision, strategy="ddp")
    fabric.run()


@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize(
    "precision,expected_dtype",
    [
        (Precision(), torch.float32),
        (HalfPrecision("16-true"), torch.float16),
        pytest.param(HalfPrecision("bf16-true"), torch.bfloat16, marks=RunIf(bf16_cuda=True)),
        (DoublePrecision(), torch.float64),
    ],
)
@mock.patch.dict(os.environ, {"LOCAL_RANK": "1"})
def test_module_init_context(precision, expected_dtype):
    """Test that the module under the init-context gets moved to the right device and dtype."""
    parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)]
    expected_device = parallel_devices[1] if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu")

    strategy = DDPStrategy(
        parallel_devices=parallel_devices, precision=precision, cluster_environment=LightningEnvironment()
    )
    assert strategy.local_rank == 1
    with strategy.module_init_context():
        module = torch.nn.Linear(2, 2)
    assert module.weight.device == module.bias.device == expected_device
    assert module.weight.dtype == module.bias.dtype == expected_dtype
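

# Illustrative sketch (not part of the upstream test suite): the user-facing counterpart of
# `module_init_context` is `Fabric.init_module`, which creates parameters directly on the
# strategy's root device and in the active precision's dtype, as asserted above. A connected
# `fabric` instance is assumed to exist.
def _example_init_module(fabric):
    with fabric.init_module():
        return torch.nn.Linear(2, 2)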


@mock.patch("torch.distributed.init_process_group")
def test_set_timeout(init_process_group_mock):
    """Test that the timeout gets passed to the ``torch.distributed.init_process_group`` function."""
    test_timedelta = timedelta(seconds=30)
    strategy = DDPStrategy(timeout=test_timedelta, parallel_devices=[torch.device("cpu")])
    strategy.cluster_environment = LightningEnvironment()
    strategy.accelerator = Mock()
    strategy.setup_environment()

    process_group_backend = strategy._get_process_group_backend()
    global_rank = strategy.cluster_environment.global_rank()
    world_size = strategy.cluster_environment.world_size()
    init_process_group_mock.assert_called_with(
        process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta
    )
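

# Illustrative sketch (not part of the upstream test suite): how the options covered above are
# combined in user code. `Fabric` is the public entry point; the extra kwarg and the timeout
# value are arbitrary examples.
def _example_fabric_with_ddp_options():
    from lightning.fabric import Fabric

    strategy = DDPStrategy(
        find_unused_parameters=True,  # forwarded verbatim to DistributedDataParallel
        timeout=timedelta(seconds=30),  # forwarded to torch.distributed.init_process_group
    )
    return Fabric(strategy=strategy, accelerator="cpu", devices=2)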