Set `find_unused_parameters=False` as the default (#16611)

Adrian Wälchli 2023-02-06 16:51:21 +01:00 committed by GitHub
parent 7bbbe22636
commit cd0eedb082
15 changed files with 74 additions and 82 deletions

View File

@@ -811,36 +811,6 @@ DDP Optimizations
*****************
When Using DDP Strategies, Set find_unused_parameters=False
===========================================================
By default, we set ``find_unused_parameters=True`` for compatibility with corner cases observed in the past (see the `discussion <https://github.com/Lightning-AI/lightning/discussions/6219>`_ for details).
When enabled, it incurs a performance hit and can be disabled in most cases. Read more about it `here <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`_.
.. tip::
This applies to all DDP strategies that accept ``find_unused_parameters`` as an argument.
.. code-block:: python
from pytorch_lightning.strategies import DDPStrategy
trainer = pl.Trainer(
accelerator="gpu",
devices=2,
strategy=DDPStrategy(find_unused_parameters=False),
)
.. code-block:: python
from pytorch_lightning.strategies import DDPSpawnStrategy
trainer = pl.Trainer(
accelerator="gpu",
devices=2,
strategy=DDPSpawnStrategy(find_unused_parameters=False),
)
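With this commit the default becomes ``False``. As a minimal sketch, the same setting can also be selected through the string aliases in the strategy registry (the ``_true`` variants are registered later in this commit):
.. code-block:: python
    # sketch using the registry alias; equivalent to passing the
    # keyword argument to DDPStrategy explicitly
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy="ddp_find_unused_parameters_true",
    )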
DDP Static Graph
================

View File

@@ -11,8 +11,8 @@ It also returns the optional description and parameters for initialising the Str
.. code-block:: python
# Training with the DDP Strategy, with `find_unused_parameters` set to False
trainer = Trainer(strategy="ddp_find_unused_parameters_false", accelerator="gpu", devices=4)
# Training with the DDP Strategy
trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
# Training with DeepSpeed ZeRO Stage 3 and CPU Offload
trainer = Trainer(strategy="deepspeed_stage_3_offload", accelerator="gpu", devices=3)

View File

@@ -43,7 +43,7 @@ Here are some examples:
trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
# Training with the DistributedDataParallel strategy on 4 GPUs, with options configured
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=4)
trainer = Trainer(strategy=DDPStrategy(static_graph=True), accelerator="gpu", devices=4)
# Training with the DDP Spawn strategy using auto accelerator selection
trainer = Trainer(strategy="ddp_spawn", accelerator="auto", devices=4)

View File

@@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added a `kill` method to launchers to kill all launched processes ([#16525](https://github.com/Lightning-AI/lightning/pull/16525))
- Added suffix option to DDP strategy names to enable `find_unused_parameters=True`, for example `strategy="ddp_find_unused_parameters_true"` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
### Changed
- "Native" suffix removal ([#16490](https://github.com/Lightning-AI/lightning/pull/16490))
@@ -48,6 +51,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Manual optimization is now required for working with multiple optimizers ([#16539](https://github.com/Lightning-AI/lightning/pull/16539))
- DDP's `find_unused_parameters` now defaults to `False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
- The strategy selected by `accelerator="hpu"` now defaults to `find_unused_parameters=False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
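A hedged example of opting back into the previous behavior after this change, using only the APIs shown elsewhere in this commit (the explicit `DDPStrategy` keyword or the suffixed registry alias):
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DDPStrategy
# pass the keyword explicitly ...
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=True), accelerator="gpu", devices=2)
# ... or use the alias registered in this PR
trainer = Trainer(strategy="ddp_find_unused_parameters_true", accelerator="gpu", devices=2)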
### Deprecated

View File

@@ -190,13 +190,6 @@ class DDPStrategy(ParallelStrategy):
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()
def pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
def _register_ddp_hooks(self) -> None:
log.detail(f"{self.__class__.__name__}: registering ddp hooks")
if self.root_device.type == "cuda" and self._is_single_process_single_device:
@@ -263,7 +256,6 @@ class DDPStrategy(ParallelStrategy):
def configure_ddp(self) -> None:
log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
self.pre_configure_ddp()
assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
self._register_ddp_hooks()
@@ -360,6 +352,12 @@ class DDPStrategy(ParallelStrategy):
description="DDP Strategy with `find_unused_parameters` as False",
find_unused_parameters=False,
)
strategy_registry.register(
"ddp_find_unused_parameters_true",
cls,
description="DDP Strategy with `find_unused_parameters` as True",
find_unused_parameters=True,
)
strategy_registry.register(
cls.strategy_name,
cls,

View File

@@ -50,8 +50,10 @@ log = logging.getLogger(__name__)
_DDP_FORK_ALIASES = (
"ddp_fork",
"ddp_fork_find_unused_parameters_false",
"ddp_fork_find_unused_parameters_true",
"ddp_notebook",
"ddp_notebook_find_unused_parameters_false",
"ddp_notebook_find_unused_parameters_true",
)
@@ -186,13 +188,6 @@ class DDPSpawnStrategy(ParallelStrategy):
def _get_process_group_backend(self) -> str:
return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
def pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
def _register_ddp_hooks(self) -> None:
# currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
@@ -206,7 +201,6 @@ class DDPSpawnStrategy(ParallelStrategy):
)
def configure_ddp(self) -> None:
self.pre_configure_ddp()
assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
self._register_ddp_hooks()
@@ -320,16 +314,19 @@ class DDPSpawnStrategy(ParallelStrategy):
)
entries = (
("ddp_spawn_find_unused_parameters_false", "spawn"),
("ddp_fork_find_unused_parameters_false", "fork"),
("ddp_notebook_find_unused_parameters_false", "fork"),
("ddp_spawn_find_unused_parameters_false", False, "spawn"),
("ddp_spawn_find_unused_parameters_true", True, "spawn"),
("ddp_fork_find_unused_parameters_false", False, "fork"),
("ddp_fork_find_unused_parameters_true", True, "fork"),
("ddp_notebook_find_unused_parameters_false", False, "fork"),
("ddp_notebook_find_unused_parameters_true", True, "fork"),
)
for name, start_method in entries:
for name, fup, start_method in entries:
strategy_registry.register(
name,
cls,
description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
find_unused_parameters=False,
description=f"DDP strategy with `find_unused_parameters` as {fup} and `start_method` '{start_method}'",
find_unused_parameters=fup,
start_method=start_method,
)
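As a small, self-contained sketch (a plain dict standing in for Lightning's `StrategyRegistry`), this is how the widened `entries` tuples map each alias to its init kwargs:
# illustration only: the real registry also stores the strategy class and a description
registry: dict = {}
entries = (
    ("ddp_spawn_find_unused_parameters_false", False, "spawn"),
    ("ddp_spawn_find_unused_parameters_true", True, "spawn"),
    ("ddp_notebook_find_unused_parameters_true", True, "fork"),
)
for name, fup, start_method in entries:
    registry[name] = {"find_unused_parameters": fup, "start_method": start_method}
assert registry["ddp_spawn_find_unused_parameters_true"] == {
    "find_unused_parameters": True,
    "start_method": "spawn",
}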

View File

@@ -99,24 +99,6 @@ class HPUParallelStrategy(DDPStrategy):
def determine_ddp_device_ids(self) -> None:
return None
def _pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
self._static_graph = False
static_graph = self._ddp_kwargs.get("static_graph")
if static_graph:
# when _set_static_graph() is called, find_unused_parameters has no effect.
# Reset find_unused_parameters to False, which is DDP's default value.
self._ddp_kwargs["find_unused_parameters"] = False
self._static_graph = True
if static_graph is not None:
# DDP does not accept static_graph as a parameter, so remove it from the kwargs
del self._ddp_kwargs["static_graph"]
def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore
obj = [obj]
if self.global_rank != src:
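For reference, a hedged standalone sketch (plain Python, hypothetical helper name) of the kwargs handling that the removed `_pre_configure_ddp` above performed:
def _legacy_pre_configure_ddp(ddp_kwargs: dict) -> tuple:
    # default find_unused_parameters to True, as the deleted code did
    ddp_kwargs.setdefault("find_unused_parameters", True)
    use_static_graph = False
    static_graph = ddp_kwargs.get("static_graph")
    if static_graph:
        # with a static graph, unused-parameter detection has no effect,
        # so fall back to DDP's default of False
        ddp_kwargs["find_unused_parameters"] = False
        use_static_graph = True
    if static_graph is not None:
        # keep `static_graph` out of the kwargs forwarded to DDP
        del ddp_kwargs["static_graph"]
    return ddp_kwargs, use_static_graph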

View File

@@ -495,7 +495,11 @@ class AcceleratorConnector:
# TODO this logic should apply to both str and object config
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
if strategy_flag in (
"ddp_spawn",
"ddp_spawn_find_unused_parameters_false",
"ddp_spawn_find_unused_parameters_true",
) and (
TorchElasticEnvironment.detect()
or KubeflowEnvironment.detect()
or SLURMEnvironment.detect()

View File

@@ -60,7 +60,7 @@ def test_sync_batchnorm_parity(tmpdir):
trainer = Trainer(
default_root_dir=tmpdir,
accelerator="gpu",
strategy="ddp",
strategy="ddp_find_unused_parameters_true",
devices=2,
max_steps=3,
sync_batchnorm=True,

View File

@@ -295,7 +295,7 @@ def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False)
"default_root_dir": tmpdir,
"max_epochs": 5,
"accelerator": "cpu",
"strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None,
"strategy": "ddp_spawn" if ddp else None,
"devices": 2 if ddp else 1,
"limit_train_batches": 5,
"limit_val_batches": 0,

View File

@@ -163,8 +163,9 @@ def test_ddp_process_group_backend(process_group_backend, device_str, expected_p
[
("ddp", {}),
("ddp_find_unused_parameters_false", {"find_unused_parameters": False}),
("ddp_find_unused_parameters_true", {"find_unused_parameters": True}),
],
)
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
trainer = Trainer(strategy=strategy_name)
assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

View File

@@ -155,16 +155,25 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
("ddp_spawn_find_unused_parameters_true", {"find_unused_parameters": True}),
pytest.param(
"ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
),
pytest.param(
"ddp_fork_find_unused_parameters_true", {"find_unused_parameters": True}, marks=RunIf(skip_windows=True)
),
pytest.param(
"ddp_notebook_find_unused_parameters_false",
{"find_unused_parameters": False},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_true",
{"find_unused_parameters": True},
marks=RunIf(skip_windows=True),
),
],
)
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
trainer = Trainer(strategy=strategy_name)
assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

View File

@@ -83,26 +83,50 @@ def test_fsdp_strategy_registry(cuda_count_1):
DDPStrategy,
{"find_unused_parameters": False},
),
(
"ddp_find_unused_parameters_true",
DDPStrategy,
{"find_unused_parameters": True},
),
(
"ddp_spawn_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "spawn"},
),
(
"ddp_spawn_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "spawn"},
),
pytest.param(
"ddp_fork_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_fork_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
],
)
def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params):
def test_ddp_find_unused_parameters_strategy_registry(
tmpdir, strategy_name, strategy, expected_init_params, mps_count_0
):
trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
assert isinstance(trainer.strategy, strategy)
assert strategy_name in StrategyRegistry

View File

@@ -363,8 +363,10 @@ def test_exception_invalid_strategy():
(
("ddp_spawn", DDPSpawnStrategy),
("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy),
("ddp_spawn_find_unused_parameters_true", DDPSpawnStrategy),
("ddp", DDPStrategy),
("ddp_find_unused_parameters_false", DDPStrategy),
("ddp_find_unused_parameters_true", DDPStrategy),
("dp", DataParallelStrategy),
pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
),

View File

@@ -821,9 +821,7 @@ class TestManualOptimizationDDPModelToggleModel(TesManualOptimizationDDPModel):
@RunIf(min_cuda_gpus=2, standalone=True)
def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir):
train_manual_optimization(
tmpdir, "ddp_find_unused_parameters_false", model_cls=TestManualOptimizationDDPModelToggleModel
)
train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel)
def test_lr_schedulers(tmpdir):