Set `find_unused_parameters=False` as the default (#16611)

Adrian Wälchli 2023-02-06 16:51:21 +01:00 committed by GitHub
parent 7bbbe22636
commit cd0eedb082
15 changed files with 74 additions and 82 deletions

View File

@@ -811,36 +811,6 @@ DDP Optimizations
*****************
When Using DDP Strategies, Set find_unused_parameters=False
===========================================================
By default, we set ``find_unused_parameters=True`` for compatibility with corner cases observed in the past (see the `discussion <https://github.com/Lightning-AI/lightning/discussions/6219>`_ for details).
When enabled, it incurs a performance hit and can be disabled in most cases. Read more about it `here <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`_.
.. tip::
This applies to all DDP strategies that accept ``find_unused_parameters`` as an argument.
.. code-block:: python
from pytorch_lightning.strategies import DDPStrategy
trainer = pl.Trainer(
accelerator="gpu",
devices=2,
strategy=DDPStrategy(find_unused_parameters=False),
)
.. code-block:: python
from pytorch_lightning.strategies import DDPSpawnStrategy
trainer = pl.Trainer(
accelerator="gpu",
devices=2,
strategy=DDPSpawnStrategy(find_unused_parameters=False),
)
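With this commit the default becomes ``False``. As a minimal sketch, the same setting can also be selected through the string aliases in the strategy registry (the ``_true`` variants are registered later in this commit):
.. code-block:: python
    # sketch using the registry alias; equivalent to passing the
    # keyword argument to DDPStrategy explicitly
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy="ddp_find_unused_parameters_true",
    )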
DDP Static Graph
================

View File

@@ -11,8 +11,8 @@ It also returns the optional description and parameters for initialising the Str
.. code-block:: python
# Training with the DDP Strategy, with `find_unused_parameters` set to False
trainer = Trainer(strategy="ddp_find_unused_parameters_false", accelerator="gpu", devices=4)
# Training with the DDP Strategy
trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
# Training with DeepSpeed ZeRO Stage 3 and CPU Offload
trainer = Trainer(strategy="deepspeed_stage_3_offload", accelerator="gpu", devices=3)

View File

@@ -43,7 +43,7 @@ Here are some examples:
trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
# Training with the DistributedDataParallel strategy on 4 GPUs, with options configured
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=4)
trainer = Trainer(strategy=DDPStrategy(static_graph=True), accelerator="gpu", devices=4)
# Training with the DDP Spawn strategy using auto accelerator selection
trainer = Trainer(strategy="ddp_spawn", accelerator="auto", devices=4)

View File

@@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added a `kill` method to launchers to kill all launched processes ([#16525](https://github.com/Lightning-AI/lightning/pull/16525))
- Added suffix option to DDP strategy names to enable `find_unused_parameters=True`, for example `strategy="ddp_find_unused_parameters_true"` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
### Changed
- "Native" suffix removal ([#16490](https://github.com/Lightning-AI/lightning/pull/16490))
@@ -48,6 +51,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Manual optimization is now required for working with multiple optimizers ([#16539](https://github.com/Lightning-AI/lightning/pull/16539))
- DDP's `find_unused_parameters` now defaults to `False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
- The strategy selected by `accelerator="hpu"` now defaults to `find_unused_parameters=False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
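A hedged example of opting back into the previous behavior after this change, using only the APIs shown elsewhere in this commit (the explicit `DDPStrategy` keyword or the suffixed registry alias):
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import DDPStrategy
# pass the keyword explicitly ...
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=True), accelerator="gpu", devices=2)
# ... or use the alias registered in this PR
trainer = Trainer(strategy="ddp_find_unused_parameters_true", accelerator="gpu", devices=2)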
### Deprecated

View File

@@ -190,13 +190,6 @@ class DDPStrategy(ParallelStrategy):
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()
def pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
def _register_ddp_hooks(self) -> None:
log.detail(f"{self.__class__.__name__}: registering ddp hooks")
if self.root_device.type == "cuda" and self._is_single_process_single_device:
@@ -263,7 +256,6 @@ class DDPStrategy(ParallelStrategy):
def configure_ddp(self) -> None:
log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
self.pre_configure_ddp()
assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
self._register_ddp_hooks()
@@ -360,6 +352,12 @@ class DDPStrategy(ParallelStrategy):
description="DDP Strategy with `find_unused_parameters` as False",
find_unused_parameters=False,
)
strategy_registry.register(
"ddp_find_unused_parameters_true",
cls,
description="DDP Strategy with `find_unused_parameters` as True",
find_unused_parameters=True,
)
strategy_registry.register(
cls.strategy_name,
cls,

View File

@@ -50,8 +50,10 @@ log = logging.getLogger(__name__)
_DDP_FORK_ALIASES = (
"ddp_fork",
"ddp_fork_find_unused_parameters_false",
"ddp_fork_find_unused_parameters_true",
"ddp_notebook",
"ddp_notebook_find_unused_parameters_false",
"ddp_notebook_find_unused_parameters_true",
)
@@ -186,13 +188,6 @@ class DDPSpawnStrategy(ParallelStrategy):
def _get_process_group_backend(self) -> str:
return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
def pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
def _register_ddp_hooks(self) -> None:
# currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
# https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
@@ -206,7 +201,6 @@ class DDPSpawnStrategy(ParallelStrategy):
)
def configure_ddp(self) -> None:
self.pre_configure_ddp()
assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
self._register_ddp_hooks()
@@ -320,16 +314,19 @@ class DDPSpawnStrategy(ParallelStrategy):
)
entries = (
("ddp_spawn_find_unused_parameters_false", "spawn"),
("ddp_fork_find_unused_parameters_false", "fork"),
("ddp_notebook_find_unused_parameters_false", "fork"),
("ddp_spawn_find_unused_parameters_false", False, "spawn"),
("ddp_spawn_find_unused_parameters_true", True, "spawn"),
("ddp_fork_find_unused_parameters_false", False, "fork"),
("ddp_fork_find_unused_parameters_true", True, "fork"),
("ddp_notebook_find_unused_parameters_false", False, "fork"),
("ddp_notebook_find_unused_parameters_true", True, "fork"),
)
for name, start_method in entries:
for name, fup, start_method in entries:
strategy_registry.register(
name,
cls,
description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
find_unused_parameters=False,
description=f"DDP strategy with `find_unused_parameters` as {fup} and `start_method` '{start_method}'",
find_unused_parameters=fup,
start_method=start_method,
)
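As a small, self-contained sketch (a plain dict standing in for Lightning's `StrategyRegistry`), this is how the widened `entries` tuples map each alias to its init kwargs:
# illustration only: the real registry also stores the strategy class and a description
registry: dict = {}
entries = (
    ("ddp_spawn_find_unused_parameters_false", False, "spawn"),
    ("ddp_spawn_find_unused_parameters_true", True, "spawn"),
    ("ddp_notebook_find_unused_parameters_true", True, "fork"),
)
for name, fup, start_method in entries:
    registry[name] = {"find_unused_parameters": fup, "start_method": start_method}
assert registry["ddp_spawn_find_unused_parameters_true"] == {
    "find_unused_parameters": True,
    "start_method": "spawn",
}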

View File

@@ -99,24 +99,6 @@ class HPUParallelStrategy(DDPStrategy):
def determine_ddp_device_ids(self) -> None:
return None
def _pre_configure_ddp(self) -> None:
# if unset, default `find_unused_parameters` to `True`
# Many models require setting this parameter to True, as there are corner cases
# in which not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
# This flag does come with a performance hit, so it is suggested to disable it wherever possible.
self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
self._static_graph = False
static_graph = self._ddp_kwargs.get("static_graph")
if static_graph:
# when _set_static_graph() is called, find_unused_parameters has no effect.
# Reset find_unused_parameters to False, which is DDP's default value.
self._ddp_kwargs["find_unused_parameters"] = False
self._static_graph = True
if static_graph is not None:
# DDP does not accept static_graph as a parameter, so remove it from the kwargs
del self._ddp_kwargs["static_graph"]
def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore
obj = [obj]
if self.global_rank != src:
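For reference, a hedged standalone sketch (plain Python, hypothetical helper name) of the kwargs handling that the removed `_pre_configure_ddp` above performed:
def _legacy_pre_configure_ddp(ddp_kwargs: dict) -> tuple:
    # default find_unused_parameters to True, as the deleted code did
    ddp_kwargs.setdefault("find_unused_parameters", True)
    use_static_graph = False
    static_graph = ddp_kwargs.get("static_graph")
    if static_graph:
        # with a static graph, unused-parameter detection has no effect,
        # so fall back to DDP's default of False
        ddp_kwargs["find_unused_parameters"] = False
        use_static_graph = True
    if static_graph is not None:
        # keep `static_graph` out of the kwargs forwarded to DDP
        del ddp_kwargs["static_graph"]
    return ddp_kwargs, use_static_graph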

View File

@@ -495,7 +495,11 @@ class AcceleratorConnector:
# TODO this logic should apply to both str and object config
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
if strategy_flag in (
"ddp_spawn",
"ddp_spawn_find_unused_parameters_false",
"ddp_spawn_find_unused_parameters_true",
) and (
TorchElasticEnvironment.detect()
or KubeflowEnvironment.detect()
or SLURMEnvironment.detect()

View File

@@ -60,7 +60,7 @@ def test_sync_batchnorm_parity(tmpdir):
trainer = Trainer(
default_root_dir=tmpdir,
accelerator="gpu",
strategy="ddp",
strategy="ddp_find_unused_parameters_true",
devices=2,
max_steps=3,
sync_batchnorm=True,

View File

@@ -295,7 +295,7 @@ def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False)
"default_root_dir": tmpdir,
"max_epochs": 5,
"accelerator": "cpu",
"strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None,
"strategy": "ddp_spawn" if ddp else None,
"devices": 2 if ddp else 1,
"limit_train_batches": 5,
"limit_val_batches": 0,

View File

@@ -163,8 +163,9 @@ def test_ddp_process_group_backend(process_group_backend, device_str, expected_p
[
("ddp", {}),
("ddp_find_unused_parameters_false", {"find_unused_parameters": False}),
("ddp_find_unused_parameters_true", {"find_unused_parameters": True}),
],
)
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
trainer = Trainer(strategy=strategy_name)
assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

View File

@@ -155,16 +155,25 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
("ddp_spawn_find_unused_parameters_true", {"find_unused_parameters": True}),
pytest.param(
"ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
),
pytest.param(
"ddp_fork_find_unused_parameters_true", {"find_unused_parameters": True}, marks=RunIf(skip_windows=True)
),
pytest.param(
"ddp_notebook_find_unused_parameters_false",
{"find_unused_parameters": False},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_true",
{"find_unused_parameters": True},
marks=RunIf(skip_windows=True),
),
],
)
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
trainer = Trainer(strategy=strategy_name)
assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

View File

@@ -83,26 +83,50 @@ def test_fsdp_strategy_registry(cuda_count_1):
DDPStrategy,
{"find_unused_parameters": False},
),
(
"ddp_find_unused_parameters_true",
DDPStrategy,
{"find_unused_parameters": True},
),
(
"ddp_spawn_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "spawn"},
),
(
"ddp_spawn_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "spawn"},
),
pytest.param(
"ddp_fork_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_fork_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_false",
DDPSpawnStrategy,
{"find_unused_parameters": False, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
pytest.param(
"ddp_notebook_find_unused_parameters_true",
DDPSpawnStrategy,
{"find_unused_parameters": True, "start_method": "fork"},
marks=RunIf(skip_windows=True),
),
],
)
def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params):
def test_ddp_find_unused_parameters_strategy_registry(
tmpdir, strategy_name, strategy, expected_init_params, mps_count_0
):
trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
assert isinstance(trainer.strategy, strategy)
assert strategy_name in StrategyRegistry

View File

@@ -363,8 +363,10 @@ def test_exception_invalid_strategy():
(
("ddp_spawn", DDPSpawnStrategy),
("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy),
("ddp_spawn_find_unused_parameters_true", DDPSpawnStrategy),
("ddp", DDPStrategy),
("ddp_find_unused_parameters_false", DDPStrategy),
("ddp_find_unused_parameters_true", DDPStrategy),
("dp", DataParallelStrategy),
pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
),

View File

@@ -821,9 +821,7 @@ class TestManualOptimizationDDPModelToggleModel(TesManualOptimizationDDPModel):
@RunIf(min_cuda_gpus=2, standalone=True)
def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir):
train_manual_optimization(
tmpdir, "ddp_find_unused_parameters_false", model_cls=TestManualOptimizationDDPModelToggleModel
)
train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel)
def test_lr_schedulers(tmpdir):