Set `find_unused_parameters=False` as the default (#16611)
parent 7bbbe22636
commit cd0eedb082
@@ -811,36 +811,6 @@ DDP Optimizations
 *****************
 
-When Using DDP Strategies, Set find_unused_parameters=False
-===========================================================
-
-By default, we have set ``find_unused_parameters=True`` for compatibility reasons that have been observed in the past (refer to the `discussion <https://github.com/Lightning-AI/lightning/discussions/6219>`_ for more details).
-When enabled, it can result in a performance hit and can be disabled in most cases. Read more about it `here <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`_.
-
-.. tip::
-    It applies to all DDP strategies that support ``find_unused_parameters`` as input.
-
-.. code-block:: python
-
-    from pytorch_lightning.strategies import DDPStrategy
-
-    trainer = pl.Trainer(
-        accelerator="gpu",
-        devices=2,
-        strategy=DDPStrategy(find_unused_parameters=False),
-    )
-
-.. code-block:: python
-
-    from pytorch_lightning.strategies import DDPSpawnStrategy
-
-    trainer = pl.Trainer(
-        accelerator="gpu",
-        devices=2,
-        strategy=DDPSpawnStrategy(find_unused_parameters=False),
-    )
-
-
 DDP Static Graph
 ================

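The removed section is superseded by the new default: plain ``strategy="ddp"`` now runs without unused-parameter detection. A minimal sketch of how a model that genuinely needs the detection could opt back in after this change (the two-GPU setup and the ``pl`` alias are illustrative assumptions, not part of the diff):

.. code-block:: python

    import pytorch_lightning as pl
    from pytorch_lightning.strategies import DDPStrategy

    # The new default: no unused-parameter search, hence no extra graph traversal per backward pass.
    trainer = pl.Trainer(accelerator="gpu", devices=2, strategy="ddp")

    # Opting back in, either through the strategy object ...
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(find_unused_parameters=True),
    )

    # ... or through the registry alias introduced by this PR.
    trainer = pl.Trainer(accelerator="gpu", devices=2, strategy="ddp_find_unused_parameters_true")
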
@@ -11,8 +11,8 @@ It also returns the optional description and parameters for initialising the Str
 .. code-block:: python
 
-    # Training with the DDP Strategy with `find_unused_parameters` as False
-    trainer = Trainer(strategy="ddp_find_unused_parameters_false", accelerator="gpu", devices=4)
+    # Training with the DDP Strategy
+    trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
 
     # Training with DeepSpeed ZeRO Stage 3 and CPU Offload
     trainer = Trainer(strategy="deepspeed_stage_3_offload", accelerator="gpu", devices=3)

@@ -43,7 +43,7 @@ Here are some examples:
     trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
 
     # Training with the DistributedDataParallel strategy on 4 GPUs, with options configured
-    trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=4)
+    trainer = Trainer(strategy=DDPStrategy(static_graph=True), accelerator="gpu", devices=4)
 
     # Training with the DDP Spawn strategy using auto accelerator selection
     trainer = Trainer(strategy="ddp_spawn", accelerator="auto", devices=4)

@@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added a `kill` method to launchers to kill all launched processes ([#16525](https://github.com/Lightning-AI/lightning/pull/16525))
 
+- Added suffix option to DDP strategy names to enable `find_unused_parameters=True`, for example `strategy="ddp_find_unused_parameters_true"` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
+
 ### Changed
 
 - "Native" suffix removal ([#16490](https://github.com/Lightning-AI/lightning/pull/16490))

@@ -48,6 +51,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Manual optimization is now required for working with multiple optimizers ([#16539](https://github.com/Lightning-AI/lightning/pull/16539))
 
+- DDP's `find_unused_parameters` now defaults to `False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
+- The strategy selected by `accelerator="hpu"` now defaults to `find_unused_parameters=False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
 
 ### Deprecated
 

@@ -190,13 +190,6 @@ class DDPStrategy(ParallelStrategy):
         self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
         rank_zero_only.rank = self.cluster_environment.global_rank()
 
-    def pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
     def _register_ddp_hooks(self) -> None:
         log.detail(f"{self.__class__.__name__}: registering ddp hooks")
         if self.root_device.type == "cuda" and self._is_single_process_single_device:

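With ``pre_configure_ddp`` removed, Lightning no longer injects ``find_unused_parameters=True`` into ``self._ddp_kwargs``; when the user passes nothing, the kwarg falls through to ``torch.nn.parallel.DistributedDataParallel``, whose own default is ``False``. A standalone sketch (not part of the diff) that confirms the upstream default:

.. code-block:: python

    import inspect

    from torch.nn.parallel import DistributedDataParallel

    # torch's DDP wrapper defaults to find_unused_parameters=False, which is the
    # behavior Lightning now inherits when the user does not set the kwarg.
    default = inspect.signature(DistributedDataParallel.__init__).parameters["find_unused_parameters"].default
    print(default)  # False
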
@@ -263,7 +256,6 @@ class DDPStrategy(ParallelStrategy):
 
     def configure_ddp(self) -> None:
         log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
-        self.pre_configure_ddp()
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
         self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
         self._register_ddp_hooks()

@@ -360,6 +352,12 @@ class DDPStrategy(ParallelStrategy):
             description="DDP Strategy with `find_unused_parameters` as False",
             find_unused_parameters=False,
         )
+        strategy_registry.register(
+            "ddp_find_unused_parameters_true",
+            cls,
+            description="DDP Strategy with `find_unused_parameters` as True",
+            find_unused_parameters=True,
+        )
         strategy_registry.register(
             cls.strategy_name,
             cls,

@@ -50,8 +50,10 @@ log = logging.getLogger(__name__)
 _DDP_FORK_ALIASES = (
     "ddp_fork",
     "ddp_fork_find_unused_parameters_false",
+    "ddp_fork_find_unused_parameters_true",
     "ddp_notebook",
     "ddp_notebook_find_unused_parameters_false",
+    "ddp_notebook_find_unused_parameters_true",
 )
 
 
@@ -186,13 +188,6 @@ class DDPSpawnStrategy(ParallelStrategy):
     def _get_process_group_backend(self) -> str:
         return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
 
-    def pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
     def _register_ddp_hooks(self) -> None:
         # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084

@@ -206,7 +201,6 @@ class DDPSpawnStrategy(ParallelStrategy):
         )
 
     def configure_ddp(self) -> None:
-        self.pre_configure_ddp()
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
         self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
         self._register_ddp_hooks()

@@ -320,16 +314,19 @@ class DDPSpawnStrategy(ParallelStrategy):
         )
 
         entries = (
-            ("ddp_spawn_find_unused_parameters_false", "spawn"),
-            ("ddp_fork_find_unused_parameters_false", "fork"),
-            ("ddp_notebook_find_unused_parameters_false", "fork"),
+            ("ddp_spawn_find_unused_parameters_false", False, "spawn"),
+            ("ddp_spawn_find_unused_parameters_true", True, "spawn"),
+            ("ddp_fork_find_unused_parameters_false", False, "fork"),
+            ("ddp_fork_find_unused_parameters_true", True, "fork"),
+            ("ddp_notebook_find_unused_parameters_false", False, "fork"),
+            ("ddp_notebook_find_unused_parameters_true", True, "fork"),
         )
-        for name, start_method in entries:
+        for name, fup, start_method in entries:
             strategy_registry.register(
                 name,
                 cls,
-                description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
-                find_unused_parameters=False,
+                description=f"DDP strategy with `find_unused_parameters` as {fup} and `start_method` '{start_method}'",
+                find_unused_parameters=fup,
                 start_method=start_method,
             )

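Each ``(name, fup, start_method)`` entry above is registered as a selectable string alias. A minimal usage sketch mirroring what the updated registry tests further down assert (CPU devices are used only so the example can be instantiated anywhere; ``_ddp_kwargs`` is the private attribute those tests inspect):

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPSpawnStrategy, StrategyRegistry

    # The new "_true" aliases resolve to DDPSpawnStrategy carrying the registered kwargs.
    trainer = Trainer(accelerator="cpu", devices=2, strategy="ddp_spawn_find_unused_parameters_true")
    assert isinstance(trainer.strategy, DDPSpawnStrategy)
    assert trainer.strategy._ddp_kwargs == {"find_unused_parameters": True}
    assert "ddp_spawn_find_unused_parameters_true" in StrategyRegistry
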
@@ -99,24 +99,6 @@ class HPUParallelStrategy(DDPStrategy):
     def determine_ddp_device_ids(self) -> None:
         return None
 
-    def _pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
-        self._static_graph = False
-        static_graph = self._ddp_kwargs.get("static_graph")
-        if static_graph:
-            # when _set_static_graph() is called find_unused_parameters does not have any significance.
-            # Resetting the value of find_unused_parameters to False which is the default value to DDP
-            self._ddp_kwargs["find_unused_parameters"] = False
-            self._static_graph = True
-        if static_graph is not None:
-            # DDP does not accept static_graph as a parameter, hence removing it from the list
-            del self._ddp_kwargs["static_graph"]
-
     def broadcast(self, obj: object, src: int = 0) -> object:  # type: ignore
         obj = [obj]
         if self.global_rank != src:

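The deleted block also encoded the interaction between ``static_graph`` and ``find_unused_parameters``: once the graph is declared static, the unused-parameter search has no significance. A hedged sketch of the two configurations that remain meaningful after this change, using the plain ``DDPStrategy`` for illustration:

.. code-block:: python

    from pytorch_lightning.strategies import DDPStrategy

    # Graph topology never changes between iterations: declare it static and skip the search.
    static = DDPStrategy(static_graph=True)

    # Some parameters are skipped on some iterations: opt into the search explicitly.
    dynamic_unused = DDPStrategy(find_unused_parameters=True)
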
@@ -495,7 +495,11 @@ class AcceleratorConnector:
         # TODO this logic should apply to both str and object config
         strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
 
-        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
+        if strategy_flag in (
+            "ddp_spawn",
+            "ddp_spawn_find_unused_parameters_false",
+            "ddp_spawn_find_unused_parameters_true",
+        ) and (
             TorchElasticEnvironment.detect()
             or KubeflowEnvironment.detect()
             or SLURMEnvironment.detect()

@@ -60,7 +60,7 @@ def test_sync_batchnorm_parity(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         accelerator="gpu",
-        strategy="ddp",
+        strategy="ddp_find_unused_parameters_true",
         devices=2,
         max_steps=3,
         sync_batchnorm=True,

@@ -295,7 +295,7 @@ def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False)
         "default_root_dir": tmpdir,
         "max_epochs": 5,
         "accelerator": "cpu",
-        "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None,
+        "strategy": "ddp_spawn" if ddp else None,
         "devices": 2 if ddp else 1,
         "limit_train_batches": 5,
         "limit_val_batches": 0,

@@ -163,8 +163,9 @@ def test_ddp_process_group_backend(process_group_backend, device_str, expected_p
     [
         ("ddp", {}),
         ("ddp_find_unused_parameters_false", {"find_unused_parameters": False}),
+        ("ddp_find_unused_parameters_true", {"find_unused_parameters": True}),
     ],
 )
-def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
     trainer = Trainer(strategy=strategy_name)
     assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

@@ -155,16 +155,25 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
         pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
         pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
         ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
+        ("ddp_spawn_find_unused_parameters_true", {"find_unused_parameters": True}),
         pytest.param(
             "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
         ),
+        pytest.param(
+            "ddp_fork_find_unused_parameters_true", {"find_unused_parameters": True}, marks=RunIf(skip_windows=True)
+        ),
         pytest.param(
             "ddp_notebook_find_unused_parameters_false",
             {"find_unused_parameters": False},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_true",
+            {"find_unused_parameters": True},
+            marks=RunIf(skip_windows=True),
+        ),
     ],
 )
-def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
     trainer = Trainer(strategy=strategy_name)
     assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs

@@ -83,26 +83,50 @@ def test_fsdp_strategy_registry(cuda_count_1):
             DDPStrategy,
             {"find_unused_parameters": False},
         ),
+        (
+            "ddp_find_unused_parameters_true",
+            DDPStrategy,
+            {"find_unused_parameters": True},
+        ),
         (
             "ddp_spawn_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "spawn"},
         ),
+        (
+            "ddp_spawn_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "spawn"},
+        ),
         pytest.param(
             "ddp_fork_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_fork_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
         pytest.param(
             "ddp_notebook_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
     ],
 )
-def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params):
+def test_ddp_find_unused_parameters_strategy_registry(
+    tmpdir, strategy_name, strategy, expected_init_params, mps_count_0
+):
     trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
     assert isinstance(trainer.strategy, strategy)
     assert strategy_name in StrategyRegistry

@@ -363,8 +363,10 @@ def test_exception_invalid_strategy():
     (
         ("ddp_spawn", DDPSpawnStrategy),
         ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy),
+        ("ddp_spawn_find_unused_parameters_true", DDPSpawnStrategy),
         ("ddp", DDPStrategy),
         ("ddp_find_unused_parameters_false", DDPStrategy),
+        ("ddp_find_unused_parameters_true", DDPStrategy),
         ("dp", DataParallelStrategy),
         pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
     ),

@@ -821,9 +821,7 @@ class TestManualOptimizationDDPModelToggleModel(TesManualOptimizationDDPModel):
 
 @RunIf(min_cuda_gpus=2, standalone=True)
 def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir):
-    train_manual_optimization(
-        tmpdir, "ddp_find_unused_parameters_false", model_cls=TestManualOptimizationDDPModelToggleModel
-    )
+    train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel)
 
 
 def test_lr_schedulers(tmpdir):
