diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
index 3b1fe1bf9f..9b3030f02e 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -811,36 +811,6 @@ DDP Optimizations
 *****************
 
-When Using DDP Strategies, Set find_unused_parameters=False
-===========================================================
-
-By default, we have set ``find_unused_parameters=True`` for compatibility reasons that have been observed in the past (refer to the `discussion `_ for more details).
-When enabled, it can result in a performance hit and can be disabled in most cases. Read more about it `here `_.
-
-.. tip::
-    It applies to all DDP strategies that support ``find_unused_parameters`` as input.
-
-.. code-block:: python
-
-    from pytorch_lightning.strategies import DDPStrategy
-
-    trainer = pl.Trainer(
-        accelerator="gpu",
-        devices=2,
-        strategy=DDPStrategy(find_unused_parameters=False),
-    )
-
-.. code-block:: python
-
-    from pytorch_lightning.strategies import DDPSpawnStrategy
-
-    trainer = pl.Trainer(
-        accelerator="gpu",
-        devices=2,
-        strategy=DDPSpawnStrategy(find_unused_parameters=False),
-    )
-
-
 DDP Static Graph
 ================
diff --git a/docs/source-pytorch/advanced/strategy_registry.rst b/docs/source-pytorch/advanced/strategy_registry.rst
index d92069a9fe..27bab6ea49 100644
--- a/docs/source-pytorch/advanced/strategy_registry.rst
+++ b/docs/source-pytorch/advanced/strategy_registry.rst
@@ -11,8 +11,8 @@ It also returns the optional description and parameters for initialising the Str
 
 .. code-block:: python
 
-    # Training with the DDP Strategy with `find_unused_parameters` as False
-    trainer = Trainer(strategy="ddp_find_unused_parameters_false", accelerator="gpu", devices=4)
+    # Training with the DDP Strategy
+    trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
 
     # Training with DeepSpeed ZeRO Stage 3 and CPU Offload
     trainer = Trainer(strategy="deepspeed_stage_3_offload", accelerator="gpu", devices=3)
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index ea24961cf3..7ca360b02f 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -43,7 +43,7 @@ Here are some examples:
     trainer = Trainer(strategy="ddp", accelerator="gpu", devices=4)
 
     # Training with the DistributedDataParallel strategy on 4 GPUs, with options configured
-    trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=False), accelerator="gpu", devices=4)
+    trainer = Trainer(strategy=DDPStrategy(static_graph=True), accelerator="gpu", devices=4)
 
     # Training with the DDP Spawn strategy using auto accelerator selection
     trainer = Trainer(strategy="ddp_spawn", accelerator="auto", devices=4)
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 3013c35a79..bb5a942686 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -27,6 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added a `kill` method to launchers to kill all launched processes ([#16525](https://github.com/Lightning-AI/lightning/pull/16525))
 
+- Added suffix option to DDP strategy names to enable `find_unused_parameters=True`, for example `strategy="ddp_find_unused_parameters_true"` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
+
 ### Changed
 
 - "Native" suffix removal ([#16490](https://github.com/Lightning-AI/lightning/pull/16490))
@@ -48,6 +51,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Manual optimization is now required for working with multiple optimizers ([#16539](https://github.com/Lightning-AI/lightning/pull/16539))
 
+- DDP's `find_unused_parameters` now defaults to `False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
+- The strategy selected by `accelerator="hpu"` now defaults to `find_unused_parameters=False` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))
+
 
 ### Deprecated
 
diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py
index 1692f9960f..e3204ba6c8 100644
--- a/src/lightning/pytorch/strategies/ddp.py
+++ b/src/lightning/pytorch/strategies/ddp.py
@@ -190,13 +190,6 @@ class DDPStrategy(ParallelStrategy):
         self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
         rank_zero_only.rank = self.cluster_environment.global_rank()
 
-    def pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
     def _register_ddp_hooks(self) -> None:
         log.detail(f"{self.__class__.__name__}: registering ddp hooks")
         if self.root_device.type == "cuda" and self._is_single_process_single_device:
@@ -263,7 +256,6 @@ class DDPStrategy(ParallelStrategy):
 
     def configure_ddp(self) -> None:
         log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
-        self.pre_configure_ddp()
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
         self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
         self._register_ddp_hooks()
@@ -360,6 +352,12 @@ class DDPStrategy(ParallelStrategy):
             description="DDP Strategy with `find_unused_parameters` as False",
             find_unused_parameters=False,
         )
+        strategy_registry.register(
+            "ddp_find_unused_parameters_true",
+            cls,
+            description="DDP Strategy with `find_unused_parameters` as True",
+            find_unused_parameters=True,
+        )
         strategy_registry.register(
             cls.strategy_name,
             cls,
diff --git a/src/lightning/pytorch/strategies/ddp_spawn.py b/src/lightning/pytorch/strategies/ddp_spawn.py
index 9974df53b1..ba08d7ad1f 100644
--- a/src/lightning/pytorch/strategies/ddp_spawn.py
+++ b/src/lightning/pytorch/strategies/ddp_spawn.py
@@ -50,8 +50,10 @@ log = logging.getLogger(__name__)
 _DDP_FORK_ALIASES = (
     "ddp_fork",
     "ddp_fork_find_unused_parameters_false",
+    "ddp_fork_find_unused_parameters_true",
     "ddp_notebook",
     "ddp_notebook_find_unused_parameters_false",
+    "ddp_notebook_find_unused_parameters_true",
 )
 
 
@@ -186,13 +188,6 @@ class DDPSpawnStrategy(ParallelStrategy):
     def _get_process_group_backend(self) -> str:
         return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device)
 
-    def pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
     def _register_ddp_hooks(self) -> None:
         # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084
@@ -206,7 +201,6 @@ class DDPSpawnStrategy(ParallelStrategy):
         )
 
     def configure_ddp(self) -> None:
-        self.pre_configure_ddp()
         assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
         self.model = self._setup_model(_LightningModuleWrapperBase(self.model))
         self._register_ddp_hooks()
@@ -320,16 +314,19 @@ class DDPSpawnStrategy(ParallelStrategy):
         )
 
         entries = (
-            ("ddp_spawn_find_unused_parameters_false", "spawn"),
-            ("ddp_fork_find_unused_parameters_false", "fork"),
-            ("ddp_notebook_find_unused_parameters_false", "fork"),
+            ("ddp_spawn_find_unused_parameters_false", False, "spawn"),
+            ("ddp_spawn_find_unused_parameters_true", True, "spawn"),
+            ("ddp_fork_find_unused_parameters_false", False, "fork"),
+            ("ddp_fork_find_unused_parameters_true", True, "fork"),
+            ("ddp_notebook_find_unused_parameters_false", False, "fork"),
+            ("ddp_notebook_find_unused_parameters_true", True, "fork"),
         )
-        for name, start_method in entries:
+        for name, fup, start_method in entries:
             strategy_registry.register(
                 name,
                 cls,
-                description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
-                find_unused_parameters=False,
+                description=f"DDP strategy with `find_unused_parameters` as {fup} and `start_method` '{start_method}'",
+                find_unused_parameters=fup,
                 start_method=start_method,
             )
 
diff --git a/src/lightning/pytorch/strategies/hpu_parallel.py b/src/lightning/pytorch/strategies/hpu_parallel.py
index b823d2fab5..fb587801aa 100644
--- a/src/lightning/pytorch/strategies/hpu_parallel.py
+++ b/src/lightning/pytorch/strategies/hpu_parallel.py
@@ -99,24 +99,6 @@ class HPUParallelStrategy(DDPStrategy):
     def determine_ddp_device_ids(self) -> None:
         return None
 
-    def _pre_configure_ddp(self) -> None:
-        # if unset, default `find_unused_parameters` `True`
-        # Many models require setting this parameter to True, as there are corner cases
-        # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
-        # This flag does come with a performance hit, so it is suggested to disable in cases where it is possible.
-        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)
-
-        self._static_graph = False
-        static_graph = self._ddp_kwargs.get("static_graph")
-        if static_graph:
-            # when _set_static_graph() is called find_unused_parameters does not have any significance.
-            # Resetting the value of find_unused_parameters to False which is the default value to DDP
-            self._ddp_kwargs["find_unused_parameters"] = False
-            self._static_graph = True
-        if static_graph is not None:
-            # DDP does not accept static_graph as a parameter, hence removing it from the list
-            del self._ddp_kwargs["static_graph"]
-
     def broadcast(self, obj: object, src: int = 0) -> object:  # type: ignore
         obj = [obj]
         if self.global_rank != src:
diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
index b62510d843..8fb6cbdc4a 100644
--- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
+++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
@@ -495,7 +495,11 @@ class AcceleratorConnector:
         # TODO this logic should apply to both str and object config
         strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
 
-        if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
+        if strategy_flag in (
+            "ddp_spawn",
+            "ddp_spawn_find_unused_parameters_false",
+            "ddp_spawn_find_unused_parameters_true",
+        ) and (
             TorchElasticEnvironment.detect()
             or KubeflowEnvironment.detect()
             or SLURMEnvironment.detect()
diff --git a/tests/tests_pytorch/benchmarks/test_sync_batchnorm_parity.py b/tests/tests_pytorch/benchmarks/test_sync_batchnorm_parity.py
index 82aaee4e64..bcbaea6714 100644
--- a/tests/tests_pytorch/benchmarks/test_sync_batchnorm_parity.py
+++ b/tests/tests_pytorch/benchmarks/test_sync_batchnorm_parity.py
@@ -60,7 +60,7 @@ def test_sync_batchnorm_parity(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         accelerator="gpu",
-        strategy="ddp",
+        strategy="ddp_find_unused_parameters_true",
         devices=2,
         max_steps=3,
         sync_batchnorm=True,
diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
index 6cc31f2718..35d8858710 100644
--- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
+++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
@@ -295,7 +295,7 @@ def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False)
         "default_root_dir": tmpdir,
         "max_epochs": 5,
         "accelerator": "cpu",
-        "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None,
+        "strategy": "ddp_spawn" if ddp else None,
         "devices": 2 if ddp else 1,
         "limit_train_batches": 5,
         "limit_val_batches": 0,
diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py
index b96e26521c..248e42bd7e 100644
--- a/tests/tests_pytorch/strategies/test_ddp.py
+++ b/tests/tests_pytorch/strategies/test_ddp.py
@@ -163,8 +163,9 @@ def test_ddp_process_group_backend(process_group_backend, device_str, expected_p
     [
         ("ddp", {}),
         ("ddp_find_unused_parameters_false", {"find_unused_parameters": False}),
+        ("ddp_find_unused_parameters_true", {"find_unused_parameters": True}),
     ],
 )
-def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
     trainer = Trainer(strategy=strategy_name)
     assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
index 4f150e4302..7f0cb73891 100644
--- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
+++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
@@ -155,16 +155,25 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
         pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
         pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
         ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
+        ("ddp_spawn_find_unused_parameters_true", {"find_unused_parameters": True}),
         pytest.param(
             "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
         ),
+        pytest.param(
+            "ddp_fork_find_unused_parameters_true", {"find_unused_parameters": True}, marks=RunIf(skip_windows=True)
+        ),
         pytest.param(
             "ddp_notebook_find_unused_parameters_false",
             {"find_unused_parameters": False},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_true",
+            {"find_unused_parameters": True},
+            marks=RunIf(skip_windows=True),
+        ),
     ],
 )
-def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_0):
     trainer = Trainer(strategy=strategy_name)
     assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py
index c9a5e23d63..270fb028fa 100644
--- a/tests/tests_pytorch/strategies/test_registry.py
+++ b/tests/tests_pytorch/strategies/test_registry.py
@@ -83,26 +83,50 @@ def test_fsdp_strategy_registry(cuda_count_1):
            DDPStrategy,
            {"find_unused_parameters": False},
        ),
+        (
+            "ddp_find_unused_parameters_true",
+            DDPStrategy,
+            {"find_unused_parameters": True},
+        ),
         (
             "ddp_spawn_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "spawn"},
         ),
+        (
+            "ddp_spawn_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "spawn"},
+        ),
         pytest.param(
             "ddp_fork_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_fork_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
         pytest.param(
             "ddp_notebook_find_unused_parameters_false",
             DDPSpawnStrategy,
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_true",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": True, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
     ],
 )
-def test_ddp_find_unused_parameters_strategy_registry(tmpdir, strategy_name, strategy, expected_init_params):
+def test_ddp_find_unused_parameters_strategy_registry(
+    tmpdir, strategy_name, strategy, expected_init_params, mps_count_0
+):
     trainer = Trainer(default_root_dir=tmpdir, strategy=strategy_name)
     assert isinstance(trainer.strategy, strategy)
     assert strategy_name in StrategyRegistry
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
index 64238be9c9..05ecfe5825 100644
--- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -363,8 +363,10 @@ def test_exception_invalid_strategy():
     (
         ("ddp_spawn", DDPSpawnStrategy),
         ("ddp_spawn_find_unused_parameters_false", DDPSpawnStrategy),
+        ("ddp_spawn_find_unused_parameters_true", DDPSpawnStrategy),
         ("ddp", DDPStrategy),
("ddp_find_unused_parameters_false", DDPStrategy), + ("ddp_find_unused_parameters_true", DDPStrategy), ("dp", DataParallelStrategy), pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)), ), diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 0e98a962ed..040267c9ad 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -821,9 +821,7 @@ class TestManualOptimizationDDPModelToggleModel(TesManualOptimizationDDPModel): @RunIf(min_cuda_gpus=2, standalone=True) def test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model(tmpdir): - train_manual_optimization( - tmpdir, "ddp_find_unused_parameters_false", model_cls=TestManualOptimizationDDPModelToggleModel - ) + train_manual_optimization(tmpdir, "ddp", model_cls=TestManualOptimizationDDPModelToggleModel) def test_lr_schedulers(tmpdir):