Add `Strategy.on_exception` (#16646)
parent 1288e4ccc4
commit 74ee699dfd
@@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added suffix option to DDP strategy names to enable `find_unused_parameters=True`, for example `strategy="ddp_find_unused_parameters_true"` ([#16611](https://github.com/Lightning-AI/lightning/pull/16611))

- Added a new method `Strategy.on_exception` to the strategy base interface ([#16646](https://github.com/Lightning-AI/lightning/pull/16646))


### Changed

- "Native" suffix removal ([#16490](https://github.com/Lightning-AI/lightning/pull/16490))
@@ -45,6 +45,7 @@ from lightning.pytorch.strategies.parallel import ParallelStrategy
from lightning.pytorch.strategies.strategy import TBroadcast
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.distributed import register_ddp_comm_hook
from lightning.pytorch.utilities.exceptions import _augment_message
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_only
from lightning.pytorch.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep
@@ -364,6 +365,18 @@ class DDPStrategy(ParallelStrategy):
            description=f"{cls.__class__.__name__}",
        )

    def on_exception(self, exception: BaseException) -> None:
        _augment_message(
            exception,
            pattern=".*Expected to have finished reduction in the prior iteration.*",
            new_message=(
                "It looks like your LightningModule has parameters that were not used in producing the loss returned"
                " by training_step. If this is intentional, you must enable the detection of unused parameters in DDP,"
                " either by setting the string value `strategy='ddp_find_unused_parameters_true'`"
                " or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`."
            ),
        )

    def teardown(self) -> None:
        log.detail(f"{self.__class__.__name__}: tearing down strategy")
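Note: the augmented message offers two equivalent fixes. A minimal sketch of both, assuming a typical multi-GPU setup (the string alias comes from the suffix option added in #16611):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DDPStrategy

# Option 1: the registered string alias
trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp_find_unused_parameters_true")

# Option 2: configure the strategy object directly
trainer = Trainer(accelerator="gpu", devices=2, strategy=DDPStrategy(find_unused_parameters=True))
```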
@@ -42,6 +42,7 @@ from lightning.pytorch.strategies.parallel import ParallelStrategy
from lightning.pytorch.strategies.strategy import TBroadcast
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.distributed import register_ddp_comm_hook
from lightning.pytorch.utilities.exceptions import _augment_message
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_only
from lightning.pytorch.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep
@@ -330,6 +331,18 @@ class DDPSpawnStrategy(ParallelStrategy):
            start_method=start_method,
        )

    def on_exception(self, exception: BaseException) -> None:
        _augment_message(
            exception,
            pattern=".*Expected to have finished reduction in the prior iteration.*",
            new_message=(
                "It looks like your LightningModule has parameters that were not used in producing the loss returned"
                " by training_step. If this is intentional, you must enable the detection of unused parameters in DDP,"
                f" either by setting the string value `strategy='ddp_{self._start_method}_find_unused_parameters_true'`"
                " or by setting the flag in the strategy with `strategy=DDPSpawnStrategy(find_unused_parameters=True)`."
            ),
        )

    def teardown(self) -> None:
        log.detail(f"{self.__class__.__name__}: tearing down strategy")
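Note: unlike `DDPStrategy.on_exception`, the spawn variant interpolates `self._start_method` into the suggested alias, so a run launched with the `fork` start method is pointed at `strategy='ddp_fork_find_unused_parameters_true'` rather than the `spawn` spelling.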
@@ -528,6 +528,10 @@ class Strategy(ABC):
        """Called in the training loop before anything happens for that batch."""
        pass

    def on_exception(self, exception: BaseException) -> None:
        """Called when the trainer execution is interrupted by an exception."""
        pass

    def __getstate__(self) -> Dict:
        # `LightningOptimizer` overrides `self.__class__` so they cannot be pickled
        state = dict(vars(self))  # copy
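Note: the base implementation is a deliberate no-op, so strategies that don't care about exceptions need no changes. A custom strategy can opt in by overriding the hook; a hypothetical sketch (the class name and body are illustrative, not part of this commit):

```python
from lightning.pytorch.strategies import SingleDeviceStrategy


class CleanupStrategy(SingleDeviceStrategy):
    def on_exception(self, exception: BaseException) -> None:
        # Hypothetical cleanup: runs after the callback `on_exception` hooks,
        # before loggers are finalized and the Trainer tears down.
        print(f"{type(exception).__name__} reached the strategy: {exception}")
```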
@@ -48,11 +48,13 @@ def _call_and_handle_interrupt(trainer: "pl.Trainer", trainer_fn: Callable, *arg
        if not trainer.interrupted:
            trainer.state.status = TrainerStatus.INTERRUPTED
            trainer._call_callback_hooks("on_exception", exception)
            trainer.strategy.on_exception(exception)
            for logger in trainer.loggers:
                logger.finalize("failed")
    except BaseException as exception:
        trainer.state.status = TrainerStatus.INTERRUPTED
        trainer._call_callback_hooks("on_exception", exception)
        trainer.strategy.on_exception(exception)
        for logger in trainer.loggers:
            logger.finalize("failed")
        trainer._teardown()
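Note the ordering in both branches: callback `on_exception` hooks fire first, then the new strategy hook, then every logger is finalized with status "failed". In the generic `BaseException` branch, `trainer._teardown()` additionally runs before the exception propagates out of the function (the re-raise sits outside this hunk).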
@@ -11,7 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from lightning.fabric.utilities.exceptions import MisconfigurationException  # noqa: F401
from lightning.pytorch.utilities.imports import _PYTHON_GREATER_EQUAL_3_11_0


class SIGTERMException(SystemExit):
@@ -27,3 +30,13 @@ class SIGTERMException(SystemExit):

class _TunerExitException(Exception):
    """Exception used to exit early while tuning."""


def _augment_message(exception: BaseException, pattern: str, new_message: str) -> None:
    if _PYTHON_GREATER_EQUAL_3_11_0 and any(re.match(pattern, message, re.DOTALL) for message in exception.args):
        exception.add_note(new_message)
    else:
        # Remove this when Python 3.11 becomes the minimum supported version
        exception.args = tuple(
            new_message if re.match(pattern, message, re.DOTALL) else message for message in exception.args
        )
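Note: `_augment_message` is a private helper with two code paths. On Python 3.11+ it leaves `exception.args` untouched and attaches the hint via `BaseException.add_note`, which the interpreter prints beneath the traceback; on older versions it rewrites the matching arg in place. A small usage sketch (the hint text here is illustrative):

```python
from lightning.pytorch.utilities.exceptions import _augment_message

exc = RuntimeError("Expected to have finished reduction in the prior iteration before starting a new one.")
_augment_message(
    exc,
    pattern=".*Expected to have finished reduction in the prior iteration.*",
    new_message="Hint: enable find_unused_parameters in the DDP strategy.",
)
# Python 3.11+: exc.args unchanged, exc.__notes__ == ["Hint: ..."]
# Older:        exc.args == ("Hint: enable find_unused_parameters in the DDP strategy.",)
```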
@@ -20,6 +20,7 @@ from lightning_utilities.core.imports import compare_version, package_available,

_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8)
_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
_PYTHON_GREATER_EQUAL_3_11_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 11)
# duplicated from fabric because HPU is patching it below
_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0")
_TORCHMETRICS_GREATER_EQUAL_0_9_1 = RequirementCache("torchmetrics>=0.9.1")
@@ -11,6 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
from torch.multiprocessing import ProcessRaisedException

import tests_pytorch.helpers.pipelines as tpipes
from lightning.pytorch.callbacks import EarlyStopping
from lightning.pytorch.demos.boring_classes import BoringModel
@@ -18,6 +21,7 @@ from lightning.pytorch.trainer import Trainer
from tests_pytorch.helpers.datamodules import ClassifDataModule
from tests_pytorch.helpers.runif import RunIf
from tests_pytorch.helpers.simple_models import ClassificationModel
from tests_pytorch.strategies.test_ddp_strategy import UnusedParametersModel


@RunIf(min_cuda_gpus=2, sklearn=True)
@@ -73,3 +77,13 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
    )
    trainer.fit(model, train_dataloaders=model.train_dataloader(), val_dataloaders=model.val_dataloader())
    assert trainer.state.finished, "DDP doesn't work with dataloaders passed to fit()."


def test_ddp_spawn_find_unused_parameters_exception():
    """Test that the DDP strategy can change PyTorch's error message so that it's more useful for Lightning
    users."""
    trainer = Trainer(accelerator="cpu", devices=1, strategy="ddp_spawn", max_steps=2)
    with pytest.raises(
        ProcessRaisedException, match="It looks like your LightningModule has parameters that were not used in"
    ):
        trainer.fit(UnusedParametersModel())
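Note the expected exception type: under `ddp_spawn` the `RuntimeError` is raised in a worker process, and `torch.multiprocessing` re-raises it in the parent as a `ProcessRaisedException` whose message embeds the worker traceback, so the test matches the augmented text on that wrapper instead of on `RuntimeError` directly.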
@@ -276,3 +276,22 @@ def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(tmpdir, strategy):
    # Assert model parameters are identical after loading
    for trained_param, loaded_param in zip(model.parameters(), saved_model.parameters()):
        assert torch.equal(trained_param.to("cpu"), loaded_param)


class UnusedParametersModel(BoringModel):
    def __init__(self):
        super().__init__()
        self.intermediate_layer = torch.nn.Linear(32, 32)

    def training_step(self, batch, batch_idx):
        with torch.no_grad():
            batch = self.intermediate_layer(batch)
        return super().training_step(batch, batch_idx)


def test_ddp_strategy_find_unused_parameters_exception():
    """Test that the DDP strategy can change PyTorch's error message so that it's more useful for Lightning
    users."""
    trainer = Trainer(accelerator="cpu", devices=1, strategy="ddp", max_steps=2)
    with pytest.raises(RuntimeError, match="It looks like your LightningModule has parameters that were not used in"):
        trainer.fit(UnusedParametersModel())
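Note: the `torch.no_grad()` block is what makes the parameters "unused": it detaches `intermediate_layer` from the autograd graph, so its weight and bias never receive gradients and DDP's reducer raises the "Expected to have finished reduction" error that `on_exception` rewrites. An illustrative counterpart without the guard (not part of this commit) would train cleanly:

```python
import torch
from lightning.pytorch.demos.boring_classes import BoringModel


class AllParametersUsedModel(BoringModel):
    def __init__(self):
        super().__init__()
        self.intermediate_layer = torch.nn.Linear(32, 32)

    def training_step(self, batch, batch_idx):
        batch = self.intermediate_layer(batch)  # differentiable: gradients reach every parameter
        return super().training_step(batch, batch_idx)
```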
@@ -17,9 +17,10 @@ import math
import os
import pickle
from argparse import Namespace
from contextlib import nullcontext
from contextlib import nullcontext, suppress
from copy import deepcopy
from pathlib import Path
from unittest import mock
from unittest.mock import ANY, call, Mock, patch

import cloudpickle
@@ -2123,3 +2124,19 @@ def test_trainer_compiled_model(tmp_path, monkeypatch):
    trainer = Trainer(**trainer_kwargs)
    with pytest.raises(TypeError, match="must be a `Light"):
        trainer.fit(object())


@pytest.mark.parametrize("exception_type", [KeyboardInterrupt, RuntimeError])
def test_trainer_calls_strategy_on_exception(exception_type):
    """Test that when an exception occurs, the Trainer lets the strategy process it."""
    exception = exception_type("Test exception")

    class ExceptionModel(BoringModel):
        def on_fit_start(self):
            raise exception

    trainer = Trainer()
    with mock.patch("lightning.pytorch.strategies.strategy.Strategy.on_exception") as on_exception_mock:
        with suppress(Exception):
            trainer.fit(ExceptionModel())
    on_exception_mock.assert_called_once_with(exception)
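A subtlety in this test: `suppress(Exception)` does not catch `KeyboardInterrupt`, which subclasses `BaseException` directly. The `KeyboardInterrupt` parametrization still passes because `_call_and_handle_interrupt` absorbs the interrupt instead of re-raising it; only the `RuntimeError` case relies on the suppression.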
@@ -0,0 +1,55 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from lightning.pytorch.utilities.exceptions import _augment_message
from lightning.pytorch.utilities.imports import _PYTHON_GREATER_EQUAL_3_11_0


def test_augment_message():
    # exception without args
    exception = Exception()
    _augment_message(exception, "", "new message")
    assert not exception.args
    if _PYTHON_GREATER_EQUAL_3_11_0:
        assert not exception.__notes__

    # exception with one arg
    exception = Exception("Test message.")
    _augment_message(exception, "Test", "New Test message")
    if _PYTHON_GREATER_EQUAL_3_11_0:
        assert exception.__notes__ == ["New Test message"]
        assert exception.args == ("Test message.",)
    else:
        assert exception.args == ("New Test message",)

    # pattern matching
    exception = Exception("Hello. Test message. Over!")
    _augment_message(exception, ".*Test.*Over.*", "New Test message")
    if _PYTHON_GREATER_EQUAL_3_11_0:
        assert exception.__notes__ == ["New Test message"]
        assert exception.args == ("Hello. Test message. Over!",)
    else:
        assert exception.args == ("New Test message",)

    # exception with multiple args
    exception = Exception("Message 1", "Message 2", "Message 3")
    _augment_message(exception, "Message 2", "New message 2")
    if _PYTHON_GREATER_EQUAL_3_11_0:
        assert exception.__notes__ == ["New message 2"]
        assert exception.args == (
            "Message 1",
            "Message 2",
            "Message 3",
        )
    else:
        assert exception.args == ("Message 1", "New message 2", "Message 3")
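Taken together, these assertions pin down the contract of `_augment_message`: on Python 3.11+ the original `args` are always preserved and the replacement lands in `__notes__`, while on older interpreters only the arg matching the pattern is swapped out and the rest are left untouched.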