Make Trainer readable and debuggable (2/n) (#14862)
* clean trainer 2/n

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent c77d4a8394
commit fbfcc3d871
pytorch_lightning/trainer/teardown.py (new file)
@@ -0,0 +1,57 @@
+# Copyright Lightning AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import traceback
+from typing import Any, Callable
+
+from lightning_lite.utilities.distributed import distributed_available
+from pytorch_lightning.trainer.states import TrainerStatus
+from pytorch_lightning.utilities.rank_zero import rank_zero_warn
+
+
+def call_and_handle_interrupt(trainer: Any, trainer_fn: Callable, *args: Any, **kwargs: Any) -> Any:
+    r"""
+    Error handling, intended to be used only for main trainer function entry points (fit, validate, test, predict)
+    as all errors should funnel through them
+
+    Args:
+        trainer_fn: one of (fit, validate, test, predict)
+        *args: positional arguments to be passed to the `trainer_fn`
+        **kwargs: keyword arguments to be passed to `trainer_fn`
+    """
+    try:
+        if trainer.strategy.launcher is not None:
+            return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
+        else:
+            return trainer_fn(*args, **kwargs)
+    # TODO: Unify both exceptions below, where `KeyboardError` doesn't re-raise
+    except KeyboardInterrupt as exception:
+        rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
+        # user could press Ctrl+c many times... only shutdown once
+        if not trainer.interrupted:
+            trainer.state.status = TrainerStatus.INTERRUPTED
+            trainer._call_callback_hooks("on_exception", exception)
+            for logger in trainer.loggers:
+                logger.finalize("failed")
+    except BaseException as exception:
+        trainer.state.status = TrainerStatus.INTERRUPTED
+        if distributed_available() and trainer.world_size > 1:
+            # try syncing remaining processes, kill otherwise
+            trainer.strategy.reconciliate_processes(traceback.format_exc())
+        trainer._call_callback_hooks("on_exception", exception)
+        for logger in trainer.loggers:
+            logger.finalize("failed")
+        trainer._teardown()
+        # teardown might access the stage so we reset it after
+        trainer.state.stage = None
+        raise
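For orientation, here is a minimal, self-contained sketch (not part of the commit) that exercises the failure path of the new helper. It assumes a Lightning install that already contains this commit, so `pytorch_lightning.trainer.teardown` is importable; `_StubLogger`, `_FakeTrainer`, and `failing_entry_point` are invented stand-ins that model only the trainer attributes the helper touches.

from types import SimpleNamespace

from pytorch_lightning.trainer.teardown import call_and_handle_interrupt


class _StubLogger:
    # stands in for a Lightning logger; the helper only calls `finalize`
    def finalize(self, status: str) -> None:
        print(f"logger finalized with status={status!r}")


class _FakeTrainer:
    # models only the attributes `call_and_handle_interrupt` reads or writes
    def __init__(self) -> None:
        self.strategy = SimpleNamespace(launcher=None)  # no launcher: call trainer_fn inline
        self.state = SimpleNamespace(status=None, stage="fit")
        self.loggers = [_StubLogger()]
        self.interrupted = False  # consulted on KeyboardInterrupt
        self.world_size = 1  # single process: no reconciliation on failure

    def _call_callback_hooks(self, hook_name: str, exception: BaseException) -> None:
        print(f"hook {hook_name}: {type(exception).__name__}")

    def _teardown(self) -> None:
        print("teardown ran")


def failing_entry_point() -> None:
    raise RuntimeError("boom")


trainer = _FakeTrainer()
try:
    call_and_handle_interrupt(trainer, failing_entry_point)
except RuntimeError:
    # the BaseException branch set the status, fired on_exception, finalized
    # loggers, tore down, reset the stage, and re-raised
    print(trainer.state.status, trainer.state.stage)  # TrainerStatus.INTERRUPTED None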
pytorch_lightning/trainer/trainer.py
@@ -16,14 +16,13 @@ import inspect
 import logging
 import math
 import os
-import traceback
 import warnings
 from argparse import ArgumentParser, Namespace
 from contextlib import contextmanager
 from copy import deepcopy
 from datetime import timedelta
 from pathlib import Path
-from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Type, Union
+from typing import Any, Dict, Generator, Iterable, List, Optional, Type, Union
 from weakref import proxy

 import torch
@@ -38,7 +37,6 @@ from torch.utils.data import DataLoader
 import pytorch_lightning as pl
 from lightning_lite.utilities.cloud_io import get_filesystem
 from lightning_lite.utilities.data import _auto_add_worker_init_fn
-from lightning_lite.utilities.distributed import distributed_available
 from lightning_lite.utilities.types import _PATH
 from lightning_lite.utilities.warnings import PossibleUserWarning
 from pytorch_lightning.accelerators import (
@@ -75,6 +73,7 @@ from pytorch_lightning.profilers import (
     XLAProfiler,
 )
 from pytorch_lightning.strategies import ParallelStrategy, Strategy
+from pytorch_lightning.trainer import teardown
 from pytorch_lightning.trainer.configuration_validator import verify_loop_configurations
 from pytorch_lightning.trainer.connectors.accelerator_connector import _LITERAL_WARN, AcceleratorConnector
 from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
@@ -620,43 +619,6 @@ class Trainer(
         self._last_train_dl_reload_epoch = None
         self._last_val_dl_reload_epoch: Optional[int] = None

-    def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs: Any) -> Any:
-        r"""
-        Error handling, intended to be used only for main trainer function entry points (fit, validate, test, predict)
-        as all errors should funnel through them
-
-        Args:
-            trainer_fn: one of (fit, validate, test, predict)
-            *args: positional arguments to be passed to the `trainer_fn`
-            **kwargs: keyword arguments to be passed to `trainer_fn`
-        """
-        try:
-            if self.strategy.launcher is not None:
-                return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
-            else:
-                return trainer_fn(*args, **kwargs)
-        # TODO: Unify both exceptions below, where `KeyboardError` doesn't re-raise
-        except KeyboardInterrupt as exception:
-            rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
-            # user could press Ctrl+c many times... only shutdown once
-            if not self.interrupted:
-                self.state.status = TrainerStatus.INTERRUPTED
-                self._call_callback_hooks("on_exception", exception)
-                for logger in self.loggers:
-                    logger.finalize("failed")
-        except BaseException as exception:
-            self.state.status = TrainerStatus.INTERRUPTED
-            if distributed_available() and self.world_size > 1:
-                # try syncing remaining processes, kill otherwise
-                self.strategy.reconciliate_processes(traceback.format_exc())
-            self._call_callback_hooks("on_exception", exception)
-            for logger in self.loggers:
-                logger.finalize("failed")
-            self._teardown()
-            # teardown might access the stage so we reset it after
-            self.state.stage = None
-            raise
-
     def fit(
         self,
         model: "pl.LightningModule",
@@ -686,8 +648,8 @@ class Trainer(
         if not isinstance(model, pl.LightningModule):
             raise TypeError(f"`Trainer.fit()` requires a `LightningModule`, got: {model.__class__.__qualname__}")
         self.strategy._lightning_module = model
-        self._call_and_handle_interrupt(
-            self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
+        teardown.call_and_handle_interrupt(
+            self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
         )

     def _fit_impl(
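From the caller's perspective nothing changes here: `fit` keeps its signature and error handling, only routed through the module-level helper now. A quick smoke sketch, assuming a regular install of this Lightning version (`BoringModel` is the built-in demo module; the `limit_train_batches`/`logger` settings are only there to keep the run tiny):

import pytorch_lightning as pl
from pytorch_lightning.demos.boring_classes import BoringModel

trainer = pl.Trainer(max_epochs=1, limit_train_batches=2, logger=False, enable_checkpointing=False)
trainer.fit(BoringModel())  # internally: teardown.call_and_handle_interrupt(self, self._fit_impl, ...)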
@@ -766,7 +728,9 @@
         if model is not None and not isinstance(model, pl.LightningModule):
             raise TypeError(f"`Trainer.validate()` requires a `LightningModule`, got: {model.__class__.__qualname__}")
         self.strategy._lightning_module = model or self.lightning_module
-        return self._call_and_handle_interrupt(self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule)
+        return teardown.call_and_handle_interrupt(
+            self, self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule
+        )

     def _validate_impl(
         self,
@@ -856,7 +820,9 @@
         if model is not None and not isinstance(model, pl.LightningModule):
             raise TypeError(f"`Trainer.test()` requires a `LightningModule`, got: {model.__class__.__qualname__}")
         self.strategy._lightning_module = model or self.lightning_module
-        return self._call_and_handle_interrupt(self._test_impl, model, dataloaders, ckpt_path, verbose, datamodule)
+        return teardown.call_and_handle_interrupt(
+            self, self._test_impl, model, dataloaders, ckpt_path, verbose, datamodule
+        )

     def _test_impl(
         self,
@@ -945,8 +911,8 @@
         if model is not None and not isinstance(model, pl.LightningModule):
             raise TypeError(f"`Trainer.predict()` requires a `LightningModule`, got: {model.__class__.__qualname__}")
         self.strategy._lightning_module = model or self.lightning_module
-        return self._call_and_handle_interrupt(
-            self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path
+        return teardown.call_and_handle_interrupt(
+            self, self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path
         )

     def _predict_impl(
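One branch the sketches above never hit: when the strategy defines a launcher (e.g. spawn-based strategies), the helper delegates to `launcher.launch(...)` and forwards the trainer as a keyword argument instead of calling `trainer_fn` inline. A self-contained illustration; `_EchoLauncher` and `entry_point` are invented, and a bare `SimpleNamespace` suffices because the happy path only reads `strategy.launcher`:

from types import SimpleNamespace

from pytorch_lightning.trainer.teardown import call_and_handle_interrupt


class _EchoLauncher:
    # invented stand-in for a strategy launcher; real launchers re-run
    # `function` in worker processes instead of calling it inline
    def launch(self, function, *args, trainer=None, **kwargs):
        print(f"launching {function.__name__} for {type(trainer).__name__}")
        return function(*args, **kwargs)


trainer = SimpleNamespace(strategy=SimpleNamespace(launcher=_EchoLauncher()))


def entry_point() -> str:
    return "ok"


print(call_and_handle_interrupt(trainer, entry_point))  # -> ok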