From 51b10f78f4b4c4b704219c619dc5e73784aca57b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Tue, 23 Mar 2021 18:13:29 +0100
Subject: [PATCH] Refactor PyTorch profiler 4/5 (#6349)

Co-authored-by: thomas chaton <thomas@grid.ai>
---
 CHANGELOG.md                                  |   9 +
 pytorch_lightning/profiler/profilers.py       |  12 +-
 pytorch_lightning/profiler/pytorch.py         | 359 +++++++++++-------
 .../trainer/connectors/profiler_connector.py  |   3 +-
 pytorch_lightning/trainer/predict_loop.py     |   4 +
 pytorch_lightning/trainer/training_loop.py    |   2 +-
 pytorch_lightning/utilities/imports.py        |   1 +
 tests/checkpointing/test_torch_saving.py      |   1 +
 tests/deprecated_api/test_remove_1-5.py       |   5 +
 tests/test_profiler.py                        | 178 ++++++---
 tests/trainer/properties/test_get_model.py    |  20 -
 11 files changed, 376 insertions(+), 218 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 81bfa85cc0..e1106189e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `AbstractProfiler` interface ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621))
 
 
+- Added support for including module names for forward in the autograd trace of `PyTorchProfiler` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349))
+
+
 - Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120))
 
 
@@ -72,6 +75,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed profilers to save separate report files per state and rank ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621))
 
 
+- Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349))
+
+
 ### Deprecated
 
 - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146))
@@ -83,6 +89,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621))
 
 
+- Deprecated `PytorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349))
+
+
 - Deprecated metrics in favor of `torchmetrics` ([#6505](https://github.com/PyTorchLightning/pytorch-lightning/pull/6505),
 
     [#6530](https://github.com/PyTorchLightning/pytorch-lightning/pull/6530),
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 54bc5cdf01..46d72583fb 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -126,7 +126,7 @@ class BaseProfiler(AbstractProfiler):
             filename += f"{self._stage}-"
         filename += str(self.filename)
         if self._local_rank is not None:
-            filename += f"-{self.local_rank}"
+            filename += f"-{self._local_rank}"
         filename += ".txt"
         return filename
 
@@ -134,8 +134,7 @@ class BaseProfiler(AbstractProfiler):
         if self._write_stream is not None:
             return
         if self.filename:
-            dirpath = self.dirpath or self._log_dir
-            filepath = os.path.join(dirpath, self._prepare_filename())
+            filepath = os.path.join(self.dirpath, self._prepare_filename())
             fs = get_filesystem(filepath)
             file = fs.open(filepath, "a")
             self._output_file = file
@@ -175,8 +174,7 @@ class BaseProfiler(AbstractProfiler):
         self._stage = stage
         self._local_rank = local_rank
         self._log_dir = log_dir
-        if self.dirpath is None:
-            self.dirpath = self._log_dir
+        self.dirpath = self.dirpath or log_dir
 
     def teardown(self, stage: Optional[str] = None) -> None:
         """
@@ -202,8 +200,8 @@ class BaseProfiler(AbstractProfiler):
         raise NotImplementedError
 
     @property
-    def local_rank(self):
-        return '0' if self._local_rank is None else self._local_rank
+    def local_rank(self) -> int:
+        return 0 if self._local_rank is None else self._local_rank
 
 
 class PassThroughProfiler(BaseProfiler):
diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py
index 55b1c28678..974883a472 100644
--- a/pytorch_lightning/profiler/pytorch.py
+++ b/pytorch_lightning/profiler/pytorch.py
@@ -12,25 +12,92 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Profiler to check if there are any bottlenecks in your code."""
-
 import inspect
 import logging
 import os
+from functools import partial
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Type, TYPE_CHECKING, Union
 
 import torch
+from torch import nn, Tensor
+from torch.autograd.profiler import record_function
 
 from pytorch_lightning.profiler.profilers import BaseProfiler
 from pytorch_lightning.utilities.distributed import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
+if TYPE_CHECKING:
+    from torch.autograd.profiler import EventList
+    from torch.utils.hooks import RemovableHandle
+
+    from pytorch_lightning.core.lightning import LightningModule
+
 log = logging.getLogger(__name__)
 
+_PROFILER = Union[torch.autograd.profiler.profile, torch.cuda.profiler.profile, torch.autograd.profiler.emit_nvtx]
+
+
+class RegisterRecordFunction:
+    """
+    While profiling autograd operations, this class will add labels for module names around the forward function.
+
+    The Lightning PyTorch Profiler will activate this feature automatically. It can be deactivated as follows:
+
+    Example::
+        from pytorch_lightning.profilers import PyTorchProfiler
+        profiler = PyTorchProfiler(record_module_names=False)
+        Trainer(profiler=profiler)
+
+    It can be used outside of Lightning as follows:
+
+    Example::
+        from pytorch_lightning import Trainer, seed_everything
+        with RegisterRecordFunction(model):
+            out = model(batch)
+    """
+
+    def __init__(self, model: nn.Module) -> None:
+        self._model = model
+        self._records: Dict[str, record_function] = {}
+        self._handles: Dict[str, List['RemovableHandle']] = {}
+
+    def _start_recording_forward(self, _: nn.Module, input: Tensor, record_name: str) -> Tensor:
+        record = record_function(record_name)
+        record.__enter__()
+        self._records[record_name] = record
+        return input
+
+    def _stop_recording_forward(self, _: nn.Module, __: Tensor, output: Tensor, record_name: str) -> Tensor:
+        self._records[record_name].__exit__(None, None, None)
+        return output
+
+    def __enter__(self) -> None:
+        for module_name, module in self._model.named_modules():
+            if module_name:
+                full_name = f"{type(module).__module__}.{type(module).__name__}"
+                record_name = f"{full_name}: {module_name}"
+                pre_forward_handle = module.register_forward_pre_hook(
+                    partial(self._start_recording_forward, record_name=record_name)
+                )
+                post_forward_handle = module.register_forward_hook(
+                    partial(self._stop_recording_forward, record_name=record_name)
+                )
+
+                self._handles[module_name] = [pre_forward_handle, post_forward_handle]
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        for handles in self._handles.values():
+            for h in handles:
+                h.remove()
+        self._handles = {}
+
 
 class PyTorchProfiler(BaseProfiler):
 
-    PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step")
+    RECORD_FUNCTIONS = (
+        "training_step_and_backward", "training_step", "backward", "validation_step", "test_step", "predict_step"
+    )
     AVAILABLE_SORT_KEYS = (
         "cpu_time",
         "cuda_time",
@@ -42,27 +109,24 @@ class PyTorchProfiler(BaseProfiler):
         "self_cuda_memory_usage",
         "count",
     )
+    START_RECORD_FUNCTIONS = ('on_train_start', 'on_validation_start', 'on_test_start', 'on_predict_start')
 
     def __init__(
         self,
         dirpath: Optional[Union[str, Path]] = None,
         filename: Optional[str] = None,
-        enabled: bool = True,
-        use_cuda: bool = False,
-        record_shapes: bool = False,
-        profile_memory: bool = False,
         group_by_input_shapes: bool = False,
-        with_stack: bool = False,
-        use_kineto: bool = False,
-        use_cpu: bool = True,
         emit_nvtx: bool = False,
-        export_to_chrome: bool = False,
-        path_to_export_trace: str = None,
+        export_to_chrome: bool = True,
+        path_to_export_trace: Optional[str] = None,
         row_limit: int = 20,
         sort_by_key: Optional[str] = None,
+        record_functions: List[str] = None,
+        record_module_names: bool = True,
         profiled_functions: Optional[List] = None,
         output_filename: Optional[str] = None,
-    ):
+        **profiler_kwargs: Any,
+    ) -> None:
         """
         This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
         different operators inside your model - both on the CPU and GPU
@@ -75,24 +139,8 @@ class PyTorchProfiler(BaseProfiler):
             filename: If present, filename where the profiler results will be saved instead of printing to stdout.
                 The ``.txt`` extension will be used automatically.
 
-            enabled: Setting this to False makes this context manager a no-op.
-
-            use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
-                Adds approximately 4us of overhead to each tensor operation.
-
-            record_shapes: If shapes recording is set, information about input dimensions will be collected.
-
-            profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0)
-
             group_by_input_shapes: Include operator input shapes and group calls by shape.
 
-            with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0)
-
-            use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0)
-
-            use_cpu: use_kineto=True and can be used to lower the overhead
-                for GPU-only profiling (Introduced in PyTorch 1.8.0)
-
             emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
                 Run::
 
@@ -103,164 +151,189 @@ class PyTorchProfiler(BaseProfiler):
                     nvvp trace_name.prof
                     torch.autograd.profiler.load_nvprof(path)
 
-            export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
+            export_to_chrome: Whether to export the sequence of profiled operators for Chrome.
                 It will generate a ``.json`` file which can be read by Chrome.
 
             path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``.
                 By default, it will be save where the file being is being run.
 
-            row_limit: Limit the number of rows in a table, `0` is a special value that
+            row_limit: Limit the number of rows in a table, ``-1`` is a special value that
                 removes the limit completely.
 
-            sort_by_key: Keys to sort out profiled table
+            sort_by_key: Attribute used to sort entries. By default
+                they are printed in the same order as they were registered.
+                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
+                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
+                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
 
-            profiled_functions: list of profiled functions which will create a context manager on.
+            record_functions: list of profiled functions which will create a context manager on.
                 Any other will be pass through.
 
+            record_module_names: Whether to add module names while recording autograd operation.
+
+            profiler_kwargs: Keyword arguments for the PyTorch profiler. This depends on your PyTorch version
+
         Raises:
             MisconfigurationException:
                 If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``.
-            ValueError:
-                If you attempt to stop recording an action which was never started.
         """
+        super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename)
 
-        self.profiled_actions = {}
-        self.enabled = enabled
-        self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS
-        self.use_cuda = use_cuda
-        self.record_shapes = record_shapes
-        self.profile_memory = profile_memory
-        self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total")
-        self.with_stack = with_stack
-        self.group_by_input_shapes = group_by_input_shapes and record_shapes
-        self.use_kineto = use_kineto
-        self.use_cpu = use_cpu
-        self.row_limit = row_limit
-        self.emit_nvtx = emit_nvtx
-        self.export_to_chrome = export_to_chrome
-        self.path_to_export_trace = path_to_export_trace
+        record_functions = self.__deprecation_check(profiled_functions, record_functions)
 
-        if export_to_chrome and path_to_export_trace is None:
+        self._group_by_input_shapes = group_by_input_shapes and profiler_kwargs.get("record_shapes", False)
+        self._emit_nvtx = emit_nvtx
+        self._export_to_chrome = export_to_chrome
+        self._path_to_export_trace = path_to_export_trace
+        self._row_limit = row_limit
+        self._sort_by_key = sort_by_key or f"{'cuda' if profiler_kwargs.get('use_cuda', False) else 'cpu'}_time_total"
+        self._record_functions_start = set(record_functions + list(self.START_RECORD_FUNCTIONS))
+        self._record_functions = set(record_functions + list(self.RECORD_FUNCTIONS))
+        self._record_module_names = record_module_names
+        self._profiler_kwargs = profiler_kwargs
+
+        self.profiler: Optional[_PROFILER] = None
+        self.function_events: Optional['EventList'] = None
+        self._lightning_module: Optional['LightningModule'] = None  # set by ProfilerConnector
+        self._register: Optional[RegisterRecordFunction] = None
+        self._parent_profiler: Optional[_PROFILER] = None
+        self._recording_map: Dict[str, record_function] = {}
+
+        if self._export_to_chrome and self._path_to_export_trace is None:
             rank_zero_warn(
-                "The exported trace would be save locally as `path_to_export_trace` is empty."
+                "The exported trace would be saved locally as `path_to_export_trace` is None."
                 " Note: Each functions will generate its own traced file."
             )
 
-        if self.sort_by_key not in self.AVAILABLE_SORT_KEYS:
+        if self._sort_by_key not in self.AVAILABLE_SORT_KEYS:
             raise MisconfigurationException(
-                f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. "
+                f"Found sort_by_key: {self._sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. "
             )
 
-        self.profiled_actions = {}
-        self.context_names = {}
-        self.running_stack = []
-        self.profiler = None
+    def __deprecation_check(
+        self,
+        profiled_functions: Optional[List[str]],
+        record_functions: Optional[List[str]],
+    ) -> List[str]:
+        if record_functions is None:
+            record_functions = []
 
-        super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename)
+        if profiled_functions is not None:
+            rank_zero_warn(
+                "`PyTorchProfiler.profiled_functions` has been renamed to"
+                " `record_functions` in v1.3 and will be removed in v1.5", DeprecationWarning
+            )
+            if not record_functions:
+                record_functions += profiled_functions
+            else:
+                raise MisconfigurationException(
+                    "You set `PytorchProfiler.profiled_functions` and `PyTorchProfiler.record_functions`."
+                    "  Please use only the later."
+                )
+
+        return record_functions
 
     def setup(
-        self,
-        stage: Optional[str] = None,
-        local_rank: Optional[int] = None,
-        log_dir: Optional[str] = None
+        self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None
     ) -> None:
         super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir)
 
         # if the user didn't provide `path_to_export_trace`,
         # set it as TensorBoardLogger log_dir if exists
-        if self.path_to_export_trace is None:
-            self.path_to_export_trace = log_dir
+        if self._path_to_export_trace is None:
+            self._path_to_export_trace = log_dir
 
     def start(self, action_name: str) -> None:
-        if action_name not in self.profiled_functions:
-            return
+        if self.profiler is None and action_name in self._record_functions_start:
 
-        if len(self.running_stack) > 0:
-            self._stop(self.running_stack[-1])
-        self.running_stack.append(action_name)
+            # close profiler if it is already opened. might happen if 2 profilers
+            # are created and the first one did not call `describe`
+            try:
+                torch.autograd._disable_profiler()  # noqa
+            except (AttributeError, RuntimeError):
+                pass
 
-        self.context_names[action_name] = "/".join(self.running_stack)
+            self._create_profilers()
 
-        self._start(action_name)
+            self.profiler.__enter__()
+            if self._parent_profiler is not None:
+                self._parent_profiler.__enter__()
+            if self._register is not None:
+                self._register.__enter__()
 
-    def _start(self, action_name: str) -> None:
-        if self.emit_nvtx:
-            self._parent_profiler = self._create_profiler(action_name, torch.cuda.profiler.profile, enter=True)
-            self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
-        else:
-            self._create_profiler(action_name, torch.autograd.profiler.profile)
-
-    def _create_profiler(self, action_name, profiler, enter=True):
-        init_args = inspect.signature(profiler.__init__).parameters
-        profiler_args = {k: v for k, v in vars(self).items() if k in init_args}
-        pr = profiler(**profiler_args)
-        if enter:
-            out_pr = pr.__enter__()
-            if out_pr is not None:
-                pr = out_pr
-        self.profiler = pr
-        return self.profiler
-
-    def _stop(self, action_name: str) -> None:
-        if self.profiler is None:
-            return
-
-        self.profiler.__exit__(exc_type=None, exc_val=None, exc_tb=None)
-
-        if isinstance(self.profiler, torch.autograd.profiler.emit_nvtx):
-            # when running ``emit_nvtx``, PyTorch requires 2 context manager.
-            # The parent_profiler is being closed too.
-            self._parent_profiler.__exit__(None, None, None)
-            self._parent_profiler = None
-            return
-
-        function_events = self.profiler.function_events
-        self.profiler = None
-        for name in self.running_stack:
-            if name not in self.profiled_actions:
-                self.profiled_actions[name] = function_events
-            else:
-                self.profiled_actions[name] += function_events
+        if (
+            self.profiler is not None and action_name in self._record_functions
+            and action_name not in self._recording_map
+        ):
+            recording = record_function(action_name)
+            recording.__enter__()
+            self._recording_map[action_name] = recording
 
     def stop(self, action_name: str) -> None:
-        if action_name not in self.profiled_functions:
-            return
-
-        if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
-            raise ValueError(  # pragma: no-cover
-                f"Attempting to stop recording an action ({action_name}) which was never started."
-            )
-        self._stop(action_name)
-        self.running_stack.pop()
-        # restore running profiler
-        if len(self.running_stack) > 0:
-            self._start(self.running_stack[-1])
+        if action_name in self._recording_map:
+            self._recording_map[action_name].__exit__(None, None, None)
+            del self._recording_map[action_name]
 
     def summary(self) -> str:
-        recorded_stats = {}
-        output_string = ''
+        if not self._profiler_kwargs.get("enabled", True) or self._emit_nvtx:
+            return ""
 
-        if not self.enabled:
-            return output_string
+        self._delete_profilers()
 
-        for action_name, function_events in self.profiled_actions.items():
+        if not self.function_events:
+            return ""
 
-            # next line is a workaround for a pytorch issue (fixed on master, still present
-            # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
-            # parent event for detach`
-            function_events.populate_cpu_children = lambda: None
+        if self._export_to_chrome:
+            filename = f"{self.local_rank}_trace.json"
+            path_to_trace = (
+                filename if self._path_to_export_trace is None else os.path.join(self._path_to_export_trace, filename)
+            )
+            self.function_events.export_chrome_trace(path_to_trace)
 
-            if self.export_to_chrome:
-                filename = f"{action_name}_{self.local_rank}_trace.json"
-                path_to_trace = filename if self.path_to_export_trace is None \
-                    else os.path.join(self.path_to_export_trace, filename)
-                function_events.export_chrome_trace(path_to_trace)
+        data = self.function_events.key_averages(group_by_input_shapes=self._group_by_input_shapes)
+        table = data.table(sort_by=self._sort_by_key, row_limit=self._row_limit)
 
-            if self.emit_nvtx:
-                return output_string
-
-            else:
-                data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes)
-                table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit)
-                recorded_stats[action_name] = table
+        recorded_stats = {"records": table}
         return self._stats_to_str(recorded_stats)
+
+    def _create_profilers(self) -> None:
+        if self._emit_nvtx:
+            self._parent_profiler = self._create_profiler(torch.cuda.profiler.profile)
+            self.profiler = self._create_profiler(torch.autograd.profiler.emit_nvtx)
+        else:
+            self._parent_profiler = None
+            self.profiler = self._create_profiler(torch.autograd.profiler.profile)
+        if self._record_module_names and self._lightning_module is not None:
+            self._register = RegisterRecordFunction(self._lightning_module)
+
+    def _create_profiler(self, profiler: Type[_PROFILER]) -> _PROFILER:
+        init_parameters = inspect.signature(profiler.__init__).parameters
+        kwargs = {k: v for k, v in self._profiler_kwargs.items() if k in init_parameters}
+        return profiler(**kwargs)
+
+    def _cache_functions_events(self):
+        if not self._emit_nvtx:
+            self.function_events = self.profiler.function_events
+
+    def _delete_profilers(self) -> None:
+        if self.profiler is not None:
+            self.profiler.__exit__(None, None, None)
+            self._cache_functions_events()
+            self.profiler = None
+
+        if self._parent_profiler is not None:
+            self._parent_profiler.__exit__(None, None, None)
+            self._parent_profiler = None
+
+        if self._register is not None:
+            self._register.__exit__(None, None, None)
+            self._register = None
+
+    def teardown(self, stage: Optional[str] = None) -> None:
+        self._delete_profilers()
+
+        for k in self._recording_map:
+            self.stop(k)
+        self._recording_map = {}
+
+        super().teardown(stage=stage)
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index e628d6d96b..191e871146 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
-
 from typing import Union
+from weakref import proxy
 
 from pytorch_lightning.profiler import (
     AdvancedProfiler,
@@ -57,4 +57,5 @@ class ProfilerConnector:
     def setup(self) -> None:
         trainer = self.trainer
         local_rank = trainer.local_rank if trainer.world_size > 1 else None
+        trainer.profiler.lightning_module = proxy(trainer.lightning_module)
         trainer.profiler.setup(stage=trainer._setup_state, local_rank=local_rank, log_dir=trainer.log_dir)
diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py
index 53e82fd3f6..b33f41cb2e 100644
--- a/pytorch_lightning/trainer/predict_loop.py
+++ b/pytorch_lightning/trainer/predict_loop.py
@@ -44,6 +44,8 @@ class PredictLoop(object):
         model_ref.on_predict_model_eval()
 
     def setup(self, model, max_batches, dataloaders):
+        self.trainer.call_hook("on_predict_start")
+
         # copy properties for forward overrides
         self.trainer.model_connector.copy_trainer_model_properties(model)
 
@@ -86,6 +88,8 @@ class PredictLoop(object):
         return
 
     def on_predict_epoch_end(self):
+        self.trainer.profiler.describe()
+
         self.trainer._progress_bar_callback.on_predict_end(self.trainer, self.trainer.lightning_module)
 
         results = self._predictions
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index cc471f76b6..c3ba34ca66 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -743,7 +743,7 @@ class TrainLoop:
 
                 # backward pass
                 if result is not None:
-                    with self.trainer.profiler.profile("model_backward"):
+                    with self.trainer.profiler.profile("backward"):
                         self.backward(result, optimizer, opt_idx)
 
                     # hook - call this hook only
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 8090c4ed65..5a780660a0 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -68,6 +68,7 @@ _IS_INTERACTIVE = hasattr(sys, "ps1")  # https://stackoverflow.com/a/64523765
 _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0")
 _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
 _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0")
+_TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0")
 
 _APEX_AVAILABLE = _module_available("apex.amp")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py
index c8b1e96aea..8eabc46400 100644
--- a/tests/checkpointing/test_torch_saving.py
+++ b/tests/checkpointing/test_torch_saving.py
@@ -47,6 +47,7 @@ def test_model_torch_save_ddp_cpu(tmpdir):
         max_epochs=num_epochs,
         accelerator="ddp_cpu",
         num_processes=2,
+        logger=False,
     )
     temp_path = os.path.join(tmpdir, 'temp.pt')
     trainer.fit(model)
diff --git a/tests/deprecated_api/test_remove_1-5.py b/tests/deprecated_api/test_remove_1-5.py
index 0c5f581d77..725db1180d 100644
--- a/tests/deprecated_api/test_remove_1-5.py
+++ b/tests/deprecated_api/test_remove_1-5.py
@@ -81,6 +81,11 @@ def test_v1_5_0_old_callback_on_save_checkpoint(tmpdir):
         trainer.save_checkpoint(filepath)
 
 
+def test_v1_5_0_legacy_profiler_argument():
+    with pytest.deprecated_call(match="renamed to `record_functions` in v1.3"):
+        PyTorchProfiler(profiled_functions=[])
+
+
 def test_v1_5_0_running_sanity_check():
     trainer = Trainer()
     with pytest.deprecated_call(match='has been renamed to `Trainer.sanity_checking`'):
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index cf6afcc9b6..5d144aef36 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import logging
 import os
+import platform
 import time
 from copy import deepcopy
 from distutils.version import LooseVersion
@@ -24,6 +25,9 @@ import torch
 
 from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.profiler import AdvancedProfiler, PyTorchProfiler, SimpleProfiler
+from pytorch_lightning.profiler.pytorch import RegisterRecordFunction
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -126,10 +130,10 @@ def test_simple_profiler_log_dir(tmpdir):
     )
     trainer.fit(model)
 
-    expected = profiler.dirpath
+    expected = tmpdir / "lightning_logs" / "version_0"
     assert trainer.log_dir == expected
     assert profiler._log_dir == trainer.log_dir
-    assert Path(os.path.join(profiler.dirpath, "fit-profiler.txt")).exists()
+    assert expected.join("fit-profiler.txt").exists()
 
 
 @RunIf(skip_windows=True)
@@ -264,8 +268,8 @@ def pytorch_profiler(tmpdir):
 
 def test_pytorch_profiler_describe(pytorch_profiler):
     """Ensure the profiler won't fail when reporting the summary."""
-    with pytorch_profiler.profile("test_step"):
-        pass
+    with pytorch_profiler.profile("on_test_start"):
+        torch.tensor(0)
 
     # log to stdout and print to file
     pytorch_profiler.describe()
@@ -274,15 +278,10 @@ def test_pytorch_profiler_describe(pytorch_profiler):
     assert len(data) > 0
 
 
-def test_pytorch_profiler_value_errors(pytorch_profiler):
+def test_pytorch_profiler_raises(pytorch_profiler):
     """Ensure errors are raised where expected."""
-
-    action = "test_step"
-    with pytest.raises(ValueError):
-        pytorch_profiler.stop(action)
-
-    pytorch_profiler.start(action)
-    pytorch_profiler.stop(action)
+    with pytest.raises(MisconfigurationException, match="profiled_functions` and `PyTorchProfiler.record"):
+        PyTorchProfiler(profiled_functions=["a"], record_functions=["b"])
 
 
 @RunIf(min_torch="1.6.0")
@@ -299,9 +298,8 @@ def test_advanced_profiler_cprofile_deepcopy(tmpdir):
 
 
 @RunIf(min_gpus=2, special=True)
-def test_pytorch_profiler_trainer_ddp(tmpdir):
+def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler):
     """Ensure that the profiler can be given to the training and default step are properly recorded. """
-    pytorch_profiler = PyTorchProfiler(dirpath=None, filename="profiler")
     model = BoringModel()
     trainer = Trainer(
         max_epochs=1,
@@ -314,17 +312,68 @@ def test_pytorch_profiler_trainer_ddp(tmpdir):
     )
     trainer.fit(model)
 
-    assert len(pytorch_profiler.summary()) > 0
-    assert set(pytorch_profiler.profiled_actions) == {'training_step_and_backward', 'validation_step'}
+    expected = ('validation_step', 'training_step_and_backward', 'training_step', 'backward')
+    for name in expected:
+        assert sum(e.name == name for e in pytorch_profiler.function_events)
 
-    files = sorted(f for f in os.listdir(pytorch_profiler.dirpath) if "fit" in f)
-    rank = int(os.getenv("LOCAL_RANK", "0"))
-    expected = f"fit-profiler-{rank}.txt"
-    assert files[rank] == expected
+    files = set(os.listdir(pytorch_profiler.dirpath))
+    expected = f"fit-profiler-{trainer.local_rank}.txt"
+    assert expected in files
 
     path = os.path.join(pytorch_profiler.dirpath, expected)
-    data = Path(path).read_text("utf-8")
-    assert len(data) > 0
+    assert Path(path).read_text()
+
+
+def test_pytorch_profiler_trainer_test(tmpdir, pytorch_profiler):
+    """Ensure that the profiler can be given to the trainer and test step are properly recorded. """
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_test_batches=2,
+        profiler=pytorch_profiler,
+    )
+    trainer.test(model)
+
+    assert sum(e.name == 'test_step' for e in pytorch_profiler.function_events)
+
+    path = pytorch_profiler.dirpath / f"test-{pytorch_profiler.filename}.txt"
+    assert path.read_text("utf-8")
+
+
+def test_pytorch_profiler_trainer_predict(tmpdir, pytorch_profiler):
+    """Ensure that the profiler can be given to the trainer and predict function are properly recorded. """
+    model = BoringModel()
+    model.predict_dataloader = model.train_dataloader
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_test_batches=2,
+        profiler=pytorch_profiler,
+    )
+    trainer.predict(model)
+
+    assert sum(e.name == 'predict_step' for e in pytorch_profiler.function_events)
+
+    path = pytorch_profiler.dirpath / f"predict-{pytorch_profiler.filename}.txt"
+    assert path.read_text("utf-8")
+
+
+def test_pytorch_profiler_trainer_validate(tmpdir, pytorch_profiler):
+    """Ensure that the profiler can be given to the trainer and validate function are properly recorded. """
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_val_batches=2,
+        profiler=pytorch_profiler,
+    )
+    trainer.validate(model)
+
+    assert sum(e.name == 'validation_step' for e in pytorch_profiler.function_events)
+
+    path = pytorch_profiler.dirpath / f"validate-{pytorch_profiler.filename}.txt"
+    assert path.read_text("utf-8")
 
 
 def test_pytorch_profiler_nested(tmpdir):
@@ -341,34 +390,31 @@ def test_pytorch_profiler_nested(tmpdir):
         with pytorch_profiler.profile("c"):
             _ = a + b
 
-    pa = pytorch_profiler.profiled_actions
+    pytorch_profiler.describe()
 
-    # From PyTorch 1.8.0, less operation are being traced.
-    if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"):
-        expected_ = {
-            'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'],
-            'b': ['zeros', 'empty', 'zero_'],
-            'c': ['add'],
-        }
-    # From PyTorch 1.6.0, more operation are being traced.
-    elif LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
-        expected_ = {
-            'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty'],
-            'b': ['zeros', 'empty', 'zero_', 'fill_'],
-            'c': ['add', 'empty'],
-        }
+    events_name = {e.name for e in pytorch_profiler.function_events}
+
+    if platform.system() == "Windows":
+        expected = {'a', 'add', 'b', 'c', 'profiler::_record_function_enter', 'profiler::_record_function_exit'}
     else:
-        expected_ = {
-            'a': ['add'],
-            'b': [],
-            'c': ['add'],
+        expected = {
+            'signed char', 'add', 'profiler::_record_function_exit', 'bool', 'char', 'profiler::_record_function_enter'
         }
 
-    for n in ('a', 'b', 'c'):
-        pa[n] = [e.name for e in pa[n]]
-        if LooseVersion(torch.__version__) >= LooseVersion("1.7.1"):
-            pa[n] = [e.replace("aten::", "") for e in pa[n]]
-        assert pa[n] == expected_[n]
+    if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+        expected = {'add', 'zeros', 'ones', 'zero_', 'b', 'fill_', 'c', 'a', 'empty'}
+
+    if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
+        expected = {
+            'aten::zeros', 'aten::add', 'aten::zero_', 'c', 'b', 'a', 'aten::fill_', 'aten::empty', 'aten::ones'
+        }
+
+    if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"):
+        expected = {
+            'aten::ones', 'a', 'aten::add', 'aten::empty', 'aten::zero_', 'b', 'c', 'aten::zeros', 'aten::fill_'
+        }
+
+    assert events_name == expected, (events_name, torch.__version__, platform.system())
 
 
 @RunIf(min_gpus=1, special=True)
@@ -387,6 +433,43 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
     trainer.fit(model)
 
 
+@RunIf(min_torch="1.5.0")
+def test_register_record_function(tmpdir):
+
+    use_cuda = torch.cuda.is_available()
+    pytorch_profiler = PyTorchProfiler(
+        export_to_chrome=False,
+        record_functions=["a"],
+        use_cuda=use_cuda,
+        dirpath=tmpdir,
+        filename="profiler",
+    )
+
+    class TestModel(BoringModel):
+
+        def __init__(self):
+            super().__init__()
+            self.layer = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
+
+    model = TestModel()
+    input = torch.rand((1, 8))
+
+    if use_cuda:
+        model = model.cuda()
+        input = input.cuda()
+
+    with pytorch_profiler.profile("a"):
+        with RegisterRecordFunction(model):
+            model(input)
+
+    pytorch_profiler.describe()
+    event_names = [e.name for e in pytorch_profiler.function_events]
+    assert 'torch.nn.modules.container.Sequential: layer' in event_names
+    assert 'torch.nn.modules.linear.Linear: layer.0' in event_names
+    assert 'torch.nn.modules.activation.ReLU: layer.1' in event_names
+    assert 'torch.nn.modules.linear.Linear: layer.2' in event_names
+
+
 @pytest.mark.parametrize("cls", (SimpleProfiler, AdvancedProfiler, PyTorchProfiler))
 def test_profiler_teardown(tmpdir, cls):
     """
@@ -407,6 +490,9 @@ def test_profiler_teardown(tmpdir, cls):
     assert profiler._output_file is None
 
 
+@pytest.mark.skipif(_TORCH_GREATER_EQUAL_1_8, reason="currently not supported for PyTorch 1.8")
 def test_pytorch_profiler_deepcopy(pytorch_profiler):
+    pytorch_profiler.start("on_train_start")
+    torch.tensor(1)
     pytorch_profiler.describe()
     assert deepcopy(pytorch_profiler)
diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py
index 3eb0596b55..5dc1ea5de4 100644
--- a/tests/trainer/properties/test_get_model.py
+++ b/tests/trainer/properties/test_get_model.py
@@ -80,23 +80,3 @@ def test_get_model_gpu(tmpdir):
         gpus=1,
     )
     trainer.fit(model)
-
-
-@RunIf(min_gpus=1, skip_windows=True)
-def test_get_model_ddp_gpu(tmpdir):
-    """
-    Tests that `trainer.lightning_module` extracts the model correctly when using GPU + ddp accelerators
-    """
-
-    model = TrainerGetModel()
-
-    limit_train_batches = 2
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=limit_train_batches,
-        limit_val_batches=2,
-        max_epochs=1,
-        gpus=1,
-    )
-    trainer.fit(model)
-    return 1