# Copyright The PyTorch Lightning team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Profiler to check if there are any bottlenecks in your code.""" import inspect import logging import os from pathlib import Path from typing import List, Optional, Union import torch from pytorch_lightning.profiler.profilers import BaseProfiler from pytorch_lightning.utilities.distributed import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException log = logging.getLogger(__name__) class PyTorchProfiler(BaseProfiler): PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step") AVAILABLE_SORT_KEYS = ( "cpu_time", "cuda_time", "cpu_time_total", "cuda_time_total", "cpu_memory_usage", "cuda_memory_usage", "self_cpu_memory_usage", "self_cuda_memory_usage", "count", ) def __init__( self, dirpath: Optional[Union[str, Path]] = None, filename: Optional[str] = None, enabled: bool = True, use_cuda: bool = False, record_shapes: bool = False, profile_memory: bool = False, group_by_input_shapes: bool = False, with_stack: bool = False, use_kineto: bool = False, use_cpu: bool = True, emit_nvtx: bool = False, export_to_chrome: bool = False, path_to_export_trace: str = None, row_limit: int = 20, sort_by_key: Optional[str] = None, profiled_functions: Optional[List] = None, output_filename: Optional[str] = None, ): """ This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of different operators inside your model - both on the CPU and GPU Args: dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) will be used. filename: If present, filename where the profiler results will be saved instead of printing to stdout. The ``.txt`` extension will be used automatically. enabled: Setting this to False makes this context manager a no-op. use_cuda: Enables timing of CUDA events as well using the cudaEvent API. Adds approximately 4us of overhead to each tensor operation. record_shapes: If shapes recording is set, information about input dimensions will be collected. profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0) group_by_input_shapes: Include operator input shapes and group calls by shape. with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0) use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0) use_cpu: use_kineto=True and can be used to lower the overhead for GPU-only profiling (Introduced in PyTorch 1.8.0) emit_nvtx: Context manager that makes every autograd operation emit an NVTX range Run:: nvprof --profile-from-start off -o trace_name.prof -- To visualize, you can either use:: nvvp trace_name.prof torch.autograd.profiler.load_nvprof(path) export_to_chrome: Wether to export the sequence of profiled operators for Chrome. It will generate a ``.json`` file which can be read by Chrome. path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``. By default, it will be save where the file being is being run. row_limit: Limit the number of rows in a table, `0` is a special value that removes the limit completely. sort_by_key: Keys to sort out profiled table profiled_functions: list of profiled functions which will create a context manager on. Any other will be pass through. Raises: MisconfigurationException: If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``. ValueError: If you attempt to stop recording an action which was never started. """ self.profiled_actions = {} self.enabled = enabled self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS self.use_cuda = use_cuda self.record_shapes = record_shapes self.profile_memory = profile_memory self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total") self.with_stack = with_stack self.group_by_input_shapes = group_by_input_shapes and record_shapes self.use_kineto = use_kineto self.use_cpu = use_cpu self.row_limit = row_limit self.emit_nvtx = emit_nvtx self.export_to_chrome = export_to_chrome self.path_to_export_trace = path_to_export_trace if export_to_chrome and path_to_export_trace is None: rank_zero_warn( "The exported trace would be save locally as `path_to_export_trace` is empty." " Note: Each functions will generate its own traced file." ) if self.sort_by_key not in self.AVAILABLE_SORT_KEYS: raise MisconfigurationException( f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. " ) self.profiled_actions = {} self.context_names = {} self.running_stack = [] self.profiler = None super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) def setup( self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None ) -> None: super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir) # if the user didn't provide `path_to_export_trace`, # set it as TensorBoardLogger log_dir if exists if self.path_to_export_trace is None: self.path_to_export_trace = log_dir def start(self, action_name: str) -> None: if action_name not in self.profiled_functions: return if len(self.running_stack) > 0: self._stop(self.running_stack[-1]) self.running_stack.append(action_name) self.context_names[action_name] = "/".join(self.running_stack) self._start(action_name) def _start(self, action_name: str) -> None: if self.emit_nvtx: self._parent_profiler = self._create_profiler(action_name, torch.cuda.profiler.profile, enter=True) self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx) else: self._create_profiler(action_name, torch.autograd.profiler.profile) def _create_profiler(self, action_name, profiler, enter=True): init_args = inspect.signature(profiler.__init__).parameters profiler_args = {k: v for k, v in vars(self).items() if k in init_args} pr = profiler(**profiler_args) if enter: out_pr = pr.__enter__() if out_pr is not None: pr = out_pr self.profiler = pr return self.profiler def _stop(self, action_name: str) -> None: if self.profiler is None: return self.profiler.__exit__(exc_type=None, exc_val=None, exc_tb=None) if isinstance(self.profiler, torch.autograd.profiler.emit_nvtx): # when running ``emit_nvtx``, PyTorch requires 2 context manager. # The parent_profiler is being closed too. self._parent_profiler.__exit__(None, None, None) self._parent_profiler = None return function_events = self.profiler.function_events self.profiler = None for name in self.running_stack: if name not in self.profiled_actions: self.profiled_actions[name] = function_events else: self.profiled_actions[name] += function_events def stop(self, action_name: str) -> None: if action_name not in self.profiled_functions: return if len(self.running_stack) == 0 or self.running_stack[-1] != action_name: raise ValueError( # pragma: no-cover f"Attempting to stop recording an action ({action_name}) which was never started." ) self._stop(action_name) self.running_stack.pop() # restore running profiler if len(self.running_stack) > 0: self._start(self.running_stack[-1]) def summary(self) -> str: recorded_stats = {} output_string = '' if not self.enabled: return output_string for action_name, function_events in self.profiled_actions.items(): # next line is a workaround for a pytorch issue (fixed on master, still present # on 1.7). Without it the code fails with `AssertionError: There is already a CPU # parent event for detach` function_events.populate_cpu_children = lambda: None if self.export_to_chrome: filename = f"{action_name}_{self.local_rank}_trace.json" path_to_trace = filename if self.path_to_export_trace is None \ else os.path.join(self.path_to_export_trace, filename) function_events.export_chrome_trace(path_to_trace) if self.emit_nvtx: return output_string else: data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes) table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit) recorded_stats[action_name] = table return self._stats_to_str(recorded_stats)