diff --git a/pyproject.toml b/pyproject.toml
index dd48b8126a..777f86841a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,6 @@ warn_no_return = "False"
 # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
 module = [
     "pytorch_lightning.callbacks.progress.rich_progress",
-    "pytorch_lightning.profilers.pytorch",
     "pytorch_lightning.trainer.trainer",
     "pytorch_lightning.tuner.batch_size_scaling",
     "pytorch_lightning.utilities.data",
diff --git a/src/pytorch_lightning/profilers/pytorch.py b/src/pytorch_lightning/profilers/pytorch.py
index c7f34fdc79..475db682d9 100644
--- a/src/pytorch_lightning/profilers/pytorch.py
+++ b/src/pytorch_lightning/profilers/pytorch.py
@@ -17,7 +17,7 @@ import logging
 import os
 from functools import lru_cache, partial
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union
+from typing import Any, Callable, ContextManager, Dict, List, Optional, Type, TYPE_CHECKING, Union
 
 import torch
 from lightning_utilities.core.rank_zero import WarningCache
@@ -42,7 +42,7 @@ if _KINETO_AVAILABLE:
 log = logging.getLogger(__name__)
 warning_cache = WarningCache()
 
-_PROFILER = Union[torch.autograd.profiler.profile, torch.cuda.profiler.profile, torch.autograd.profiler.emit_nvtx]
+_PROFILER = Union[torch.profiler.profile, torch.autograd.profiler.profile, torch.autograd.profiler.emit_nvtx]
 
 
 class RegisterRecordFunction:
@@ -111,13 +111,7 @@ class ScheduleWrapper:
         self._schedule = schedule
         self.reset()
 
-    def setup(self, start_action_name: str) -> None:
-        self._start_action_name = start_action_name
-
-    def pre_step(self, current_action: str) -> None:
-        self._current_action = current_action
-
-    def reset(self):
+    def reset(self) -> None:
         # handle properly `fast_dev_run`. PyTorch Profiler will fail otherwise.
         self._num_training_step = 0
         self._num_validation_step = 0
@@ -132,20 +126,30 @@ class ScheduleWrapper:
         self._prev_schedule_action: Optional[ProfilerAction] = None
         self._start_action_name: Optional[str] = None
 
+    def setup(self, start_action_name: str) -> None:
+        self._start_action_name = start_action_name
+
+    def pre_step(self, current_action: str) -> None:
+        self._current_action = current_action
+
     @property
-    def is_training(self):
+    def is_training(self) -> bool:
+        assert self._current_action is not None
         return self._current_action.endswith("training_step")
 
     @property
-    def is_validating(self):
+    def is_validating(self) -> bool:
+        assert self._current_action is not None
         return self._current_action.endswith("validation_step")
 
     @property
-    def is_testing(self):
+    def is_testing(self) -> bool:
+        assert self._current_action is not None
         return self._current_action.endswith("test_step")
 
     @property
-    def is_predicting(self):
+    def is_predicting(self) -> bool:
+        assert self._current_action is not None
         return self._current_action.endswith("predict_step")
 
     @property
@@ -164,6 +168,7 @@
         if self.is_training:
             self._num_training_step += 1
         elif self.is_validating:
+            assert self._start_action_name is not None
             if self._start_action_name.endswith("on_fit_start"):
                 if self._num_training_step > 0:
                     self._num_validation_step += 1
@@ -238,7 +243,7 @@ class PyTorchProfiler(Profiler):
         record_module_names: bool = True,
         **profiler_kwargs: Any,
     ) -> None:
-        """This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of.
+        r"""This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of.
 
         different operators inside your model - both on the CPU and GPU
 
@@ -276,7 +281,7 @@ class PyTorchProfiler(Profiler):
 
             record_module_names: Whether to add module names while recording autograd operation.
 
-            profiler_kwargs: Keyword arguments for the PyTorch profiler. This depends on your PyTorch version
+            \**profiler_kwargs: Keyword arguments for the PyTorch profiler. This depends on your PyTorch version
 
         Raises:
             MisconfigurationException:
@@ -298,7 +303,7 @@ class PyTorchProfiler(Profiler):
         self.function_events: Optional["EventList"] = None
         self._lightning_module: Optional["LightningModule"] = None  # set by ProfilerConnector
         self._register: Optional[RegisterRecordFunction] = None
-        self._parent_profiler: Optional[_PROFILER] = None
+        self._parent_profiler: Optional[ContextManager] = None
         self._recording_map: Dict[str, record_function] = {}
         self._start_action_name: Optional[str] = None
         self._schedule: Optional[ScheduleWrapper] = None
@@ -317,7 +322,7 @@ class PyTorchProfiler(Profiler):
 
         schedule = profiler_kwargs.get("schedule", None)
         if schedule is not None:
-            if not isinstance(schedule, Callable):
+            if not callable(schedule):
                 raise MisconfigurationException(f"Schedule should be a callable. Found: {schedule}")
             action = schedule(0)
             if not isinstance(action, ProfilerAction):
@@ -337,7 +342,9 @@ class PyTorchProfiler(Profiler):
         self._profiler_kwargs["with_stack"] = with_stack
 
     @property
-    def _total_steps(self) -> int:
+    def _total_steps(self) -> Union[int, float]:
+        assert self._schedule is not None
+        assert self._lightning_module is not None
         trainer = self._lightning_module.trainer
         if self._schedule.is_training:
             return trainer.num_training_batches
@@ -358,13 +365,13 @@ class PyTorchProfiler(Profiler):
 
     @staticmethod
     @lru_cache(1)
-    def _default_schedule() -> Optional[callable]:
+    def _default_schedule() -> Optional[Callable]:
         if _KINETO_AVAILABLE:
             # Those schedule defaults allow the profiling overhead to be negligible over training time.
             return torch.profiler.schedule(wait=1, warmup=1, active=3)
 
     def _default_activities(self) -> List["ProfilerActivity"]:
-        activities = []
+        activities: List["ProfilerActivity"] = []
         if not _KINETO_AVAILABLE:
             return activities
         if self._profiler_kwargs.get("use_cpu", True):
@@ -411,6 +418,7 @@ class PyTorchProfiler(Profiler):
             return
 
         if self.profiler is not None and any(action_name.endswith(func) for func in self.STEP_FUNCTIONS):
+            assert isinstance(self.profiler, torch.profiler.profile)
            if self._schedule is not None:
                 self._schedule.pre_step(action_name)
 
@@ -424,11 +432,11 @@ class PyTorchProfiler(Profiler):
                 self._schedule = None
                 self.profiler.schedule = torch.profiler.profiler._default_schedule_fn
 
-            def on_trace_ready(profiler):
+            def on_trace_ready(profiler: _PROFILER) -> None:
                 if self.dirpath is not None:
                     if self._export_to_chrome:
                         handler = tensorboard_trace_handler(
-                            self.dirpath, self._prepare_filename(action_name=action_name, extension="")
+                            str(self.dirpath), self._prepare_filename(action_name=action_name, extension="")
                         )
                         handler(profiler)
 
@@ -436,6 +444,7 @@ class PyTorchProfiler(Profiler):
                         path = os.path.join(
                             self.dirpath, self._prepare_filename(action_name=action_name, extension=".stack")
                         )
+                        assert isinstance(profiler, torch.autograd.profiler.profile)
                         profiler.export_stacks(path, metric=self._metric)
                 else:
                     rank_zero_warn("The PyTorchProfiler failed to export trace as `dirpath` is None")
@@ -469,8 +478,12 @@ class PyTorchProfiler(Profiler):
         return self._stats_to_str(recorded_stats)
 
     def _create_profilers(self) -> None:
+        if self.profiler is not None:
+            return
+
         if self._emit_nvtx:
-            self._parent_profiler = self._create_profiler(torch.cuda.profiler.profile)
+            if self._parent_profiler is None:
+                self._parent_profiler = torch.cuda.profiler.profile()
             self.profiler = self._create_profiler(torch.autograd.profiler.emit_nvtx)
         else:
             self._parent_profiler = None
@@ -486,7 +499,13 @@ class PyTorchProfiler(Profiler):
     def _cache_functions_events(self) -> None:
         if self._emit_nvtx:
             return
-        self.function_events = self.profiler.events() if _KINETO_AVAILABLE else self.profiler.function_events
+
+        if _KINETO_AVAILABLE:
+            assert isinstance(self.profiler, torch.profiler.profile)
+            self.function_events = self.profiler.events()
+        else:
+            assert isinstance(self.profiler, torch.autograd.profiler.profile)
+            self.function_events = self.profiler.function_events
 
     def _delete_profilers(self) -> None:
         if self.profiler is not None:
@@ -505,7 +524,7 @@ class PyTorchProfiler(Profiler):
             self._register.__exit__(None, None, None)
             self._register = None
 
-    def teardown(self, stage: str) -> None:
+    def teardown(self, stage: Optional[str]) -> None:
         self._delete_profilers()
 
         for k in list(self._recording_map):
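For context when reviewing the `callable(schedule)` and `_default_schedule()` changes above, here is a minimal usage sketch. It is not part of the diff: it shows how a user-supplied `schedule` travels through `**profiler_kwargs` into `torch.profiler.profile` when Kineto is available, mirroring the default `torch.profiler.schedule(wait=1, warmup=1, active=3)`. The `dirpath`/`filename` values and the commented-out `model` are illustrative placeholders only.

# Illustrative sketch (not part of this diff): passing a custom profiling schedule
# through `**profiler_kwargs`; it must be callable and return a `ProfilerAction`,
# which the `callable(schedule)` check in `_init_kineto` validates.
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.profilers import PyTorchProfiler

# Same shape as the profiler's default schedule, repeated twice.
custom_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2)

profiler = PyTorchProfiler(
    dirpath="profiler_logs",  # hypothetical output directory
    filename="perf",  # hypothetical trace file name
    export_to_chrome=True,  # traces are written via `tensorboard_trace_handler`
    schedule=custom_schedule,  # forwarded via **profiler_kwargs to torch.profiler.profile
)

trainer = Trainer(profiler=profiler, max_epochs=1)
# trainer.fit(model)  # `model` would be any LightningModule; omitted here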