diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7186c51936..fa9753e063 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -91,7 +91,7 @@ jobs:
     docker:
       - image: circleci/python:3.7
     environment:
-      - XLA_VER: 1.7
+      - XLA_VER: 1.8
       - MAX_CHECKS: 240
       - CHECK_SPEEP: 5
     steps:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dbb1c5798a..54c9647a57 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,6 +118,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Add torchelastic check when sanitizing GPUs ([#8095](https://github.com/PyTorchLightning/pytorch-lightning/pull/8095))
 
 
+- Added XLA Profiler ([#8014](https://github.com/PyTorchLightning/pytorch-lightning/pull/8014))
+
+
 ### Changed
 
 
diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet
index 13f70deed4..e4b3db9cac 100644
--- a/dockers/tpu-tests/tpu_test_cases.jsonnet
+++ b/dockers/tpu-tests/tpu_test_cases.jsonnet
@@ -22,6 +22,7 @@ local tputests = base.BaseTest {
     |||
       cd pytorch-lightning
       coverage run --source=pytorch_lightning -m pytest -v --capture=no \
+          tests/profiler/test_xla_profiler.py \
           pytorch_lightning/utilities/xla_device.py \
           tests/accelerators/test_tpu_backend.py \
           tests/models/test_tpu.py
diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 90fe55e0c1..a21b3f173c 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -198,6 +198,7 @@ from pytorch_lightning.profiler.advanced import AdvancedProfiler
 from pytorch_lightning.profiler.base import AbstractProfiler, BaseProfiler, PassThroughProfiler
 from pytorch_lightning.profiler.pytorch import PyTorchProfiler
 from pytorch_lightning.profiler.simple import SimpleProfiler
+from pytorch_lightning.profiler.xla import XLAProfiler
 
 __all__ = [
     'AbstractProfiler',
@@ -206,4 +207,5 @@ __all__ = [
     'PassThroughProfiler',
     'PyTorchProfiler',
     'SimpleProfiler',
+    'XLAProfiler',
 ]
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 3f534ce0bb..fb29ec5289 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -9,6 +9,7 @@ from pytorch_lightning.profiler.advanced import AdvancedProfiler  # noqa E402
 from pytorch_lightning.profiler.base import AbstractProfiler, BaseProfiler, PassThroughProfiler  # noqa E402
 from pytorch_lightning.profiler.pytorch import PyTorchProfiler  # noqa E402
 from pytorch_lightning.profiler.simple import SimpleProfiler  # noqa E402
+from pytorch_lightning.profiler.xla import XLAProfiler  # noqa E402
 
 __all__ = [
     'AbstractProfiler',
@@ -17,4 +18,5 @@ __all__ = [
     'PassThroughProfiler',
     'PyTorchProfiler',
     'SimpleProfiler',
+    'XLAProfiler',
 ]
diff --git a/pytorch_lightning/profiler/xla.py b/pytorch_lightning/profiler/xla.py
new file mode 100644
index 0000000000..35b8e7f264
--- /dev/null
+++ b/pytorch_lightning/profiler/xla.py
@@ -0,0 +1,110 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The XLA Profiler helps you debug and optimize the training workload performance
+of your models using Cloud TPU performance tools.
+
+Manual capture via TensorBoard
+
+The following instructions explain how to capture a trace from a running program:
+
+0. Follow this [guide](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm#tpu-vm) to
+set up the Cloud TPU environment with the required installations.
+
+1. Start a TensorBoard server:
+
+>> tensorboard --logdir ./tensorboard --port 9001
+
+You can view the TensorBoard output at http://localhost:9001 on your local machine, and then open the
+``PROFILE`` plugin from the top-right dropdown, or open http://localhost:9001/#profile directly.
+
+2. Once the code you'd like to profile is running, click the ``CAPTURE PROFILE`` button. Enter
+``localhost:9012`` (the default port of the XLA Profiler) as the Profile Service URL, enter the
+profiling duration in milliseconds, and click ``CAPTURE``.
+
+3. Make sure the code is still running while you are capturing the trace. You will also get better
+performance insights if the profiling duration is longer than the step time.
+
+4. Once the capture finishes, the page refreshes and you can browse through the insights using the
+``Tools`` dropdown at the top left.
+
+"""
+import logging
+from typing import Dict
+
+from pytorch_lightning.profiler.base import BaseProfiler
+from pytorch_lightning.utilities import _TPU_AVAILABLE
+
+if _TPU_AVAILABLE:
+    import torch_xla.debug.profiler as xp
+
+log = logging.getLogger(__name__)
+
+
+class XLAProfiler(BaseProfiler):
+
+    STEP_FUNCTIONS = {
+        "training_step_and_backward",
+        "validation_step",
+        "test_step",
+        "predict_step",
+    }
+    RECORD_FUNCTIONS = {
+        "training_step_and_backward",
+        "training_step",
+        "backward",
+        "validation_step",
+        "test_step",
+        "predict_step",
+    }
+
+    def __init__(self, port: int = 9012) -> None:
+        """
+        The XLA Profiler helps you debug and optimize the training workload performance
+        of your models using Cloud TPU performance tools.
+        """
+        super().__init__(dirpath=None, filename=None, output_filename=None)
+        self.port = port
+        self._recording_map: Dict = {}
+        self._step_recording_map: Dict = {}
+        self._start_trace: bool = False
+
+    def start(self, action_name: str) -> None:
+        if action_name in self.RECORD_FUNCTIONS:
+            if not self._start_trace:
+                self.server = xp.start_server(self.port)
+                self._start_trace = True
+
+            if action_name in self.STEP_FUNCTIONS:
+                step = self._get_step_num(action_name)
+                recording = xp.StepTrace(action_name, step_num=step)
+            else:
+                recording = xp.Trace(action_name)
+            recording.__enter__()
+            self._recording_map[action_name] = recording
+
+    def stop(self, action_name: str) -> None:
+        if action_name in self._recording_map:
+            self._recording_map[action_name].__exit__(None, None, None)
+            del self._recording_map[action_name]
+
+    def _get_step_num(self, action_name: str) -> int:
+        if action_name not in self._step_recording_map:
+            self._step_recording_map[action_name] = 1
+        else:
+            self._step_recording_map[action_name] += 1
+        return self._step_recording_map[action_name]
+
+    def summary(self) -> str:
+        return ""
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 258f4dc1c1..e854688a04 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -39,6 +39,7 @@ from pytorch_lightning.profiler import (
     PassThroughProfiler,
     PyTorchProfiler,
     SimpleProfiler,
+    XLAProfiler,
 )
 from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
 from pytorch_lightning.trainer.configuration_validator import ConfigValidator
@@ -1183,6 +1184,7 @@ class Trainer(
                 "simple": SimpleProfiler,
                 "advanced": AdvancedProfiler,
                 "pytorch": PyTorchProfiler,
+                "xla": XLAProfiler,
             }
             profiler = profiler.lower()
             if profiler not in PROFILERS:
diff --git a/tests/profiler/__init__.py b/tests/profiler/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/test_profiler.py b/tests/profiler/test_profiler.py
similarity index 100%
rename from tests/test_profiler.py
rename to tests/profiler/test_profiler.py
diff --git a/tests/profiler/test_xla_profiler.py b/tests/profiler/test_xla_profiler.py
new file mode 100644
index 0000000000..35279ddee8
--- /dev/null
+++ b/tests/profiler/test_xla_profiler.py
@@ -0,0 +1,75 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os
+from multiprocessing import Event, Process
+
+import pytest
+
+from pytorch_lightning import Trainer
+from pytorch_lightning.profiler import XLAProfiler
+from pytorch_lightning.utilities import _TPU_AVAILABLE
+from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
+
+if _TPU_AVAILABLE:
+    import torch_xla.debug.profiler as xp
+    import torch_xla.utils.utils as xu
+
+
+@RunIf(tpu=True)
+def test_xla_profiler_instance(tmpdir):
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        profiler="xla",
+        tpu_cores=8,
+    )
+
+    assert isinstance(trainer.profiler, XLAProfiler)
+    trainer.fit(model)
+    assert trainer.state.finished, f"Training failed with {trainer.state}"
+
+
+@pytest.mark.skipif(True, reason="XLA Profiler doesn't support programmatic capture yet")
+def test_xla_profiler_prog_capture(tmpdir):
+
+    port = xu.get_free_tcp_ports()[0]
+    training_started = Event()
+
+    def train_worker():
+        model = BoringModel()
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            max_epochs=4,
+            profiler=XLAProfiler(port=port),
+            tpu_cores=8,
+        )
+
+        training_started.set()  # unblock the parent process once the worker is about to start fitting
+        trainer.fit(model)
+
+    p = Process(target=train_worker, daemon=True)
+    p.start()
+    training_started.wait(120)
+
+    logdir = str(tmpdir)
+    xp.trace(f'localhost:{port}', logdir, duration_ms=2000, num_tracing_attempts=5, delay_ms=1000)
+
+    p.terminate()
+
+    # the capture should have produced at least one .xplane.pb file under the profile plugin directory
+    assert glob.glob(os.path.join(logdir, 'plugins', 'profile', '*', '*.xplane.pb'))
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index f76ac419ea..95311fb2df 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -69,7 +69,7 @@ for i in "${!files_arr[@]}"; do
 done
 
 if nvcc --version; then
-  nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+  nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/profiler/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
fi

# needs to run outside of `pytest`
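For reference, a minimal usage sketch of the profiler introduced above: the "xla" string maps to XLAProfiler through the Trainer's PROFILERS registry, port 9012 is the documented default, and the tpu_cores value is illustrative.

    from pytorch_lightning import Trainer
    from pytorch_lightning.profiler import XLAProfiler

    # select the profiler by name (uses the default port 9012)
    trainer = Trainer(profiler="xla", tpu_cores=8)

    # or instantiate it directly to choose the port that the TensorBoard PROFILE plugin connects to
    trainer = Trainer(profiler=XLAProfiler(port=9012), tpu_cores=8)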