2020-09-04 10:02:16 +00:00
|
|
|
|
# Copyright The PyTorch Lightning team.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
2020-08-27 17:50:32 +00:00
|
|
|
|
"""
|
|
|
|
|
GPU Stats Monitor
|
2020-09-04 10:02:16 +00:00
|
|
|
|
=================
|
2020-08-27 17:50:32 +00:00
|
|
|
|
|
|
|
|
|
Monitors and logs GPU stats during training.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
import subprocess
|
|
|
|
|
import time
|
2021-07-19 11:42:43 +00:00
|
|
|
|
from typing import Any, Dict, List, Optional, Tuple
|
2020-08-27 17:50:32 +00:00
|
|
|
|
|
2021-07-19 11:42:43 +00:00
|
|
|
|
import torch
|
|
|
|
|
|
|
|
|
|
import pytorch_lightning as pl
|
2020-08-27 17:50:32 +00:00
|
|
|
|
from pytorch_lightning.callbacks.base import Callback
|
2021-10-14 15:52:45 +00:00
|
|
|
|
from pytorch_lightning.utilities import DeviceType, rank_zero_deprecation, rank_zero_only
|
2020-08-27 17:50:32 +00:00
|
|
|
|
from pytorch_lightning.utilities.exceptions import MisconfigurationException
|
|
|
|
|
from pytorch_lightning.utilities.parsing import AttributeDict
|
2021-07-19 11:42:43 +00:00
|
|
|
|
from pytorch_lightning.utilities.types import STEP_OUTPUT
|
2020-08-27 17:50:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GPUStatsMonitor(Callback):
    r"""
    .. deprecated:: v1.5
        The `GPUStatsMonitor` callback was deprecated in v1.5 and will be removed in v1.7.
        Please use the `DeviceStatsMonitor` callback instead.

    Automatically monitors and logs GPU stats during training stage. ``GPUStatsMonitor``
    is a callback and in order to use it you need to assign a logger in the ``Trainer``.

    Args:
        memory_utilization: Set to ``True`` to monitor used, free and percentage of memory
            utilization at the start and end of each step. Default: ``True``.
        gpu_utilization: Set to ``True`` to monitor percentage of GPU utilization
            at the start and end of each step. Default: ``True``.
        intra_step_time: Set to ``True`` to monitor the time of each step. Default: ``False``.
        inter_step_time: Set to ``True`` to monitor the time between the end of one step
            and the start of the next step. Default: ``False``.
        fan_speed: Set to ``True`` to monitor percentage of fan speed. Default: ``False``.
        temperature: Set to ``True`` to monitor the memory and gpu temperature in degree Celsius.
            Default: ``False``.

    Raises:
        MisconfigurationException:
            If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger.

    Example::

        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import GPUStatsMonitor
        >>> gpu_stats = GPUStatsMonitor() # doctest: +SKIP
        >>> trainer = Trainer(callbacks=[gpu_stats]) # doctest: +SKIP

    GPU stats are mainly based on `nvidia-smi --query-gpu` command. The description of the queries is as follows:

    - **fan.speed** – The fan speed value is the percent of maximum speed that the device's fan is currently
      intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
      If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
      Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
    - **memory.used** – Total memory allocated by active contexts.
    - **memory.free** – Total free memory.
    - **utilization.gpu** – Percent of time over the past sample period during which one or more kernels was
      executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
    - **utilization.memory** – Percent of time over the past sample period during which global (device) memory was
      being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
    - **temperature.gpu** – Core GPU temperature, in degrees C.
    - **temperature.memory** – HBM memory temperature, in degrees C.

    """

    def __init__(
        self,
        memory_utilization: bool = True,
        gpu_utilization: bool = True,
        intra_step_time: bool = False,
        inter_step_time: bool = False,
        fan_speed: bool = False,
        temperature: bool = False,
    ):
        super().__init__()

        rank_zero_deprecation(
            "The `GPUStatsMonitor` callback was deprecated in v1.5 and will be removed in v1.7."
            " Please use the `DeviceStatsMonitor` callback instead."
        )

        # Fail early: every stat is read by shelling out to `nvidia-smi`.
        if shutil.which("nvidia-smi") is None:
            raise MisconfigurationException(
                "Cannot use GPUStatsMonitor callback because NVIDIA driver is not installed."
            )

        # Flags selecting which stats get queried/logged; read via attribute access below.
        self._log_stats = AttributeDict(
            {
                "memory_utilization": memory_utilization,
                "gpu_utilization": gpu_utilization,
                "intra_step_time": intra_step_time,
                "inter_step_time": inter_step_time,
                "fan_speed": fan_speed,
                "temperature": temperature,
            }
        )

        # The logical device IDs for selected devices
        self._device_ids: List[int] = []  # will be assigned later in setup()

        # The unmasked real GPU IDs
        self._gpu_ids: List[str] = []  # will be assigned later in setup()

        # Timestamps used for the step-time metrics. They are reset at every epoch start,
        # but are created here as well so the batch hooks never hit an AttributeError if
        # they run before `on_train_epoch_start`.
        self._snap_intra_step_time: Optional[float] = None
        self._snap_inter_step_time: Optional[float] = None

    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
        """Validate the trainer configuration and resolve the GPU IDs to monitor.

        Raises:
            MisconfigurationException:
                If the trainer has no logger or is not running on GPU.
        """
        if not trainer.logger:
            raise MisconfigurationException("Cannot use GPUStatsMonitor callback with Trainer that has no logger.")

        if trainer._device_type != DeviceType.GPU:
            raise MisconfigurationException(
                "You are using GPUStatsMonitor but are not running on GPU"
                f" since gpus attribute in Trainer is set to {trainer.gpus}."
            )

        # The logical device IDs for selected devices
        # ignoring mypy check because `trainer.data_parallel_device_ids` is None when using CPU
        self._device_ids = sorted(set(trainer.data_parallel_device_ids))  # type: ignore

        # The unmasked real GPU IDs
        self._gpu_ids = self._get_gpu_ids(self._device_ids)

    def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Reset both step-time snapshots at the beginning of each training epoch."""
        self._snap_intra_step_time = None
        self._snap_inter_step_time = None

    @rank_zero_only
    def on_train_batch_start(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        """Record the step start time and log the GPU stats (plus inter-step time, if enabled)."""
        if self._log_stats.intra_step_time:
            self._snap_intra_step_time = time.time()

        if not trainer.logger_connector.should_update_logs:
            return

        gpu_stat_keys = self._get_gpu_stat_keys()
        gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys])
        logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys)

        if self._log_stats.inter_step_time and self._snap_inter_step_time:
            # First log at beginning of second step
            logs["batch_time/inter_step (ms)"] = (time.time() - self._snap_inter_step_time) * 1000

        trainer.logger.log_metrics(logs, step=trainer.global_step)

    @rank_zero_only
    def on_train_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
    ) -> None:
        """Record the step end time and log GPU + device stats (plus intra-step time, if enabled)."""
        if self._log_stats.inter_step_time:
            self._snap_inter_step_time = time.time()

        if not trainer.logger_connector.should_update_logs:
            return

        # Fan speed / temperature are only queried here, at the end of a step.
        gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys()
        gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys])
        logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys)

        if self._log_stats.intra_step_time and self._snap_intra_step_time:
            logs["batch_time/intra_step (ms)"] = (time.time() - self._snap_intra_step_time) * 1000

        trainer.logger.log_metrics(logs, step=trainer.global_step)

    @staticmethod
    def _get_gpu_ids(device_ids: List[int]) -> List[str]:
        """Get the unmasked real GPU IDs.

        Each logical device ID is used as an index into the comma-separated
        ``CUDA_VISIBLE_DEVICES`` list to recover the ID that is passed to ``nvidia-smi``.
        """
        # All devices if `CUDA_VISIBLE_DEVICES` unset
        default = ",".join(str(i) for i in range(torch.cuda.device_count()))
        cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",")
        return [cuda_visible_devices[device_id].strip() for device_id in device_ids]

    def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]:
        """Run nvidia-smi to get the gpu stats.

        Args:
            queries: The ``--query-gpu`` field names to request.

        Returns:
            One list of floats per output line of ``nvidia-smi``, in the order of ``queries``.
            Values that cannot be parsed as a float are reported as ``0.0``.
            An empty list is returned when ``queries`` is empty.

        Raises:
            subprocess.CalledProcessError: If ``nvidia-smi`` exits with a non-zero status.
        """
        if not queries:
            return []

        gpu_query = ",".join(queries)
        output_format = "csv,nounits,noheader"
        gpu_ids = ",".join(self._gpu_ids)
        result = subprocess.run(
            [
                # it's ok to suppress the warning here since we ensure nvidia-smi exists during init
                shutil.which("nvidia-smi"),  # type: ignore
                f"--query-gpu={gpu_query}",
                f"--format={output_format}",
                f"--id={gpu_ids}",
            ],
            encoding="utf-8",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
            check=True,
        )

        def _to_float(x: str) -> float:
            # Best effort: a field that is not a number is logged as 0.0.
            try:
                return float(x)
            except ValueError:
                return 0.0

        # `encoding=` puts the pipe in text mode, so line endings are already normalised to
        # "\n"; splitting on `os.linesep` would therefore not split anything on Windows.
        # `float()` ignores surrounding whitespace, so splitting on "," handles ", " as well.
        stats = [
            [_to_float(x) for x in line.split(",")] for line in result.stdout.splitlines() if line.strip()
        ]
        return stats

    @staticmethod
    def _parse_gpu_stats(
        device_ids: List[int], stats: List[List[float]], keys: List[Tuple[str, str]]
    ) -> Dict[str, float]:
        """Parse the gpu stats into a loggable dict.

        ``stats[i][j]`` is taken as the value of ``keys[j]`` for ``device_ids[i]``; the
        resulting metric name is ``"device_id: <id>/<key> (<unit>)"``.
        """
        logs = {}
        for i, device_id in enumerate(device_ids):
            for j, (x, unit) in enumerate(keys):
                logs[f"device_id: {device_id}/{x} ({unit})"] = stats[i][j]
        return logs

    def _get_gpu_stat_keys(self) -> List[Tuple[str, str]]:
        """Get the GPU stats keys as ``(query name, unit)`` pairs, according to the enabled flags."""
        stat_keys = []

        if self._log_stats.gpu_utilization:
            stat_keys.append(("utilization.gpu", "%"))

        if self._log_stats.memory_utilization:
            stat_keys.extend([("memory.used", "MB"), ("memory.free", "MB"), ("utilization.memory", "%")])

        return stat_keys

    def _get_gpu_device_stat_keys(self) -> List[Tuple[str, str]]:
        """Get the device stats keys as ``(query name, unit)`` pairs, according to the enabled flags."""
        stat_keys = []

        if self._log_stats.fan_speed:
            stat_keys.append(("fan.speed", "%"))

        if self._log_stats.temperature:
            stat_keys.extend([("temperature.gpu", "°C"), ("temperature.memory", "°C")])

        return stat_keys
|