From f6a3d8fd8da36a332d7bcf43a3967f3c5f10dbb4 Mon Sep 17 00:00:00 2001
From: Gerardo Roa Dabike
Date: Wed, 12 Aug 2020 20:09:34 +0100
Subject: [PATCH] GPU Usage Logger (#2932)

* GPU utilisation Callback

* GPU utilisation Callback

* Fixing style

* Fixing style

* Fixing CodeFactor: partial executable path

* Fix a misspelling in the Class name
---
 pytorch_lightning/callbacks/__init__.py       |   2 +
 .../callbacks/gpu_usage_logger.py             | 141 ++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 pytorch_lightning/callbacks/gpu_usage_logger.py

diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py
index 92920ee9b7..db32bc365c 100644
--- a/pytorch_lightning/callbacks/__init__.py
+++ b/pytorch_lightning/callbacks/__init__.py
@@ -4,6 +4,7 @@ from pytorch_lightning.callbacks.gradient_accumulation_scheduler import Gradient
 from pytorch_lightning.callbacks.lr_logger import LearningRateLogger
 from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.callbacks.progress import ProgressBarBase, ProgressBar
+from pytorch_lightning.callbacks.gpu_usage_logger import GpuUsageLogger
 
 __all__ = [
     'Callback',
@@ -13,4 +14,5 @@ __all__ = [
     'LearningRateLogger',
     'ProgressBarBase',
     'ProgressBar',
+    'GpuUsageLogger'
 ]
diff --git a/pytorch_lightning/callbacks/gpu_usage_logger.py b/pytorch_lightning/callbacks/gpu_usage_logger.py
new file mode 100644
index 0000000000..426fff444d
--- /dev/null
+++ b/pytorch_lightning/callbacks/gpu_usage_logger.py
@@ -0,0 +1,141 @@
+"""
+
+GPU Usage Logger
+====================
+
+Log GPU memory and GPU usage during training.
+
+"""
+
+import shutil
+import subprocess
+import time
+
+from pytorch_lightning.callbacks.base import Callback
+
+
+class GpuUsageLogger(Callback):
+    r"""
+    Automatically logs GPU memory and GPU usage during the training stage.
+
+    Args:
+        memory_utilisation: Set to ``True`` to log used, free and percentage of memory
+            utilisation at the start and end of each step. Default: ``True``.
+            From ``nvidia-smi --help-query-gpu``:
+
+            memory.used = ```Total memory allocated by active contexts.```
+            memory.free = ```Total free memory.```
+        gpu_utilisation: Set to ``True`` to log the percentage of GPU utilisation
+            at the start and end of each step. Default: ``True``.
+        intra_step_time: Set to ``True`` to log the time of each step. Default: ``False``.
+        inter_step_time: Set to ``True`` to log the time between the end of one step
+            and the start of the next. Default: ``False``.
+        fan_speed: Set to ``True`` to log the percentage of fan speed. Default: ``False``.
+        temperature: Set to ``True`` to log the memory and GPU temperature in degrees C.
+            Default: ``False``.
+
+    Example::
+
+        >>> from pytorch_lightning import Trainer
+        >>> from pytorch_lightning.callbacks import GpuUsageLogger
+        >>> gpu_usage = GpuUsageLogger()
+        >>> trainer = Trainer(callbacks=[gpu_usage])
+
+    GPU usage is mainly based on the ``nvidia-smi --query-gpu`` command.
+    The descriptions of the queries used here, as they appear in
+    ``nvidia-smi --help-query-gpu``:
+
+    "fan.speed"
+        ```The fan speed value is the percent of maximum speed that the device's fan is currently
+        intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended
+        fan speed. If the fan is physically blocked and unable to spin, this output will not match
+        the actual fan speed. Many parts do not report fan speeds because they rely on cooling via
+        fans in the surrounding enclosure.```
+    "memory.used"
+        ```Total memory allocated by active contexts.```
+    "memory.free"
+        ```Total free memory.```
+    "utilization.gpu"
+        ```Percent of time over the past sample period during which one or more kernels was executing
+        on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.```
+    "utilization.memory"
+        ```Percent of time over the past sample period during which global (device) memory was being
+        read or written. The sample period may be between 1 second and 1/6 second depending on the
+        product.```
+    "temperature.gpu"
+        ```Core GPU temperature. in degrees C.```
+    "temperature.memory"
+        ```HBM memory temperature. in degrees C.```
+
+    """
+
+    def __init__(self, memory_utilisation: bool = True, gpu_utilisation: bool = True,
+                 intra_step_time: bool = False, inter_step_time: bool = False,
+                 fan_speed: bool = False, temperature: bool = False):
+        super().__init__()
+        self.memory_utilisation = memory_utilisation
+        self.gpu_utilisation = gpu_utilisation
+        self.intra_step_time = intra_step_time
+        self.inter_step_time = inter_step_time
+        self.fan_speed = fan_speed
+        self.temperature = temperature
+        self.snap_intra_step_time = None
+        self.snap_inter_step_time = None
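+
+    # Timing semantics (values in milliseconds, logged against ``trainer.global_step``):
+    # ``intra_step_time`` measures on_batch_start -> on_batch_end of the same batch,
+    # while ``inter_step_time`` measures on_batch_end -> the next on_batch_start.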
+
+    def on_batch_start(self, trainer, pl_module):
+        if self.gpu_utilisation:
+            self._log_gpu(trainer)
+        if self.memory_utilisation:
+            self._log_memory(trainer)
+
+        if self.inter_step_time:
+            # First logged at the beginning of the second step
+            if self.snap_inter_step_time:
+                trainer.logger.log_metrics({'Batch_Time/inter_step (ms)':
+                                            (time.time() - self.snap_inter_step_time) * 1000},
+                                           step=trainer.global_step)
+        if self.intra_step_time:
+            self.snap_intra_step_time = time.time()
+
+    def on_batch_end(self, trainer, pl_module):
+        if self.gpu_utilisation:
+            self._log_gpu(trainer)
+        if self.memory_utilisation:
+            self._log_memory(trainer)
+
+        if self.fan_speed:
+            trainer.logger.log_metrics(self._get_gpu_stat("fan.speed", "%"), step=trainer.global_step)
+        if self.temperature:
+            trainer.logger.log_metrics(self._get_gpu_stat("temperature.gpu", "degrees C"),
+                                       step=trainer.global_step)
+            trainer.logger.log_metrics(self._get_gpu_stat("temperature.memory", "degrees C"),
+                                       step=trainer.global_step)
+
+        if self.inter_step_time:
+            self.snap_inter_step_time = time.time()
+
+        if self.intra_step_time:
+            if self.snap_intra_step_time:
+                trainer.logger.log_metrics({'Batch_Time/intra_step (ms)':
+                                            (time.time() - self.snap_intra_step_time) * 1000},
+                                           step=trainer.global_step)
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        # Reset the snapshots so the first batch of every epoch starts clean.
+        self.snap_intra_step_time = None
+        self.snap_inter_step_time = None
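+
+    # ``nvidia-smi --query-gpu=<field> --format=csv,nounits,noheader`` prints one bare
+    # numeric value per line, one line per visible GPU, for example:
+    #
+    #   $ nvidia-smi --query-gpu=utilization.gpu --format=csv,nounits,noheader
+    #   37
+    #   82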
trainer.logger.log_metrics(self._get_gpu_stat("memory.free", "MB"), step=trainer.global_step) + trainer.logger.log_metrics(self._get_gpu_stat("utilization.memory", "%"), step=trainer.global_step) \ No newline at end of file