GPU Usage Logger (#2932)
* GPU utilisation Callback * GPU utilisation Callback * Fixing style * Fixing style * Fixing CodeFactor: partial executable path * Fix a misspelling in the Class name
This commit is contained in:
parent
fcf3c40172
commit
f6a3d8fd8d
|
@ -4,6 +4,7 @@ from pytorch_lightning.callbacks.gradient_accumulation_scheduler import Gradient
|
|||
from pytorch_lightning.callbacks.lr_logger import LearningRateLogger
|
||||
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
|
||||
from pytorch_lightning.callbacks.progress import ProgressBarBase, ProgressBar
|
||||
from pytorch_lightning.callbacks.gpu_usage_logger import GpuUsageLogger
|
||||
|
||||
__all__ = [
|
||||
'Callback',
|
||||
|
@ -13,4 +14,5 @@ __all__ = [
|
|||
'LearningRateLogger',
|
||||
'ProgressBarBase',
|
||||
'ProgressBar',
|
||||
'GpuUsageLogger'
|
||||
]
|
||||
|
|
|
@ -0,0 +1,141 @@
|
|||
"""
|
||||
|
||||
GPU Usage Logger
|
||||
====================
|
||||
|
||||
Log GPU memory and GPU usage during training
|
||||
|
||||
"""
|
||||
|
||||
from pytorch_lightning.callbacks.base import Callback
|
||||
import subprocess
|
||||
import os
|
||||
import time
|
||||
|
||||
|
||||
class GpuUsageLogger(Callback):
    r"""
    Automatically logs GPU memory and GPU usage during training stage.

    Every metric is read by running ``nvidia-smi --query-gpu=<item>`` and is sent to
    ``trainer.logger.log_metrics`` with ``step=trainer.global_step``.

    Args:
        memory_utilisation: Set to ``True`` to log used, free and percentage of memory
            utilisation at starts and ends of each step. Default: ``True``.
            From nvidia-smi --help-query-gpu
            memory.used = ```Total memory allocated by active contexts.```
            memory.free = ```Total free memory.```
        gpu_utilisation: Set to ``True`` to log percentage of GPU utilisation
            at starts and ends of each step. Default: ``True``.
        intra_step_time: Set to ``True`` to log the time of each step. Default: ``False``
        inter_step_time: Set to ``True`` to log the time between the end of one step
            and the start of the next. Default: ``False``
        fan_speed: Set to ``True`` to log percentage of fan speed. Default: ``False``.
        temperature: Set to ``True`` to log the memory and gpu temperature in degrees C.
            Default: ``False``

    Example::

        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import GpuUsageLogger
        >>> gpu_usage = GpuUsageLogger()
        >>> trainer = Trainer(callbacks=[gpu_usage])

    Gpu usage is mainly based on nvidia-smi --query-gpu command.
    The description of the queries used here as appears
    in ``nvidia-smi --help-query-gpu``:

    "fan.speed"
    ```The fan speed value is the percent of maximum speed that the device's fan is currently
    intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended
    fan speed. If the fan is physically blocked and unable to spin, this output will not match
    the actual fan speed. Many parts do not report fan speeds because they rely on cooling via
    fans in the surrounding enclosure.```
    "memory.used"
    ```Total memory allocated by active contexts.```
    "memory.free"
    ```Total free memory.```
    "utilization.gpu"
    ```Percent of time over the past sample period during which one or more kernels was executing
    on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.```
    "utilization.memory"
    ```Percent of time over the past sample period during which global (device) memory was being
    read or written. The sample period may be between 1 second and 1/6 second depending on the
    product.```
    "temperature.gpu"
    ```Core GPU temperature. in degrees C.```
    "temperature.memory"
    ```HBM memory temperature. in degrees C.```

    """

    def __init__(self, memory_utilisation: bool = True, gpu_utilisation: bool = True,
                 intra_step_time: bool = False, inter_step_time: bool = False,
                 fan_speed: bool = False, temperature: bool = False):
        # Zero-argument super() binds to this instance; the one-argument form
        # ``super(GpuUsageLogger)`` creates an unbound proxy and would never
        # run the parent initialiser on ``self``.
        super().__init__()
        self.memory_utilisation = memory_utilisation
        self.gpu_utilisation = gpu_utilisation
        self.intra_step_time = intra_step_time
        self.inter_step_time = inter_step_time
        self.fan_speed = fan_speed
        self.temperature = temperature
        # Wall-clock timestamps (seconds, from time.time()); None until the
        # first measurement of the current epoch has been taken.
        self.snap_intra_step_time = None
        self.snap_inter_step_time = None

    def on_batch_start(self, trainer, pl_module):
        """Log the enabled GPU metrics and the inter-step time, then start the intra-step timer."""
        if self.gpu_utilisation:
            self._log_gpu(trainer)
        if self.memory_utilisation:
            self._log_memory(trainer)

        # First log at beginning of second step: the snapshot is only set
        # once a batch has ended.
        if self.inter_step_time and self.snap_inter_step_time is not None:
            elapsed_ms = (time.time() - self.snap_inter_step_time) * 1000
            trainer.logger.log_metrics({'Batch_Time/inter_step (ms)': elapsed_ms},
                                       step=trainer.global_step)
        if self.intra_step_time:
            self.snap_intra_step_time = time.time()

    def on_batch_end(self, trainer, pl_module):
        """Log the enabled GPU metrics, fan speed, temperatures and the intra-step time."""
        if self.gpu_utilisation:
            self._log_gpu(trainer)
        if self.memory_utilisation:
            self._log_memory(trainer)

        if self.fan_speed:
            trainer.logger.log_metrics(self._get_gpu_stat("fan.speed", "%"), step=trainer.global_step)
        if self.temperature:
            trainer.logger.log_metrics(self._get_gpu_stat("temperature.gpu", "degrees C"),
                                       step=trainer.global_step)
            trainer.logger.log_metrics(self._get_gpu_stat("temperature.memory", "degrees C"),
                                       step=trainer.global_step)

        # Start the inter-step timer; it is read at the next on_batch_start.
        if self.inter_step_time:
            self.snap_inter_step_time = time.time()

        if self.intra_step_time and self.snap_intra_step_time is not None:
            elapsed_ms = (time.time() - self.snap_intra_step_time) * 1000
            trainer.logger.log_metrics({'Batch_Time/intra_step (ms)': elapsed_ms},
                                       step=trainer.global_step)

    def on_train_epoch_start(self, trainer, pl_module):
        """Reset both timers so that no interval is measured across an epoch boundary."""
        self.snap_intra_step_time = None
        self.snap_inter_step_time = None

    @staticmethod
    def _get_gpu_stat(pitem: str, unit: str):
        """Query one ``nvidia-smi --query-gpu`` item and return it as a metrics dict.

        Args:
            pitem: Name of the property to query, e.g. ``"utilization.gpu"``.
            unit: Unit label appended to every metric name.

        Returns:
            A dict mapping ``"GPU_{pitem}/gpu_id_{index} ({unit})"`` to the value reported
            on the ``index``-th output line. If the output is empty or any line is not
            a number, a single entry for index 0 with value ``0`` is returned.

        Raises:
            subprocess.CalledProcessError: if ``nvidia-smi`` exits with a non-zero status.
            FileNotFoundError: if the ``nvidia-smi`` executable cannot be started.
        """
        import shutil  # local import: only needed to locate the executable

        # Resolve the absolute path through PATH; keep the historical location
        # as a fallback so an absolute executable path is always passed.
        nvidia_smi = shutil.which("nvidia-smi") or "/bin/nvidia-smi"
        result = subprocess.run([nvidia_smi, f"--query-gpu={pitem}", "--format=csv,nounits,noheader"],
                                encoding="utf-8", stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,  # for backward compatibility with python version 3.6
                                check=True)
        # The output is decoded in text mode, so line endings are already
        # normalised; splitlines() is correct on every platform, whereas
        # splitting on os.linesep fails where that is "\r\n".
        readings = result.stdout.strip().splitlines()
        try:
            gpu_usage = [float(reading) for reading in readings] or [0]
        except ValueError:
            # Non-numeric output: fall back to a single zero reading.
            gpu_usage = [0]

        return {f"GPU_{pitem}/gpu_id_{index} ({unit})": usage for index, usage in enumerate(gpu_usage)}

    def _log_gpu(self, trainer):
        """Log the GPU utilisation percentage of every GPU."""
        trainer.logger.log_metrics(self._get_gpu_stat("utilization.gpu", "%"), step=trainer.global_step)

    def _log_memory(self, trainer):
        """Log used memory, free memory and memory utilisation of every GPU."""
        trainer.logger.log_metrics(self._get_gpu_stat("memory.used", "MB"), step=trainer.global_step)
        trainer.logger.log_metrics(self._get_gpu_stat("memory.free", "MB"), step=trainer.global_step)
        trainer.logger.log_metrics(self._get_gpu_stat("utilization.memory", "%"), step=trainer.global_step)
|
Loading…
Reference in New Issue