Allow deactivating GPU memory logging in Trainer (#190)

* Allow deactivating GPU memory logging in Trainer

Adds a `log_gpu_memory` flag to `Trainer` so that logging of GPU memory
utilization can be turned off. On some servers, logging GPU memory usage
can significantly slow down training. A usage sketch follows the commit
metadata below.

* Update Logging.md

* Update trainer.py
Max Horn authored 2019-09-04 16:43:46 +02:00, committed by William Falcon
parent 0872c32151
commit dac41030d4
2 changed files with 14 additions and 1 deletion
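
For context, this is roughly how the new flag is used when constructing a `Trainer`. The import path and the `gpus` setting are assumptions about the library at the time, not part of this diff:

```python
from pytorch_lightning import Trainer  # import path is an assumption; it may differ by version

# Default in this commit: skip GPU memory logging entirely.
trainer = Trainer(gpus=1, log_gpu_memory=False)

# Opt in explicitly to have per-GPU memory usage added to the logged metrics.
# Each logged row then queries the GPUs (via nvidia-smi), which is the call
# that can noticeably slow training on some servers.
trainer = Trainer(gpus=1, log_gpu_memory=True)
```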

Logging.md

@@ -14,6 +14,14 @@ Every k batches lightning will make an entry in the metrics log
``` {.python}
# DEFAULT (ie: save a .csv log file every 10 batches)
trainer = Trainer(add_log_row_interval=10)
```
---
+#### Log GPU memory
+Logs GPU memory when metrics are logged.
+``` {.python}
+# DEFAULT
+trainer = Trainer(log_gpu_memory=False)
+```
+---

trainer.py

@@ -63,6 +63,7 @@ class Trainer(TrainerIO):
current_gpu_name=0,
nb_gpu_nodes=1,
gpus=None,
+log_gpu_memory=False,
show_progress_bar=True,
overfit_pct=0.0,
track_grad_norm=-1,
@@ -94,6 +95,9 @@ class Trainer(TrainerIO):
:param current_gpu_name:
:param nb_gpu_nodes:
:param gpus:
+:param log_gpu_memory: Log GPU memory utilization as a metric
+    during training. This can lead to lower performance on some
+    servers, in particular when `nvidia-smi` is slow.
:param show_progress_bar:
:param overfit_pct:
:param track_grad_norm:
@@ -118,6 +122,7 @@ class Trainer(TrainerIO):
"""
# Transfer params
self.nb_gpu_nodes = nb_gpu_nodes
+self.log_gpu_memory = log_gpu_memory
self.gradient_clip = gradient_clip
self.check_val_every_n_epoch = check_val_every_n_epoch
self.enable_early_stop = early_stop_callback is not None
@@ -934,7 +939,7 @@ class Trainer(TrainerIO):
metrics = self.__tng_tqdm_dic
# add gpu memory
-if self.on_gpu:
+if self.on_gpu and self.log_gpu_memory:
mem_map = get_gpu_memory_map()
metrics.update(mem_map)
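
The `get_gpu_memory_map()` call above lives elsewhere in the repository; the sketch below shows roughly what such a helper does and why it can be slow. The exact nvidia-smi query, key names, and units here are assumptions, not taken from this diff.

```python
import subprocess


def get_gpu_memory_map():
    """Return per-GPU memory usage as a dict, e.g. {'gpu_0': 1234} (MiB).

    Sketch only: the real helper in the repository may use a different
    query, key format, or unit.
    """
    # One nvidia-smi subprocess call per invocation. On some servers this can
    # take a long time, which is why running it on every metrics row can
    # noticeably slow down training; the new flag lets users skip it.
    result = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'],
        encoding='utf-8',
    )
    used = [int(x) for x in result.strip().split('\n')]
    return {'gpu_{}'.format(i): mem for i, mem in enumerate(used)}
```

With `log_gpu_memory=True` on a GPU run, the returned dictionary is merged into the metrics via `metrics.update(mem_map)`, so the per-GPU values appear alongside the other logged metrics.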