Allow deactivating GPU memory logging in Trainer (#190)

* Allow deactivating GPU memory logging in Trainer

Adds a `log_gpu_memory` flag to `Trainer` so that logging of GPU memory
utilization can be turned off. On some servers, logging GPU memory usage
can significantly slow down training. A usage sketch follows the commit
metadata below.

* Update Logging.md

* Update trainer.py
Max Horn authored 2019-09-04 16:43:46 +02:00, committed by William Falcon
parent 0872c32151
commit dac41030d4
2 changed files with 14 additions and 1 deletion
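
For context, this is roughly how the new flag is used when constructing a `Trainer`. The import path and the `gpus` setting are assumptions about the library at the time, not part of this diff:

```python
from pytorch_lightning import Trainer  # import path is an assumption; it may differ by version

# Default in this commit: skip GPU memory logging entirely.
trainer = Trainer(gpus=1, log_gpu_memory=False)

# Opt in explicitly to have per-GPU memory usage added to the logged metrics.
# Each logged row then queries the GPUs (via nvidia-smi), which is the call
# that can noticeably slow training on some servers.
trainer = Trainer(gpus=1, log_gpu_memory=True)
```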

Logging.md

@@ -14,6 +14,14 @@ Every k batches lightning will make an entry in the metrics log
``` {.python}
# DEFAULT (ie: save a .csv log file every 10 batches)
trainer = Trainer(add_log_row_interval=10)
```
---
+#### Log GPU memory
+Logs GPU memory when metrics are logged.
+``` {.python}
+# DEFAULT
+trainer = Trainer(log_gpu_memory=False)
+```
+---

trainer.py

@@ -63,6 +63,7 @@ class Trainer(TrainerIO):
current_gpu_name=0,
nb_gpu_nodes=1,
gpus=None,
+log_gpu_memory=False,
show_progress_bar=True,
overfit_pct=0.0,
track_grad_norm=-1,
@@ -94,6 +95,9 @@ class Trainer(TrainerIO):
:param current_gpu_name:
:param nb_gpu_nodes:
:param gpus:
+:param log_gpu_memory: Log GPU memory utilization as a metric
+    during training. This can lead to lower performance on some
+    servers, in particular when `nvidia-smi` is slow.
:param show_progress_bar:
:param overfit_pct:
:param track_grad_norm:
@@ -118,6 +122,7 @@ class Trainer(TrainerIO):
"""
# Transfer params
self.nb_gpu_nodes = nb_gpu_nodes
+self.log_gpu_memory = log_gpu_memory
self.gradient_clip = gradient_clip
self.check_val_every_n_epoch = check_val_every_n_epoch
self.enable_early_stop = early_stop_callback is not None
@@ -934,7 +939,7 @@ class Trainer(TrainerIO):
metrics = self.__tng_tqdm_dic
# add gpu memory
-if self.on_gpu:
+if self.on_gpu and self.log_gpu_memory:
mem_map = get_gpu_memory_map()
metrics.update(mem_map)
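
The `get_gpu_memory_map()` call above lives elsewhere in the repository; the sketch below shows roughly what such a helper does and why it can be slow. The exact nvidia-smi query, key names, and units here are assumptions, not taken from this diff.

```python
import subprocess


def get_gpu_memory_map():
    """Return per-GPU memory usage as a dict, e.g. {'gpu_0': 1234} (MiB).

    Sketch only: the real helper in the repository may use a different
    query, key format, or unit.
    """
    # One nvidia-smi subprocess call per invocation. On some servers this can
    # take a long time, which is why running it on every metrics row can
    # noticeably slow down training; the new flag lets users skip it.
    result = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'],
        encoding='utf-8',
    )
    used = [int(x) for x in result.strip().split('\n')]
    return {'gpu_{}'.format(i): mem for i, mem in enumerate(used)}
```

With `log_gpu_memory=True` on a GPU run, the returned dictionary is merged into the metrics via `metrics.update(mem_map)`, so the per-GPU values appear alongside the other logged metrics.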