diff --git a/docs/Trainer/Logging.md b/docs/Trainer/Logging.md
index d6eb39d6d6..27d1e85d24 100644
--- a/docs/Trainer/Logging.md
+++ b/docs/Trainer/Logging.md
@@ -14,6 +14,14 @@ Every k batches lightning will make an entry in the metrics log
 ``` {.python}
 # DEFAULT (ie: save a .csv log file every 10 batches)
 trainer = Trainer(add_log_row_interval=10)
+```
+
+---
+#### Log GPU memory
+Logs GPU memory usage when metrics are logged.
+``` {.python}
+# DEFAULT
+trainer = Trainer(log_gpu_memory=False)
 ```
 
 ---
diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py
index 4a7ae48fb0..d84e80a710 100644
--- a/pytorch_lightning/models/trainer.py
+++ b/pytorch_lightning/models/trainer.py
@@ -63,6 +63,7 @@ class Trainer(TrainerIO):
                  current_gpu_name=0,
                  nb_gpu_nodes=1,
                  gpus=None,
+                 log_gpu_memory=False,
                  show_progress_bar=True,
                  overfit_pct=0.0,
                  track_grad_norm=-1,
@@ -94,6 +95,9 @@ class Trainer(TrainerIO):
         :param current_gpu_name:
         :param nb_gpu_nodes:
         :param gpus:
+        :param log_gpu_memory: Log GPU memory utilization as a metric
+            during training. This can lead to lower performance on some
+            servers, in particular when `nvidia-smi` is slow.
         :param show_progress_bar:
         :param overfit_pct:
         :param track_grad_norm:
@@ -118,6 +122,7 @@ class Trainer(TrainerIO):
         """
         # Transfer params
         self.nb_gpu_nodes = nb_gpu_nodes
+        self.log_gpu_memory = log_gpu_memory
         self.gradient_clip = gradient_clip
         self.check_val_every_n_epoch = check_val_every_n_epoch
         self.enable_early_stop = early_stop_callback is not None
@@ -934,7 +939,7 @@ class Trainer(TrainerIO):
         metrics = self.__tng_tqdm_dic
 
         # add gpu memory
-        if self.on_gpu:
+        if self.on_gpu and self.log_gpu_memory:
             mem_map = get_gpu_memory_map()
             metrics.update(mem_map)
 
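
A minimal usage sketch of the flag added above (not part of the diff): `CoolModel` is a hypothetical `LightningModule` subclass, and the import path follows the module location shown in the diff. With `log_gpu_memory=True` and at least one GPU, the readings returned by `get_gpu_memory_map()` are merged into each logged metrics row.

``` {.python}
# Sketch only: `CoolModel` stands in for any LightningModule subclass.
from pytorch_lightning.models.trainer import Trainer

model = CoolModel()

# Memory is logged only when training runs on GPU *and* the flag is enabled,
# per the `self.on_gpu and self.log_gpu_memory` check in the diff above.
trainer = Trainer(gpus=[0], log_gpu_memory=True)
trainer.fit(model)
```

Keeping the flag off by default avoids the extra `nvidia-smi` call on every logging step, which the docstring notes can slow training on some servers.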