Merge pull request #8174 from PyTorchLightning/bugfix/8159_log_gpu_memory_on_step
[bugfix] Resolve GPU memory not being logged when other metrics are missing
This commit is contained in:
parent 2a372e3682
commit c4492ad6aa
@@ -317,6 +317,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174))
## [1.3.7] - 2021-06-22
- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))
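For context on the [#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174) entry above: the flag involved is `Trainer(log_gpu_memory=...)`, and the bug was that its per-step memory stats were dropped whenever the `LightningModule` logged nothing else. Below is a minimal sketch of the fixed behaviour, assuming the 1.3.x API and a machine with at least one GPU; the `NoLogModel` class and the toy dataloader are illustrative and not part of the PR.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningModule, Trainer


class NoLogModel(LightningModule):
    """Toy model whose training_step never calls self.log()."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        # No self.log(...) here: before this fix, the step's metrics dict was
        # empty, and the GPU memory stats were dropped along with it.
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


train_data = DataLoader(TensorDataset(torch.randn(8, 32)), batch_size=4)
trainer = Trainer(gpus=1, max_epochs=1, log_gpu_memory='all', log_every_n_steps=1)
trainer.fit(NoLogModel(), train_data)

# With the fix, keys such as 'gpu_id: 0/memory.used (MB)' now appear in
# trainer.logged_metrics even though the model itself logged nothing.
print(trainer.logged_metrics)
```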
@@ -38,6 +38,7 @@ class LoggerConnector:
self._progress_bar_metrics: Dict[str, float] = {}
self._logged_metrics: Dict[str, _METRIC] = {}
self._callback_metrics: Dict[str, _METRIC] = {}
self._gpus_metrics: Dict[str, str] = {}
self._epoch_end_reached = False
self._current_fx: Optional[str] = None
self._batch_idx: Optional[int] = None
@@ -94,11 +95,6 @@ class LoggerConnector:
if self.trainer.logger is None or not metrics:
return
# add gpu memory
if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
mem_map = memory.get_memory_profile(self.log_gpu_memory)
metrics.update(mem_map)
# turn all tensors to scalars
scalar_metrics = metrics_to_scalars(metrics)
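The `# add gpu memory` block removed in this hunk is where the bug lived: GPU memory was appended only inside `log_metrics`, after the early `return` that fires when nothing else has been logged, so on steps without user metrics the stats never reached the logger. A tiny standalone illustration of that short-circuit follows; the function and variable names are hypothetical and only mimic the removed control flow.

```python
def old_style_log_metrics(metrics, gpu_stats):
    """Mimics the removed flow: GPU stats were merged in only after the early return."""
    if not metrics:
        # The user logged nothing, so we bail out, exactly like the removed code;
        # gpu_stats never make it past this point.
        return {}
    metrics.update(gpu_stats)
    return metrics


# With no user metrics, the GPU memory profile is silently dropped:
assert old_style_log_metrics({}, {'gpu_id: 0/memory.used (MB)': 1874.0}) == {}
```

The fix moves this responsibility into the new `_log_gpus_metrics()` call (next hunk), which logs the same stats through `LightningModule.log(...)` so they enter the result collection regardless of what else the step logged.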
@@ -213,6 +209,8 @@ class LoggerConnector:
if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
return
self._log_gpus_metrics()
# when metrics should be logged
assert not self._epoch_end_reached
if self.should_update_logs or self.trainer.fast_dev_run:
@@ -226,6 +224,12 @@ class LoggerConnector:
# reset result collection for next epoch
self.trainer._results.reset(metrics=True)
def _log_gpus_metrics(self):
for key, mem in self.gpus_metrics.items():
gpu_id = int(key.split('/')[0].split(':')[1])
if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False)
"""
Utilities and properties
"""
@@ -276,6 +280,13 @@ class LoggerConnector:
on_step = not self._epoch_end_reached
return self.trainer._results.metrics(on_step)
@property
def gpus_metrics(self) -> Dict[str, str]:
if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
mem_map = memory.get_memory_profile(self.log_gpu_memory)
self._gpus_metrics.update(mem_map)
return self._gpus_metrics
@property
def callback_metrics(self) -> Dict[str, _METRIC]:
if self.trainer._results:
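For reference, the `gpus_metrics` property added above accumulates the output of `memory.get_memory_profile(self.log_gpu_memory)` into `self._gpus_metrics` on each access. With `log_gpu_memory='all'` on a two-GPU machine it would hold something along these lines (the values are made up; only the key format is taken from the test below):

```python
# Hypothetical snapshot of LoggerConnector._gpus_metrics (illustrative values):
gpus_metrics = {
    'gpu_id: 0/memory.used (MB)': 1874.0,
    'gpu_id: 1/memory.used (MB)': 1902.0,
}
```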
@@ -712,3 +712,21 @@ def test_logging_raises(tmpdir):
model = TestModel()
with pytest.raises(MisconfigurationException, match=r'reduce_fx={min,max,mean,sum}\)` are currently supported'):
trainer.fit(model)
@RunIf(min_gpus=2)
def test_log_gpu_memory_without_logging_on_step(tmpdir):
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
limit_train_batches=1,
limit_val_batches=0,
log_gpu_memory='all',
log_every_n_steps=1,
gpus=[1]
)
trainer.fit(model)
assert 'gpu_id: 1/memory.used (MB)' in trainer.logged_metrics
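A note on the test's design: requesting `gpus=[1]` and asserting on the `gpu_id: 1` key exercises the `parallel_device_ids` filter in `_log_gpus_metrics`, i.e. only the device actually used by the run has its memory logged on step. A hypothetical extra assertion along the same lines (not part of the PR) would be:

```python
# GPU 0 is not part of this run (gpus=[1]), so its memory stats
# should not have been logged on_step:
assert 'gpu_id: 0/memory.used (MB)' not in trainer.logged_metrics
```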