[bugfix] Accumulated gradient and TensorBoard (#4738)
* resolve bug
* update
* update
* modify one test
* remove parameters
* update on comments
* update changelog
* update docstring

Co-authored-by: Nicki Skafte <skaftenicki@gmail.com>
parent d24a26748d · commit 204a0a2d03
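Editor's note (background, not part of the commit): with `accumulate_grad_batches > 1` the optimizer, and therefore `trainer.global_step`, advances only once per accumulation window, while `on_step=True` metrics are produced for every batch. Logging training-step metrics against `global_step` therefore stacks several batches on the same TensorBoard x-value; this fix logs them against `total_batch_idx` instead. A minimal sketch of the arithmetic, assuming one optimizer step per `accumulate_grad_batches` batches:

# Illustrative sketch only (not part of the commit): how global_step and
# total_batch_idx diverge under gradient accumulation.
accumulate_grad_batches = 2   # one optimizer step per 2 batches
num_batches = 6

global_step = 0
for total_batch_idx in range(num_batches):
    # `on_step=True` metrics are produced here, once per batch.
    # Logging them against `global_step` reuses the same x-value for every
    # batch inside one accumulation window; `total_batch_idx` is unique per
    # batch and keeps the TensorBoard curve monotonic.
    print(f"batch={total_batch_idx}  global_step={global_step}")

    # the optimizer (and global_step) only advances at the window boundary
    if (total_batch_idx + 1) % accumulate_grad_batches == 0:
        global_step += 1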
@@ -100,6 +100,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685))

+- LoggerConnector log_metrics will use `total_batch_idx` instead of `global_step` when logging on `training step` ([#4738](https://github.com/PyTorchLightning/pytorch-lightning/pull/4738))
+
 ### Fixed

@@ -95,14 +95,14 @@ class LoggerConnector:
         if self._current_stage is not None:
             self._cached_results[self._current_stage].cache_result()

-    def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
+    def on_trainer_init(self, logger, flush_logs_every_n_steps: int,
+                        log_every_n_steps: int, move_metrics_to_cpu: bool):
         # logging
         self.configure_logger(logger)
         # todo: IDE is complaining, these shall be initialized in the Trainer init at least as placeholders
         # and assign here the desired value
         self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
         self.trainer.log_every_n_steps = log_every_n_steps
         self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
         self.trainer.split_idx = None

@@ -181,7 +181,7 @@ class LoggerConnector:
         self.logged_metrics.update(logged_metrics_tmp)
         self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)

-    def log_metrics(self, metrics, grad_norm_dic, step=None):
+    def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
         """Logs the metric dict passed in.
         If `step` parameter is None and `step` key is present in metrics,
         uses metrics["step"] as a step
@@ -190,6 +190,8 @@ class LoggerConnector:
             metrics (dict): Metric values
             grad_norm_dic (dict): Gradient norms
             step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
+            log_train_step_metrics (bool): Used to track if `log_metrics` is being called during a training step.
+                For training-step logging, metrics are logged at step `total_batch_idx` (to account for accumulated gradients); `global_step` is used for the rest.
         """
         # add gpu memory
         if self.trainer.on_gpu and self.trainer.log_gpu_memory:
@@ -207,8 +209,11 @@ class LoggerConnector:

         elif step is None:
             # added metrics by Lightning for convenience
-            scalar_metrics['epoch'] = self.trainer.current_epoch
-            step = self.trainer.global_step
+            if log_train_step_metrics:
+                step = self.trainer.total_batch_idx
+            else:
+                scalar_metrics['epoch'] = self.trainer.current_epoch
+                step = self.trainer.global_step

         # log actual metrics
         if self.trainer.logger is not None:
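For readability, the step selection added above can be restated as a small standalone function (a sketch only, not the library code; `trainer` stands in for the real Trainer object, and the actual method additionally uses a `"step"` key found in the metrics dict when no explicit step is given):

from types import SimpleNamespace

def choose_logging_step(trainer, step=None, log_train_step_metrics=False):
    """Sketch of the selection implemented above."""
    if step is not None:                 # caller supplied an explicit step
        return step
    if log_train_step_metrics:           # training-step logging: use the batch counter,
        return trainer.total_batch_idx   # which keeps increasing during accumulation
    return trainer.global_step           # validation/test and epoch-end aggregates

# e.g. after 7 batches with accumulate_grad_batches=2:
trainer = SimpleNamespace(total_batch_idx=6, global_step=3)
assert choose_logging_step(trainer, log_train_step_metrics=True) == 6
assert choose_logging_step(trainer) == 3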
@@ -619,5 +624,5 @@ class LoggerConnector:
         metrics = self.cached_results.get_latest_batch_log_metrics()
         grad_norm_dic = batch_output.grad_norm_dic
         if len(metrics) > 0 or len(grad_norm_dic) > 0:
-            self.log_metrics(metrics, grad_norm_dic)
+            self.log_metrics(metrics, grad_norm_dic, log_train_step_metrics=True)
             self.callback_metrics.update(metrics)

@@ -21,48 +21,48 @@ from typing import Dict, Iterable, List, Optional, Union
 import torch
 from torch.utils.data import DataLoader

+from pytorch_lightning import _logger as log
+from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
+from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.core.memory import ModelSummary
-from pytorch_lightning.core.step_result import Result, EvalResult
+from pytorch_lightning.core.step_result import EvalResult, Result
 from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning.plugins.plugin_connector import PluginConnector
 from pytorch_lightning.profiler import BaseProfiler
 from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
 from pytorch_lightning.trainer.configuration_validator import ConfigValidator
+from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
+from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
+from pytorch_lightning.trainer.connectors.data_connector import DataConnector
+from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
 from pytorch_lightning.trainer.connectors.env_vars_connector import overwrite_by_env_vars
+from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
+from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
+from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
+from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
+from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
+from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
+from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
 from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
+from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
 from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
+from pytorch_lightning.trainer.properties import TrainerProperties
 from pytorch_lightning.trainer.states import TrainerState, trainer_state
+from pytorch_lightning.trainer.training_loop import TrainLoop
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
+from pytorch_lightning.tuner.tuning import Tuner
 from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
-from pytorch_lightning.trainer.training_loop import TrainLoop
-from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
-from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
-from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
-from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
-from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
-from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
-from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
-from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
-from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
-from pytorch_lightning import _logger as log
-from pytorch_lightning.tuner.tuning import Tuner
-from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
-from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
-from pytorch_lightning.trainer.connectors.data_connector import DataConnector
-from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.model_utils import is_overridden
-from pytorch_lightning.trainer.properties import TrainerProperties
-from pytorch_lightning.plugins.plugin_connector import PluginConnector
-from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
 from pytorch_lightning.utilities.memory import recursive_detach
 from pytorch_lightning.utilities.model_utils import is_overridden

 # warnings to ignore in trainer
 warnings.filterwarnings(
@@ -385,7 +385,7 @@ class Trainer(
             logger,
             flush_logs_every_n_steps,
             log_every_n_steps,
-            move_metrics_to_cpu
+            move_metrics_to_cpu,
         )

         # init debugging flags
@@ -21,13 +21,13 @@ from unittest.mock import ANY, call
 import pytest

 import tests.base.develop_utils as tutils
-from pytorch_lightning import Trainer, Callback
+from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.loggers import (
-    TensorBoardLogger,
+    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
+    TensorBoardLogger,
     TestTubeLogger,
-    CometLogger,
     WandbLogger,
 )
 from pytorch_lightning.loggers.base import DummyExperiment
@@ -124,7 +124,7 @@ def _test_loggers_fit_test(tmpdir, logger_class):
     if logger_class == TensorBoardLogger:
         expected = [
             (0, ['hp_metric']),
-            (0, ['epoch', 'train_some_val']),
+            (0, ['train_some_val']),
             (0, ['early_stop_on', 'epoch', 'val_acc']),
             (0, ['hp_metric']),
             (1, ['epoch', 'test_acc', 'test_loss'])
@@ -132,7 +132,7 @@ def _test_loggers_fit_test(tmpdir, logger_class):
         assert log_metric_names == expected
     else:
         expected = [
-            (0, ['epoch', 'train_some_val']),
+            (0, ['train_some_val']),
             (0, ['early_stop_on', 'epoch', 'val_acc']),
             (1, ['epoch', 'test_acc', 'test_loss'])
         ]

@@ -14,6 +14,7 @@
 import os
 from argparse import Namespace
 from distutils.version import LooseVersion
+from unittest import mock

 import pytest
 import torch

@@ -23,7 +24,7 @@ from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

 from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.loggers import TensorBoardLogger
-from tests.base import EvalModelTemplate, BoringModel
+from tests.base import BoringModel, EvalModelTemplate


 @pytest.mark.skipif(
@@ -201,3 +202,63 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir):
         ' attribute is not set or `input_array` was not given'
     ):
         logger.log_graph(model)
+
+
+@mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics')
+@pytest.mark.parametrize('expected', [
+    ([5, 11, 17]),
+])
+def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir):
+    """
+    Tests to ensure that TensorBoard logs properly when accumulated_gradients > 1
+    """
+    class TestModel(BoringModel):
+        _count = 0
+        _indexes = []
+
+        def training_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('count', self._count, on_step=True, on_epoch=True)
+            self.log('loss', loss, on_step=True, on_epoch=True)
+
+            if self.trainer.logger_connector.should_update_logs:
+                self._indexes.append(self._count)
+
+            self._count += 1
+            return loss
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('val_loss', loss, on_step=True, on_epoch=True)
+            return loss
+
+        def configure_optimizers(self):
+            optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001)
+            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
+            return [optimizer], [lr_scheduler]
+
+    model = TestModel()
+    model.training_epoch_end = None
+    model.validation_epoch_end = None
+
+    logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)
+
+    accumulate_grad_batches = 2
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=12,
+        limit_val_batches=12,
+        max_epochs=3,
+        gpus=0,
+        accumulate_grad_batches=accumulate_grad_batches,
+        logger=[logger_0],
+        log_every_n_steps=3,
+    )
+    trainer.fit(model)
+
+    mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]]
+    assert mock_count_epochs == expected
+    mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]]
+    assert model._indexes == mock_count_steps
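A short check of the expected values (my reading of the numbers, not stated in the commit): `limit_train_batches=12` with `accumulate_grad_batches=2` gives 6 optimizer steps per epoch, and the epoch-aggregated `count_epoch` metric is still logged against `global_step`, so it lands on the last (0-indexed) step of each of the 3 epochs:

# Sketch of where the epoch-end logging steps in the test above come from.
limit_train_batches = 12
accumulate_grad_batches = 2
max_epochs = 3

steps_per_epoch = limit_train_batches // accumulate_grad_batches          # 6
expected = [steps_per_epoch * (epoch + 1) - 1 for epoch in range(max_epochs)]
print(expected)   # [5, 11, 17]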