[bugfix] Accumulated gradients and TensorBoard (#4738)

* resolve bug

* update

* update

* modify one test

* remove parameters

* update on comments

* update changelog

* update docstring

Co-authored-by: Nicki Skafte <skaftenicki@gmail.com>
chaton 2020-11-25 19:44:05 +00:00 committed by GitHub
parent d24a26748d
commit 204a0a2d03
5 changed files with 105 additions and 37 deletions
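
In practical terms, the fix targets runs that accumulate gradients: `global_step` only advances once per accumulation window, so training-step metrics emitted between optimizer steps all carried the same x-value and TensorBoard stacked them onto a single point. A hedged, illustrative configuration that hits this path (the values are made up, not taken from the repository):

    from pytorch_lightning import Trainer

    trainer = Trainer(
        accumulate_grad_batches=4,  # optimizer steps, and global_step, advance once every 4 batches
        log_every_n_steps=2,        # training-step logging can fire several times per global_step
    )
    # After this commit, training-step metrics are keyed to total_batch_idx instead,
    # so each logging call lands on its own TensorBoard step.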

View File

@@ -100,6 +100,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685))
- `LoggerConnector.log_metrics` will use `total_batch_idx` instead of `global_step` when logging during training steps ([#4738](https://github.com/PyTorchLightning/pytorch-lightning/pull/4738))
### Fixed

View File

@@ -95,14 +95,14 @@ class LoggerConnector:
if self._current_stage is not None:
self._cached_results[self._current_stage].cache_result()
def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
def on_trainer_init(self, logger, flush_logs_every_n_steps: int,
log_every_n_steps: int, move_metrics_to_cpu: bool):
# logging
self.configure_logger(logger)
# todo: IDE is complaining, these shall be initialized in the Trainer init at least as placeholders
# and assign here the desired value
# and assign here the desired value
self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
self.trainer.log_every_n_steps = log_every_n_steps
self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
self.trainer.split_idx = None
@@ -181,7 +181,7 @@ class LoggerConnector:
self.logged_metrics.update(logged_metrics_tmp)
self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)
def log_metrics(self, metrics, grad_norm_dic, step=None):
def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
"""Logs the metric dict passed in.
If `step` parameter is None and a `step` key is present in metrics,
uses metrics["step"] as a step
@@ -190,6 +190,8 @@
metrics (dict): Metric values
grad_norm_dic (dict): Gradient norms
step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
log_train_step_metrics (bool): Used to track whether the `log_metrics` function is being called during a training step.
In training steps, metrics are logged on step `total_batch_idx` (to support accumulated gradients) and on `global_step` for everything else.
"""
# add gpu memory
if self.trainer.on_gpu and self.trainer.log_gpu_memory:
@@ -207,8 +209,11 @@
elif step is None:
# added metrics by Lightning for convenience
scalar_metrics['epoch'] = self.trainer.current_epoch
step = self.trainer.global_step
if log_train_step_metrics:
step = self.trainer.total_batch_idx
else:
scalar_metrics['epoch'] = self.trainer.current_epoch
step = self.trainer.global_step
# log actual metrics
if self.trainer.logger is not None:
@@ -619,5 +624,5 @@ class LoggerConnector:
metrics = self.cached_results.get_latest_batch_log_metrics()
grad_norm_dic = batch_output.grad_norm_dic
if len(metrics) > 0 or len(grad_norm_dic) > 0:
self.log_metrics(metrics, grad_norm_dic)
self.log_metrics(metrics, grad_norm_dic, log_train_step_metrics=True)
self.callback_metrics.update(metrics)
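
To make the hunk above easier to read outside the diff, here is a minimal, self-contained sketch (not the library's implementation) of the step-selection rule that `log_metrics` now applies. It assumes a `trainer` object exposing `total_batch_idx`, `global_step` and `current_epoch`, as in the code above:

    def select_logging_step(trainer, metrics, step=None, log_train_step_metrics=False):
        # An explicit `step` argument, or a `step` key inside the metrics dict, wins.
        if step is not None:
            return step
        if 'step' in metrics:
            return metrics['step']
        if log_train_step_metrics:
            # Training-step logging: key the point to the raw batch counter so that
            # runs with accumulate_grad_batches > 1 do not collapse several calls
            # onto the same TensorBoard step.
            return trainer.total_batch_idx
        # Validation, test and epoch-level logging keep the optimizer-step counter;
        # the real method also attaches trainer.current_epoch to the metrics here.
        return trainer.global_step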

View File

@@ -21,48 +21,48 @@ from typing import Dict, Iterable, List, Optional, Union
import torch
from torch.utils.data import DataLoader
from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.core.datamodule import LightningDataModule
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.core.step_result import Result, EvalResult
from pytorch_lightning.core.step_result import EvalResult, Result
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.plugins.plugin_connector import PluginConnector
from pytorch_lightning.profiler import BaseProfiler
from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
from pytorch_lightning.trainer.configuration_validator import ConfigValidator
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
from pytorch_lightning.trainer.connectors.data_connector import DataConnector
from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
from pytorch_lightning.trainer.connectors.env_vars_connector import overwrite_by_env_vars
from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
from pytorch_lightning.trainer.properties import TrainerProperties
from pytorch_lightning.trainer.states import TrainerState, trainer_state
from pytorch_lightning.trainer.training_loop import TrainLoop
from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.debugging import InternalDebugger
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
from pytorch_lightning.trainer.training_loop import TrainLoop
from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
from pytorch_lightning import _logger as log
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
from pytorch_lightning.trainer.connectors.data_connector import DataConnector
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.model_utils import is_overridden
from pytorch_lightning.trainer.properties import TrainerProperties
from pytorch_lightning.plugins.plugin_connector import PluginConnector
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
from pytorch_lightning.utilities.memory import recursive_detach
from pytorch_lightning.utilities.model_utils import is_overridden
# warnings to ignore in trainer
warnings.filterwarnings(
@@ -385,7 +385,7 @@ class Trainer(
logger,
flush_logs_every_n_steps,
log_every_n_steps,
move_metrics_to_cpu
move_metrics_to_cpu,
)
# init debugging flags

View File

@@ -21,13 +21,13 @@ from unittest.mock import ANY, call
import pytest
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer, Callback
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.loggers import (
TensorBoardLogger,
CometLogger,
MLFlowLogger,
NeptuneLogger,
TensorBoardLogger,
TestTubeLogger,
CometLogger,
WandbLogger,
)
from pytorch_lightning.loggers.base import DummyExperiment
@@ -124,7 +124,7 @@ def _test_loggers_fit_test(tmpdir, logger_class):
if logger_class == TensorBoardLogger:
expected = [
(0, ['hp_metric']),
(0, ['epoch', 'train_some_val']),
(0, ['train_some_val']),
(0, ['early_stop_on', 'epoch', 'val_acc']),
(0, ['hp_metric']),
(1, ['epoch', 'test_acc', 'test_loss'])
@@ -132,7 +132,7 @@ def _test_loggers_fit_test(tmpdir, logger_class):
assert log_metric_names == expected
else:
expected = [
(0, ['epoch', 'train_some_val']),
(0, ['train_some_val']),
(0, ['early_stop_on', 'epoch', 'val_acc']),
(1, ['epoch', 'test_acc', 'test_loss'])
]
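
The `'epoch'` key disappears from these training-step expectations because, after the LoggerConnector change above, `epoch` is only attached to the metrics in the `global_step` branch, i.e. when `log_metrics` is not called from a training step.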

View File

@@ -14,6 +14,7 @@
import os
from argparse import Namespace
from distutils.version import LooseVersion
from unittest import mock
import pytest
import torch
@@ -23,7 +24,7 @@ from tensorboard.backend.event_processing.event_accumulator import EventAccumula
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger
from tests.base import EvalModelTemplate, BoringModel
from tests.base import BoringModel, EvalModelTemplate
@pytest.mark.skipif(
@@ -201,3 +202,63 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir):
' attribute is not set or `input_array` was not given'
):
logger.log_graph(model)
@mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics')
@pytest.mark.parametrize('expected', [
    ([5, 11, 17]),
])
def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir):
    """
    Tests to ensure that TensorBoard logs properly when accumulate_grad_batches > 1
    """
    class TestModel(BoringModel):
        _count = 0
        _indexes = []

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('count', self._count, on_step=True, on_epoch=True)
            self.log('loss', loss, on_step=True, on_epoch=True)

            if self.trainer.logger_connector.should_update_logs:
                self._indexes.append(self._count)

            self._count += 1
            return loss

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('val_loss', loss, on_step=True, on_epoch=True)
            return loss

        def configure_optimizers(self):
            optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001)
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
            return [optimizer], [lr_scheduler]

    model = TestModel()
    model.training_epoch_end = None
    model.validation_epoch_end = None

    logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False)

    accumulate_grad_batches = 2
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=12,
        limit_val_batches=12,
        max_epochs=3,
        gpus=0,
        accumulate_grad_batches=accumulate_grad_batches,
        logger=[logger_0],
        log_every_n_steps=3,
    )
    trainer.fit(model)

    mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]]
    assert mock_count_epochs == expected

    mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]]
    assert model._indexes == mock_count_steps
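
Two details the assertions rely on, spelled out: `_count` increments once per training batch, so `_indexes` collects the `total_batch_idx` values at which `should_update_logs` fired, and with this fix those are exactly the `step` values the mocked `log_metrics` receives for `count_step`. The parametrized `expected` for `count_epoch` stays in optimizer-step units because epoch-aggregated metrics are still keyed to `global_step`. A hedged sanity check of that arithmetic (not part of the test):

    # 3 epochs of 12 training batches with accumulate_grad_batches=2
    # -> 6 optimizer steps per epoch; the epoch-level metrics land on the
    # last optimizer-step index of each epoch.
    epochs, batches_per_epoch, accumulate = 3, 12, 2
    steps_per_epoch = batches_per_epoch // accumulate
    expected = [(e + 1) * steps_per_epoch - 1 for e in range(epochs)]
    assert expected == [5, 11, 17]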