diff --git a/CHANGELOG.md b/CHANGELOG.md
index d63a0ad83c..c87c3811a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -100,6 +100,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685))
 
+- Changed `LoggerConnector.log_metrics` to use `total_batch_idx` instead of `global_step` when logging during training steps ([#4738](https://github.com/PyTorchLightning/pytorch-lightning/pull/4738))
+
 
 ### Fixed
 
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 9386d428b1..cab08edd58 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -95,14 +95,14 @@ class LoggerConnector:
         if self._current_stage is not None:
             self._cached_results[self._current_stage].cache_result()
 
-    def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
+    def on_trainer_init(self, logger, flush_logs_every_n_steps: int,
+                        log_every_n_steps: int, move_metrics_to_cpu: bool):
         # logging
         self.configure_logger(logger)
         # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders
-        # and assign here the desired value
+        #  and assign here the desired value
         self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
         self.trainer.log_every_n_steps = log_every_n_steps
-
         self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
         self.trainer.split_idx = None
 
@@ -181,7 +181,7 @@ class LoggerConnector:
         self.logged_metrics.update(logged_metrics_tmp)
         self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)
 
-    def log_metrics(self, metrics, grad_norm_dic, step=None):
+    def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
         """Logs the metric dict passed in.
         If `step` parameter is None and `step` key is presented is metrics,
         uses metrics["step"] as a step
@@ -190,6 +190,8 @@ class LoggerConnector:
            metrics (dict): Metric values
            grad_norm_dic (dict): Gradient norms
            step (int): Step for which metrics should be logged. Default value corresponds to `self.global_step`
+            log_train_step_metrics (bool): Used to track whether `log_metrics` is being called during a training step.
+                During training steps, metrics are logged at `total_batch_idx` (to account for accumulated gradients); `global_step` is used everywhere else.
""" # add gpu memory if self.trainer.on_gpu and self.trainer.log_gpu_memory: @@ -207,8 +209,11 @@ class LoggerConnector: elif step is None: # added metrics by Lightning for convenience - scalar_metrics['epoch'] = self.trainer.current_epoch - step = self.trainer.global_step + if log_train_step_metrics: + step = self.trainer.total_batch_idx + else: + scalar_metrics['epoch'] = self.trainer.current_epoch + step = self.trainer.global_step # log actual metrics if self.trainer.logger is not None: @@ -619,5 +624,5 @@ class LoggerConnector: metrics = self.cached_results.get_latest_batch_log_metrics() grad_norm_dic = batch_output.grad_norm_dic if len(metrics) > 0 or len(grad_norm_dic) > 0: - self.log_metrics(metrics, grad_norm_dic) + self.log_metrics(metrics, grad_norm_dic, log_train_step_metrics=True) self.callback_metrics.update(metrics) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 43babc0b34..b55756109a 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -21,48 +21,48 @@ from typing import Dict, Iterable, List, Optional, Union import torch from torch.utils.data import DataLoader +from pytorch_lightning import _logger as log +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector +from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.memory import ModelSummary -from pytorch_lightning.core.step_result import Result, EvalResult +from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.loggers import LightningLoggerBase +from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import ConfigValidator +from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector +from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector +from pytorch_lightning.trainer.connectors.data_connector import DataConnector +from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector from pytorch_lightning.trainer.connectors.env_vars_connector import overwrite_by_env_vars +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from pytorch_lightning.trainer.connectors.model_connector import ModelConnector +from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector +from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector +from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector +from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector +from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin +from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin from pytorch_lightning.trainer.optimizers 
import TrainerOptimizersMixin +from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.trainer.states import TrainerState, trainer_state +from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin +from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop -from pytorch_lightning.trainer.training_loop import TrainLoop -from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector -from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector -from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector -from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.trainer.connectors.model_connector import ModelConnector -from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector -from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector -from pytorch_lightning import _logger as log -from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector -from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector -from pytorch_lightning.trainer.connectors.data_connector import DataConnector -from pytorch_lightning.utilities.cloud_io import load as pl_load -from pytorch_lightning.utilities.model_utils import is_overridden -from pytorch_lightning.trainer.properties import TrainerProperties -from pytorch_lightning.plugins.plugin_connector import PluginConnector -from pytorch_lightning.accelerators.accelerator import Accelerator -from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator from pytorch_lightning.utilities.memory import recursive_detach +from pytorch_lightning.utilities.model_utils import is_overridden # warnings to ignore in trainer warnings.filterwarnings( @@ -385,7 +385,7 @@ class Trainer( logger, flush_logs_every_n_steps, log_every_n_steps, - move_metrics_to_cpu + move_metrics_to_cpu, ) # init debugging flags diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 386b8f1e23..5fd166afce 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -21,13 +21,13 @@ from unittest.mock import ANY, call import pytest import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, Callback +from pytorch_lightning import Callback, Trainer from pytorch_lightning.loggers import ( - TensorBoardLogger, + CometLogger, MLFlowLogger, NeptuneLogger, + TensorBoardLogger, TestTubeLogger, - CometLogger, WandbLogger, ) from pytorch_lightning.loggers.base import DummyExperiment @@ -124,7 +124,7 @@ def _test_loggers_fit_test(tmpdir, logger_class): if logger_class == TensorBoardLogger: expected = [ (0, ['hp_metric']), - (0, ['epoch', 'train_some_val']), + (0, ['train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (0, ['hp_metric']), (1, ['epoch', 
'test_acc', 'test_loss']) @@ -132,7 +132,7 @@ def _test_loggers_fit_test(tmpdir, logger_class): assert log_metric_names == expected else: expected = [ - (0, ['epoch', 'train_some_val']), + (0, ['train_some_val']), (0, ['early_stop_on', 'epoch', 'val_acc']), (1, ['epoch', 'test_acc', 'test_loss']) ] diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index b7688b7815..15a024003e 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -14,6 +14,7 @@ import os from argparse import Namespace from distutils.version import LooseVersion +from unittest import mock import pytest import torch @@ -23,7 +24,7 @@ from tensorboard.backend.event_processing.event_accumulator import EventAccumula from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.loggers import TensorBoardLogger -from tests.base import EvalModelTemplate, BoringModel +from tests.base import BoringModel, EvalModelTemplate @pytest.mark.skipif( @@ -201,3 +202,63 @@ def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir): ' attribute is not set or `input_array` was not given' ): logger.log_graph(model) + + +@mock.patch('pytorch_lightning.loggers.TensorBoardLogger.log_metrics') +@pytest.mark.parametrize('expected', [ + ([5, 11, 17]), +]) +def test_tensorboard_with_accummulated_gradients(mock_log_metrics, expected, tmpdir): + """ + Tests to ensure that tensorboard log properly when accumulated_gradients > 1 + """ + class TestModel(BoringModel): + _count = 0 + _indexes = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('count', self._count, on_step=True, on_epoch=True) + self.log('loss', loss, on_step=True, on_epoch=True) + + if self.trainer.logger_connector.should_update_logs: + self._indexes.append(self._count) + + self._count += 1 + return loss + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('val_loss', loss, on_step=True, on_epoch=True) + return loss + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=.001) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] + + model = TestModel() + model.training_epoch_end = None + model.validation_epoch_end = None + + logger_0 = TensorBoardLogger(tmpdir, default_hp_metric=False) + + accumulate_grad_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=12, + limit_val_batches=12, + max_epochs=3, + gpus=0, + accumulate_grad_batches=accumulate_grad_batches, + logger=[logger_0], + log_every_n_steps=3, + ) + trainer.fit(model) + + mock_count_epochs = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_epoch" in m[2]["metrics"]] + assert mock_count_epochs == expected + mock_count_steps = [m[2]["step"] for m in mock_log_metrics.mock_calls if "count_step" in m[2]["metrics"]] + assert model._indexes == mock_count_steps
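
The behavioral core of this patch is the step selection inside `LoggerConnector.log_metrics`. The sketch below is illustrative only and not part of the patch: `_FakeTrainer` and `choose_step` are hypothetical stand-ins that mirror the new branch, assuming only that the trainer exposes `total_batch_idx`, `global_step`, and `current_epoch` as pytorch-lightning does.

    # Hypothetical sketch (not part of the patch): mirrors the step selection
    # added to LoggerConnector.log_metrics above.
    class _FakeTrainer:
        def __init__(self, total_batch_idx, global_step, current_epoch):
            self.total_batch_idx = total_batch_idx
            self.global_step = global_step
            self.current_epoch = current_epoch

    def choose_step(trainer, scalar_metrics, step=None, log_train_step_metrics=False):
        if step is None:
            if log_train_step_metrics:
                # training-step logging: the raw batch counter keeps metrics logged
                # while gradients are still accumulating on distinct steps
                step = trainer.total_batch_idx
            else:
                # everywhere else: previous behaviour (epoch tag + global_step)
                scalar_metrics['epoch'] = trainer.current_epoch
                step = trainer.global_step
        return step

    # Example values only: with gradient accumulation, total_batch_idx advances on
    # every batch while global_step advances only when the optimizer actually steps.
    trainer = _FakeTrainer(total_batch_idx=5, global_step=2, current_epoch=0)
    assert choose_step(trainer, {}, log_train_step_metrics=True) == 5
    assert choose_step(trainer, {}, log_train_step_metrics=False) == 2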