From 4a01fd048cebb65405021d3f24ae4dc07cb735e6 Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 12 Nov 2020 15:59:01 +0000 Subject: [PATCH] [FIX] Average Pbar Metrics (#4534) * wip * update * normalize loss * update test * resolve bug * update test and add TODO * make sure it can be sync * add TODO * update sol --- pytorch_lightning/core/step_result.py | 3 ++ .../test_trainer_steps_scalar_return.py | 50 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 8f8a517d54..12f1b57f83 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -134,6 +134,9 @@ class Result(Dict): # sync across workers when using distributed training sync_fn = sync_fn or sync_ddp_if_available if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): + is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized() + # TODO: Find a way to make the reduction only once, so we don't need to clone. 
+ value = value.clone() if is_dist_initialized else value value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py index 2a66f743a4..b85646e1c2 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py @@ -14,11 +14,13 @@ """ Tests to ensure that the training loop works with a scalar """ -import torch import os +import torch +import pytest from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel +from tests.base import BoringModel def test_training_step_scalar(tmpdir): @@ -190,3 +192,49 @@ def test_train_step_epoch_end_scalar(tmpdir): opt_closure_result = trainer.train_loop.training_step_and_backward( batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'].item() == 171 + + +class DPPReduceMeanPbarModel(BoringModel): + + logged = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + loss /= loss.clone().detach() + self.log('self_log', loss, prog_bar=True, sync_dist=True) + return {"loss": loss, "progress_bar":{"loss_2": loss}} + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_dpp_reduce_mean_pbar(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + model = DPPReduceMeanPbarModel() + model.training_step_end = None + model.training_epoch_end = None + + distributed_backend = "ddp_spawn" + + trainer = Trainer( + max_epochs=1, + default_root_dir=os.getcwd(), + limit_train_batches=10, + limit_test_batches=2, + limit_val_batches=2, + distributed_backend=distributed_backend, + gpus=2, + precision=32) + + trainer.fit(model) + + # TODO: Move this 
+ test to DDP. pbar_added_metrics is empty with ddp_spawn for some reason + + pbar_added_metrics = trainer.dev_debugger.pbar_added_metrics + is_in = False + for pbar_metrics in pbar_added_metrics: + if 'loss_2' in pbar_metrics: + is_in = True + assert pbar_metrics["loss_2"].item() == 1 + if distributed_backend == "ddp": + assert is_in is True