diff --git a/docs/source/examples/example_model.py b/docs/source/examples/example_model.py index 5bf1ead73d..17c90cc81f 100644 --- a/docs/source/examples/example_model.py +++ b/docs/source/examples/example_model.py @@ -75,7 +75,8 @@ class ExampleModel(RootModule): # return loss_val, tqdm_dic output = OrderedDict({ - 'tng_loss': loss_val, + 'loss': loss_val, + 'tqdm_metrics': {} }) return output diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index c18ae6e548..ecaf2f22e1 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -394,14 +394,17 @@ class Trainer(TrainerIO): # forward pass # return a scalar value and a dic with tqdm metrics - pdb.set_trace() - if self.data_parallel: - output = self.model(data_batch, batch_nb) - output = reduce_distributed_output(output, len(self.data_parallel_device_ids)) - else: - output = self.mode(data_batch, batch_nb) + output = self.model(data_batch, batch_nb, False) + + # when DP, we need to aggregate the scalars we received as outputs + # use mean as the reduce function + if self.data_parallel: + output = reduce_distributed_output(output, len(self.gpus)) + + pdb.set_trace() + model_specific_tqdm_metrics_dic = output['tqdm_metrics'] + loss = output['loss'] - loss, model_specific_tqdm_metrics_dic = self.model.training_step(data_batch, batch_nb) self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic) # backward pass