From 6cc3f1757f39f34955d0ff2b0c2e492fc8c8f854 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 5 Oct 2019 13:35:20 -0400 Subject: [PATCH] decouple returns from each step (#307) * decoupled training metrics from logging metrics * decoupled validation metrics from log metrics * updated docs * updated docs * updated docs * Fixed test * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master * merged master --- .../RequiredTrainerInterface.md | 52 ++++- docs/Trainer/hooks.md | 10 - .../lightning_module_template.py | 3 +- pytorch_lightning/root_module/hooks.py | 3 - .../testing/lm_test_module_base.py | 3 +- .../testing/lm_test_module_mixins.py | 12 +- pytorch_lightning/trainer/trainer.py | 165 +++++++++------ tests/debug.py | 189 +++--------------- tests/test_models.py | 4 +- 9 files changed, 193 insertions(+), 248 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 3cf630db22..a849988f36 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -129,7 +129,8 @@ Dictionary or OrderedDict | key | value | is required | |---|---|---| | loss | tensor scalar | Y | -| progress | Dict for progress bar display. Must have only tensors | N | +| progress_bar | Dict for progress bar display. Must have only tensors | N | +| log | Dict of metrics to add to logger. Must have only tensors (no images, etc) | N | **Example** @@ -144,7 +145,8 @@ def training_step(self, batch, batch_nb): output = { 'loss': loss, # required - 'progress': {'training_loss': loss} # optional (MUST ALL BE TENSORS) + 'progress_bar': {'training_loss': loss}, # optional (MUST ALL BE TENSORS) + 'log': {'training_loss': loss} # optional (MUST ALL BE TENSORS) } # return a dict @@ -161,6 +163,9 @@ def training_step(self, batch, batch_nb, optimizer_idx): # do training_step with decoder ``` +You can also return a -1 instead of a dict to stop the current loop. This is useful if you want to +break out of the current training epoch early. + --- ### train_dataloader @@ -263,7 +268,7 @@ The dict you return here will be available in the `validation_end` method. | Return | description | optional | |---|---|---| -| dict | Dict or OrderedDict with metrics to display in progress bar. All keys must be tensors. | Y | +| dict | Dict or OrderedDict - passed to the validation_end step | N | **Example** @@ -327,9 +332,12 @@ The outputs here are strictly for the progress bar. If you don't need to display **Return** -| Return | description | optional | -|---|---|---| -| dict | Dict of OrderedDict with metrics to display in progress bar | Y | +Dictionary or OrderedDict + +| key | value | is required | +|---|---|---| +| progress_bar | Dict for progress bar display. Must have only tensors | N | +| log | Dict of metrics to add to logger. 
Must have only tensors (no images, etc) | N | **Example** @@ -351,7 +359,13 @@ def validation_end(self, outputs): val_loss_mean /= len(outputs) val_acc_mean /= len(outputs) tqdm_dict = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()} - return tqdm_dict + + # show val_loss and val_acc in progress bar but only log val_loss + results = { + 'progress_bar': tqdm_dict, + 'log': {'val_loss': val_loss_mean.item()} + } + return results ``` With multiple dataloaders, `outputs` will be a list of lists. The outer list contains @@ -377,7 +391,13 @@ def validation_end(self, outputs): val_loss_mean /= i val_acc_mean /= i tqdm_dict = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()} - return tqdm_dict + + # show val_loss and val_acc in progress bar but only log val_loss + results = { + 'progress_bar': tqdm_dict, + 'log': {'val_loss': val_loss_mean.item()} + } + return results ``` ### test_step @@ -490,7 +510,13 @@ def test_end(self, outputs): test_loss_mean /= len(outputs) test_acc_mean /= len(outputs) tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} - return tqdm_dict + + # show test_loss and test_acc in progress bar but only log test_loss + results = { + 'progress_bar': tqdm_dict, + 'log': {'test_loss': val_loss_mean.item()} + } + return results ``` With multiple dataloaders, `outputs` will be a list of lists. The outer list contains @@ -516,7 +542,13 @@ def test_end(self, outputs): test_loss_mean /= i test_acc_mean /= i tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} - return tqdm_dict + + # show test_loss and test_acc in progress bar but only log test_loss + results = { + 'progress_bar': tqdm_dict, + 'log': {'test_loss': val_loss_mean.item()} + } + return results ``` --- diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index d05eb630b6..726b5f5aa4 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -58,16 +58,6 @@ def on_post_performance_check(self): ``` --- -#### on_training_metrics -Called in the training loop, right before metrics are logged. -Although you can log at any time by using self.experiment, you can use -this callback to modify what will be logged. -```python -def on_training_metrics(self, metrics): - # do something before validation end -``` - ---- #### optimizer_step Calls .step() and .zero_grad for each optimizer. 
You can override this method to adjust how you do the optimizer step for each optimizer diff --git a/examples/new_project_templates/lightning_module_template.py b/examples/new_project_templates/lightning_module_template.py index bfa65e5cf7..7188d804b9 100644 --- a/examples/new_project_templates/lightning_module_template.py +++ b/examples/new_project_templates/lightning_module_template.py @@ -168,7 +168,8 @@ class LightningTemplateModel(LightningModule): val_loss_mean /= len(outputs) val_acc_mean /= len(outputs) tqdm_dict = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean} - return tqdm_dict + result = {'progress_bar': tqdm_dict} + return result # --------------------- # TRAINING SETUP diff --git a/pytorch_lightning/root_module/hooks.py b/pytorch_lightning/root_module/hooks.py index 6820fd70b4..37d41eb5c5 100644 --- a/pytorch_lightning/root_module/hooks.py +++ b/pytorch_lightning/root_module/hooks.py @@ -28,9 +28,6 @@ class ModelHooks(torch.nn.Module): def on_post_performance_check(self): pass - def on_training_metrics(self, metrics): - pass - def on_before_zero_grad(self, optimizer): """ Called after optimizer.step() and before optimizer.zero_grad() diff --git a/pytorch_lightning/testing/lm_test_module_base.py b/pytorch_lightning/testing/lm_test_module_base.py index bd7d2f8761..d14cf534d1 100644 --- a/pytorch_lightning/testing/lm_test_module_base.py +++ b/pytorch_lightning/testing/lm_test_module_base.py @@ -104,8 +104,9 @@ class LightningTestModelBase(LightningModule): if self.trainer.batch_nb % 1 == 0: output = OrderedDict({ 'loss': loss_val, - 'progress': {'some_val': loss_val * loss_val} + 'progress_bar': {'some_val': loss_val * loss_val} }) + return output if self.trainer.batch_nb % 2 == 0: return loss_val diff --git a/pytorch_lightning/testing/lm_test_module_mixins.py b/pytorch_lightning/testing/lm_test_module_mixins.py index 3831300e24..feab206f02 100644 --- a/pytorch_lightning/testing/lm_test_module_mixins.py +++ b/pytorch_lightning/testing/lm_test_module_mixins.py @@ -105,7 +105,8 @@ class LightningValidationMixin(LightningValidationStepMixin): val_acc_mean /= len(outputs) tqdm_dict = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()} - return tqdm_dict + results = {'progress_bar': tqdm_dict} + return results class LightningValidationStepMultipleDataloadersMixin: @@ -207,7 +208,8 @@ class LightningValidationMultipleDataloadersMixin(LightningValidationStepMultipl val_acc_mean /= i tqdm_dict = {'val_loss': val_loss_mean.item(), 'val_acc': val_acc_mean.item()} - return tqdm_dict + result = {'progress_bar': tqdm_dict} + return result class LightningTestStepMixin: @@ -291,7 +293,8 @@ class LightningTestMixin(LightningTestStepMixin): test_acc_mean /= len(outputs) tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} - return tqdm_dict + result = {'progress_bar': tqdm_dict} + return result class LightningTestStepMultipleDataloadersMixin: @@ -384,4 +387,5 @@ class LightningTestMultipleDataloadersMixin(LightningTestStepMultipleDataloaders test_acc_mean /= i tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} - return tqdm_dict + result = {'progress_bar': tqdm_dict} + return result diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index cb0e9b858c..034c2db50d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -553,7 +553,6 @@ class Trainer(TrainerIO): :param model: PT model :param dataloaders: list of PT dataloaders :param 
max_batches: Scalar - :param dataloader_idx: :param test: boolean :return: """ @@ -582,7 +581,10 @@ class Trainer(TrainerIO): # ----------------- # RUN EVALUATION STEP # ----------------- - output = self.__evaluation_forward(model, batch, batch_idx, dataloader_idx, + output = self.__evaluation_forward(model, + batch, + batch_idx, + dataloader_idx, test) # track outputs for collation @@ -704,8 +706,6 @@ class Trainer(TrainerIO): task = int(os.environ['SLURM_LOCALID']) self.ddp_train(task, model) else: - nb_gpus = self.nb_requested_gpus - nb_tasks = self.nb_slurm_tasks mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model, )) # 1 gpu or dp option triggers training using DP module @@ -1054,7 +1054,8 @@ class Trainer(TrainerIO): # --------------- # RUN TRAIN STEP # --------------- - batch_result, grad_norm_dic = self.__run_training_batch(batch, batch_nb) + output = self.__run_training_batch(batch, batch_nb) + batch_result, grad_norm_dic, batch_step_metrics = output early_stop_epoch = batch_result == -1 # --------------- @@ -1073,29 +1074,9 @@ class Trainer(TrainerIO): # when metrics should be logged if batch_nb % self.row_log_interval == 0 or early_stop_epoch: - # count items in memory - # nb_params, nb_tensors = count_mem_items() - model = self.__get_model() - metrics = self.__training_tqdm_dict - - # add gpu memory - if self.on_gpu and self.log_gpu_memory is not None: - mem_map = memory.get_memory_profile(mode=self.log_gpu_memory) - metrics.update(mem_map) - - # add norms - metrics.update(grad_norm_dic) - - if self.__is_function_implemented('on_training_metrics'): - model.on_training_metrics(metrics) - - # log metrics - scalar_metrics = self.__metrics_to_scalars( - metrics, blacklist=self.__log_vals_blacklist()) - if self.proc_rank == 0 and self.logger is not None: - self.logger.log_metrics(scalar_metrics, step_num=self.global_step) - self.logger.save() + # logs user requested information to logger + self.__log_metrics(batch_step_metrics, grad_norm_dic) # end epoch early if early_stop_epoch: @@ -1106,6 +1087,32 @@ class Trainer(TrainerIO): model = self.__get_model() model.on_epoch_end() + def __log_metrics(self, metrics, grad_norm_dic): + """ + Logs the metric dict passed in + :param metrics: + :param grad_norm_dic: + :return: + """ + # added metrics by Lightning for convenience + metrics['epoch'] = self.current_epoch + + # add gpu memory + if self.on_gpu and self.log_gpu_memory: + mem_map = memory.get_memory_profile() + metrics.update(mem_map) + + # add norms + metrics.update(grad_norm_dic) + + # turn all tensors to scalars + scalar_metrics = self.__metrics_to_scalars(metrics) + + # log actual metrics + if self.proc_rank == 0 and self.logger is not None: + self.logger.log_metrics(scalar_metrics, step_num=self.global_step) + self.logger.save() + def test(self, model=None): if model is not None: self.testing = True @@ -1113,7 +1120,7 @@ class Trainer(TrainerIO): else: self.__run_evaluation(test=True) - def __metrics_to_scalars(self, metrics, blacklist=set()): + def __metrics_to_scalars(self, metrics): new_metrics = {} for k, v in metrics.items(): if type(v) is torch.Tensor: @@ -1122,9 +1129,6 @@ class Trainer(TrainerIO): if type(v) is dict: v = self.__metrics_to_scalars(v) - if k not in blacklist: - new_metrics[k] = float(v) - return new_metrics def __log_vals_blacklist(self): @@ -1193,41 +1197,64 @@ class Trainer(TrainerIO): else: output = self.model.training_step(*args) - # --------------- - # TQDM metrics - # --------------- + # format and reduce outputs accordingly + loss, 
progress_bar_metrics, log_metrics = self.__process_output(output, train=True) + return loss, progress_bar_metrics, log_metrics + + def __process_output(self, output, train=False): + """ + Reduces output according to the training mode. + Separates loss from logging and tqdm metrics + :param output: + :return: + """ try: - progress_output = output['progress'] + progress_output = output['progress_bar'] # reduce progress metrics for tqdm when using dp - if self.use_dp or self.use_ddp2: + if train and self.use_dp or self.use_ddp2: nb_gpus = self.num_gpus progress_output = reduce_distributed_output(progress_output, nb_gpus) - model_specific_tqdm_metrics_dic = progress_output + progress_bar_metrics = progress_output except Exception: - model_specific_tqdm_metrics_dic = {} + progress_bar_metrics = {} + + # extract metrics to log to experiment + try: + log_output = output['log'] + + # reduce progress metrics for tqdm when using dp + if train and self.use_dp or self.use_ddp2: + nb_gpus = self.num_gpus + log_output = reduce_distributed_output(log_output, nb_gpus) + + log_metrics = log_output + except Exception: + log_metrics = {} # --------------- # EXTRACT LOSS # --------------- # if output dict doesn't have the keyword loss # then assume the output=loss if scalar - try: - loss = output['loss'] - except Exception: - if type(output) is torch.Tensor: - loss = output - else: - raise RuntimeError( - 'No `loss` value in the dictionary returned from `model.training_step()`.' - ) + loss = None + if train: + try: + loss = output['loss'] + except Exception: + if type(output) is torch.Tensor: + loss = output + else: + raise RuntimeError( + 'No `loss` value in the dictionary returned from `model.training_step()`.' + ) - # when using dp need to reduce the loss - if self.use_dp or self.use_ddp2: - loss = reduce_distributed_output(loss, self.num_gpus) + # when using dp need to reduce the loss + if self.use_dp or self.use_ddp2: + loss = reduce_distributed_output(loss, self.num_gpus) - return loss, model_specific_tqdm_metrics_dic + return loss, progress_bar_metrics, log_metrics def __clip_gradients(self): if self.gradient_clip_val > 0: @@ -1244,6 +1271,9 @@ class Trainer(TrainerIO): # track grad norms grad_norm_dic = {} + # track metrics to log + all_log_metrics = [] + if batch is None: return 0, grad_norm_dic @@ -1265,10 +1295,12 @@ class Trainer(TrainerIO): def optimizer_closure(): # forward pass output = self.__training_forward(batch, batch_nb, opt_idx) - closure_loss, model_specific_tqdm_metrics = output + closure_loss, progress_bar_metrics, log_metrics = output - # track metrics - self.__add_tqdm_metrics(model_specific_tqdm_metrics) + # track progress bar metrics + self.__add_tqdm_metrics(progress_bar_metrics) + + all_log_metrics.append(log_metrics) # accumulate loss # (if accumulate_grad_batches = 1 no effect) @@ -1321,7 +1353,7 @@ class Trainer(TrainerIO): self.batch_loss_value = 0 self.avg_loss = np.mean(self.running_loss[-100:]) - # update progressbar + # update progress bar if self.show_progress_bar: # add model specific metrics tqdm_metrics = self.__training_tqdm_dict @@ -1332,7 +1364,10 @@ class Trainer(TrainerIO): model = self.__get_model() model.on_batch_end() - return 0, grad_norm_dic + # collapse all metrics into one dict + all_log_metrics = {k: v for d in all_log_metrics for k, v in d.items()} + + return 0, grad_norm_dic, all_log_metrics def __run_evaluation(self, test=False): # when testing make sure user defined a test step @@ -1367,11 +1402,19 @@ class Trainer(TrainerIO): if self.fast_dev_run: 
max_batches = 1 - eval_out_metrics = self.evaluate(self.model, - dataloaders, - max_batches, - test) - self.__add_tqdm_metrics(eval_out_metrics) + # run evaluation + eval_results = self.evaluate(self.model, + dataloaders, + max_batches, + test) + + _, progress_bar_metrics, log_metrics = self.__process_output(eval_results) + + # add metrics to prog bar + self.__add_tqdm_metrics(progress_bar_metrics) + + # log metrics + self.__log_metrics(log_metrics, {}) # hook model.on_post_performance_check() diff --git a/tests/debug.py b/tests/debug.py index a80da5b676..bce19eda2e 100644 --- a/tests/debug.py +++ b/tests/debug.py @@ -14,6 +14,7 @@ from torch.utils.data import DataLoader from torchvision.datasets import MNIST import numpy as np import pdb +from . import test_models class CoolModel(pl.LightningModule): @@ -59,156 +60,6 @@ class CoolModel(pl.LightningModule): return DataLoader(MNIST('path/to/save', train=False), batch_size=32) -def get_model(use_test_model=False): - # set up model with these hyperparams - hparams = get_hparams() - - if use_test_model: - model = LightningTestModel(hparams) - else: - model = LightningTemplateModel(hparams) - - return model, hparams - - -def get_exp(debug=True, version=None): - # set up exp object without actually saving logs - root_dir = os.path.dirname(os.path.realpath(__file__)) - save_dir = os.path.join(root_dir, 'save_dir') - exp = Experiment(debug=debug, save_dir=save_dir, name='tests_tt_dir', version=version) - return exp - - -def init_save_dir(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - save_dir = os.path.join(root_dir, 'save_dir') - - if os.path.exists(save_dir): - shutil.rmtree(save_dir) - - os.makedirs(save_dir, exist_ok=True) - - return save_dir - - -def clear_save_dir(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - save_dir = os.path.join(root_dir, 'save_dir') - if os.path.exists(save_dir): - shutil.rmtree(save_dir) - - -def load_model(exp, save_dir, on_gpu, map_location=None, module_class=LightningTemplateModel): - - # load trained model - tags_path = exp.get_data_path(exp.name, exp.version) - tags_path = os.path.join(tags_path, 'meta_tags.csv') - - checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x] - weights_dir = os.path.join(save_dir, checkpoints[0]) - - trained_model = module_class.load_from_metrics(weights_path=weights_dir, - tags_csv=tags_path, - on_gpu=on_gpu, - ) - - assert trained_model is not None, 'loading model failed' - - return trained_model - - -def run_prediction(dataloader, trained_model): - # run prediction on 1 batch - for batch in dataloader: - break - - x, y = batch - x = x.view(x.size(0), -1) - - y_hat = trained_model(x) - - # acc - labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - val_acc = torch.tensor(val_acc) - val_acc = val_acc.item() - assert val_acc > 0.70, 'this model is expected to get > 0.7 in test set (it got %f)' % val_acc - - -# ------------------------------------------------------------------------ -def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True): - save_dir = init_save_dir() - - # exp file to get meta - exp = get_exp(False) - exp.argparse(hparams) - exp.save() - - # exp file to get weights - checkpoint = ModelCheckpoint(save_dir) - - # add these to the trainer options - trainer_options['checkpoint_callback'] = checkpoint - trainer_options['experiment'] = exp - - # fit model - trainer = Trainer(**trainer_options) - result = trainer.fit(model) - - # correct result and ok accuracy - assert 
result == 1, 'amp + ddp model failed sto complete' - - # test model loading - pretrained_model = load_model(exp, save_dir, on_gpu) - - # test new model accuracy - run_prediction(model.test_dataloader, pretrained_model) - - if trainer.use_ddp: - # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() - - # test HPC loading / saving - trainer.hpc_save(save_dir, exp) - trainer.hpc_load(save_dir, on_gpu=on_gpu) - - clear_save_dir() - - -def assert_ok_val_acc(trainer): - # this model should get 0.80+ acc - acc = trainer.training_tqdm_dict['val_acc'] - assert acc > 0.50, f'model failed to get expected 0.50 validation accuracy. Got: {acc}' - - -def assert_ok_test_acc(trainer): - # this model should get 0.80+ acc - acc = trainer.training_tqdm_dict['test_acc'] - assert acc > 0.50, f'model failed to get expected 0.50 validation accuracy. Got: {acc}' - - -def get_hparams(continue_training=False, hpc_exp_number=0): - root_dir = os.path.dirname(os.path.realpath(__file__)) - - args = { - 'drop_prob': 0.2, - 'batch_size': 32, - 'in_features': 28 * 28, - 'learning_rate': 0.001 * 8, - 'optimizer_name': 'adam', - 'data_root': os.path.join(root_dir, 'mnist'), - 'out_features': 10, - 'hidden_dim': 1000} - - if continue_training: - args['test_tube_do_checkpoint_load'] = True - args['hpc_exp_number'] = hpc_exp_number - - hparams = Namespace(**args) - return hparams - - def main(): """ Make sure DDP + AMP continue training correctly @@ -218,19 +69,45 @@ def main(): Make sure DDP2 works :return: """ - os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) - model, hparams = get_model() + hparams = test_models.get_hparams() + model = LightningTestModel(hparams) + + save_dir = test_models.init_save_dir() + + # logger file to get meta + logger = test_models.get_test_tube_logger(False) + logger.log_hyperparams(hparams) + logger.save() + + # logger file to get weights + checkpoint = ModelCheckpoint(save_dir) + trainer_options = dict( show_progress_bar=True, max_nb_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - gpus=2, - print_weights_summary=True, - distributed_backend='ddp2' + checkpoint_callback=checkpoint, + logger=logger, + gpus=[0, 1], + distributed_backend='dp' ) - run_gpu_model_test(trainer_options, model, hparams) + # fit model + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + # correct result and ok accuracy + assert result == 1, 'training failed to complete' + pretrained_model = test_models.load_model(logger.experiment, save_dir, + module_class=LightningTestModel) + + new_trainer = Trainer(**trainer_options) + new_trainer.test(pretrained_model) + + # test we have good test accuracy + test_models.assert_ok_test_acc(new_trainer) + test_models.clear_save_dir() if __name__ == '__main__': diff --git a/tests/test_models.py b/tests/test_models.py index 9d9536e4bb..ed38dc5e85 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -401,7 +401,7 @@ def test_running_test_pretrained_model_dp(): checkpoint = ModelCheckpoint(save_dir) trainer_options = dict( - show_progress_bar=False, + show_progress_bar=True, max_nb_epochs=1, train_percent_check=0.4, val_percent_check=0.2, @@ -615,7 +615,7 @@ def test_early_stopping_cpu_model(): overfit_pct=0.20, track_grad_norm=2, print_nan_grads=True, - show_progress_bar=False, + show_progress_bar=True, logger=get_test_tube_logger(), train_percent_check=0.1, val_percent_check=0.1
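
A minimal, self-contained sketch of a LightningModule written against the return contract this patch introduces: `training_step` returns `loss` plus the optional `progress_bar` and `log` dicts, and `validation_end` returns `progress_bar`/`log` instead of a bare tqdm dict. The model, toy dataset, and metric names are illustrative (not from the patch), and the `@pl.data_loader` decorator plus the `Trainer` arguments follow the 0.5.x-era API used elsewhere in this diff; treat it as a sketch under those assumptions rather than the project's reference example.

```python
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class ToyClassifier(pl.LightningModule):
    """Tiny module that exists only to show the new step-return format."""

    def __init__(self):
        super(ToyClassifier, self).__init__()
        self.layer = torch.nn.Linear(16, 4)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_nb):
        x, y = batch
        loss = F.cross_entropy(self.forward(x), y)
        # 'loss' is required; 'progress_bar' and 'log' are optional tensor dicts
        return {
            'loss': loss,
            'progress_bar': {'train_loss': loss},
            'log': {'train_loss': loss}
        }

    def validation_step(self, batch, batch_nb):
        x, y = batch
        return {'val_loss': F.cross_entropy(self.forward(x), y)}

    def validation_end(self, outputs):
        val_loss_mean = torch.stack([out['val_loss'] for out in outputs]).mean()
        # show the mean in the progress bar and send the same value to the logger
        return {
            'progress_bar': {'val_loss': val_loss_mean},
            'log': {'val_loss': val_loss_mean}
        }

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.02)]

    # @pl.data_loader is the 0.5.x-era decorator; later versions drop it
    @pl.data_loader
    def train_dataloader(self):
        x, y = torch.randn(256, 16), torch.randint(0, 4, (256,))
        return DataLoader(TensorDataset(x, y), batch_size=32)

    @pl.data_loader
    def val_dataloader(self):
        x, y = torch.randn(64, 16), torch.randint(0, 4, (64,))
        return DataLoader(TensorDataset(x, y), batch_size=32)


if __name__ == '__main__':
    # trainer arguments restricted to ones that appear in this patch
    trainer = pl.Trainer(max_nb_epochs=1, show_progress_bar=True)
    trainer.fit(ToyClassifier())
```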
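
The trainer-side split performed by `__process_output` can also be read in isolation. The sketch below restates that logic as standalone functions (it is not the Trainer's code): the DP/DDP2 reduction is omitted because it needs a distributed context, and the function names are invented for illustration. `collapse_log_metrics` mirrors how `__run_training_batch` now merges the per-closure `all_log_metrics` list into a single dict before logging.

```python
import torch


def process_output(output, train=False):
    """Split a step output into (loss, progress_bar_metrics, log_metrics),
    in the spirit of Trainer.__process_output (DP/DDP2 reduction omitted)."""
    if isinstance(output, torch.Tensor):
        # a bare tensor from training_step is still accepted and treated as the loss
        return (output if train else None), {}, {}

    progress_bar_metrics = output.get('progress_bar', {})
    log_metrics = output.get('log', {})

    loss = None
    if train:
        if 'loss' not in output:
            raise RuntimeError(
                'No `loss` value in the dictionary returned from `model.training_step()`.')
        loss = output['loss']

    return loss, progress_bar_metrics, log_metrics


def collapse_log_metrics(per_closure_log_dicts):
    """Merge the log dicts collected across optimizer closures in one batch."""
    return {k: v for d in per_closure_log_dicts for k, v in d.items()}


if __name__ == '__main__':
    step_output = {
        'loss': torch.tensor(0.25),
        'progress_bar': {'train_loss': torch.tensor(0.25)},
        'log': {'train_loss': torch.tensor(0.25)}
    }
    loss, bar_metrics, log_metrics = process_output(step_output, train=True)
    print(loss, bar_metrics, log_metrics)
    print(collapse_log_metrics([{'train_loss': 0.25}, {'gen_loss': 0.4}]))
```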
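
Finally, what reaches `logger.log_metrics` after `__log_metrics` runs can be sketched the same way. This assumes the intended behaviour of `__metrics_to_scalars` (tensors flattened to floats, nested dicts handled recursively) together with the convenience `epoch` key and the merged grad-norm dict added by this patch; `PrintLogger`, the metric names, and the grad-norm key are stand-ins for illustration only.

```python
import torch


def metrics_to_scalars(metrics):
    """Recursively convert tensor values to plain floats before logging."""
    new_metrics = {}
    for k, v in metrics.items():
        if isinstance(v, torch.Tensor):
            v = v.item()
        if isinstance(v, dict):
            v = metrics_to_scalars(v)
        else:
            v = float(v)
        new_metrics[k] = v
    return new_metrics


def log_step_metrics(logger, log_metrics, grad_norm_dict, current_epoch, global_step):
    """Assemble and send one logging payload, mirroring Trainer.__log_metrics:
    user 'log' metrics + grad norms + the convenience 'epoch' key."""
    metrics = dict(log_metrics)
    metrics['epoch'] = current_epoch
    metrics.update(grad_norm_dict)
    logger.log_metrics(metrics_to_scalars(metrics), step_num=global_step)


class PrintLogger:
    """Stand-in with the log_metrics(metrics, step_num=...) shape the trainer calls."""

    def log_metrics(self, metrics, step_num=None):
        print('step', step_num, metrics)


if __name__ == '__main__':
    log_step_metrics(PrintLogger(),
                     {'train_loss': torch.tensor(0.25)},
                     {'grad_norm_total': 1.3},  # illustrative key, not Lightning's naming
                     current_epoch=0,
                     global_step=10)
```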