From 8e9a026bc34d8409faa572a7144c2d96a7c039ed Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Thu, 11 Feb 2021 20:02:07 +0530
Subject: [PATCH] [tests/models] refactor with BoringModel (#5507)

* update with BoringModel
* update with BoringModel
* step
* try TPU
* TPU
* update tests
* update tpu tests
* self
* fix
* dp
* update tests
* ref
* update tests
* fix tpu tests
* fix dp and run_prediction
* dp
* only dp
* Apply suggestions from code review
* Apply suggestions from code review
* Apply suggestions from code review
* Apply suggestions from code review

Co-authored-by: Jirka Borovec
---
 tests/core/test_datamodules.py           |  70 +++--
 tests/helpers/pipelines.py               |  12 +-
 tests/helpers/simple_models.py           |  14 +-
 .../data/horovod/train_default_model.py  |   6 +-
 tests/models/test_amp.py                 |  33 ++-
 tests/models/test_cpu.py                 |  39 ++-
 tests/models/test_grad_norm.py           |  12 +-
 tests/models/test_hooks.py               |  34 +--
 tests/models/test_horovod.py             |  21 +-
 tests/models/test_hparams.py             | 111 ++++----
 tests/models/test_onnx.py                |  14 +-
 tests/models/test_restore.py             | 252 +++++++++++-------
 tests/models/test_tpu.py                 |  82 +++---
 13 files changed, 397 insertions(+), 303 deletions(-)

diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py
index 1bbbe7c40f..76fdca0fed 100644
--- a/tests/core/test_datamodules.py
+++ b/tests/core/test_datamodules.py
@@ -18,13 +18,16 @@ from unittest.mock import MagicMock
 
 import pytest
 import torch
+import torch.nn.functional as F
 
 from pytorch_lightning import LightningDataModule, Trainer
 from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.trainer.states import TrainerState
 from tests.helpers import BoringDataModule, BoringModel
-from tests.helpers.utils import reset_seed
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
+from tests.helpers.utils import reset_seed, set_random_master_port
 
 
 def test_can_prepare_data(tmpdir):
@@ -190,8 +193,8 @@ def test_dm_pickle_after_init(tmpdir):
 def test_train_loop_only(tmpdir):
     reset_seed()
 
-    dm = BoringDataModule()
-    model = BoringModel()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
 
     model.validation_step = None
     model.validation_step_end = None
@@ -207,18 +210,17 @@
     )
 
     # fit model
-    result = trainer.fit(model, dm)
+    result = trainer.fit(model, datamodule=dm)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
     assert result
-    # TODO: add end-to-end test
-    # assert trainer.callback_metrics['loss'] < 0.6
+    assert trainer.callback_metrics['train_loss'] < 1.0
 
 
 def test_train_val_loop_only(tmpdir):
     reset_seed()
 
-    dm = BoringDataModule()
-    model = BoringModel()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
 
     model.validation_step = None
     model.validation_step_end = None
@@ -231,11 +233,10 @@
     )
 
     # fit model
-    result = trainer.fit(model, dm)
+    result = trainer.fit(model, datamodule=dm)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
     assert result
-    # TODO: add end-to-end test
-    # assert trainer.callback_metrics['train_loss'] < 0.6
+    assert trainer.callback_metrics['train_loss'] < 1.0
 
 
 def test_dm_checkpoint_save(tmpdir):
@@ -294,8 +295,8 @@ def test_test_loop_only(tmpdir):
 def test_full_loop(tmpdir):
     reset_seed()
 
-    dm = BoringDataModule()
-    model = BoringModel()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
 
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -311,8 +312,7 @@
 
     # test
     result = trainer.test(datamodule=dm)
-    # TODO: add end-to-end test
-    # assert result[0]['test_acc'] > 0.8
+    assert result[0]['test_acc'] > 0.6
 
 
 def test_trainer_attached_to_dm(tmpdir):
@@ -346,8 +346,8 @@ def test_trainer_attached_to_dm(tmpdir):
 def test_full_loop_single_gpu(tmpdir):
     reset_seed()
 
-    dm = BoringDataModule()
-    model = BoringModel()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
 
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -364,16 +364,37 @@ def test_full_loop_single_gpu(tmpdir):
 
     # test
     result = trainer.test(datamodule=dm)
-    # TODO: add end-to-end test
-    # assert result[0]['test_acc'] > 0.8
+    assert result[0]['test_acc'] > 0.6
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_full_loop_dp(tmpdir):
-    reset_seed()
+    set_random_master_port()
 
-    dm = BoringDataModule()
-    model = BoringModel()
+    class CustomClassificationModelDP(ClassificationModel):
+
+        def _step(self, batch, batch_idx):
+            x, y = batch
+            logits = self(x)
+            return {'logits': logits, 'y': y}
+
+        def training_step(self, batch, batch_idx):
+            _, y = batch
+            out = self._step(batch, batch_idx)
+            out['loss'] = F.cross_entropy(out['logits'], y)
+            return out
+
+        def validation_step(self, batch, batch_idx):
+            return self._step(batch, batch_idx)
+
+        def test_step(self, batch, batch_idx):
+            return self._step(batch, batch_idx)
+
+        def test_step_end(self, outputs):
+            self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))
+
+    dm = ClassifDataModule()
+    model = CustomClassificationModelDP()
 
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -385,14 +406,13 @@
     )
 
     # fit model
-    result = trainer.fit(model, dm)
+    result = trainer.fit(model, datamodule=dm)
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
     assert result
 
     # test
     result = trainer.test(datamodule=dm)
-    # TODO: add end-to-end test
-    # assert result[0]['test_acc'] > 0.8
+    assert result[0]['test_acc'] > 0.6
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py
index f5d9823600..3f131ab055 100644
--- a/tests/helpers/pipelines.py
+++ b/tests/helpers/pipelines.py
@@ -102,9 +102,9 @@ def run_model_test(
 
 def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
     if isinstance(trained_model, BoringModel):
-        return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc)
+        return _boring_model_run_prediction(trained_model, dataloader, min_acc)
     else:
-        return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc)
+        return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc=min_acc)
 
 
 def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
@@ -135,11 +135,15 @@ def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min
     assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
 
 
-def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
+# TODO: This test compares a loss value with a min accuracy - complete non-sense!
+# create BoringModels that make actual predictions!
+def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25):
     # run prediction on 1 batch
+    trained_model.cpu()
     batch = next(iter(dataloader))
+
     with torch.no_grad():
         output = trained_model(batch)
-        acc = trained_model.loss(batch, output)
+    acc = trained_model.loss(batch, output)
     assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}"
 
diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index ebc70690f4..9288a3c802 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -51,18 +51,21 @@ class ClassificationModel(LightningModule):
         x, y = batch
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
-        self.log('train_Acc', self.train_acc(logits, y), prog_bar=True)
+        self.log('train_loss', loss, prog_bar=True)
+        self.log('train_acc', self.train_acc(logits, y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
-        self.log('valid_Acc', self.valid_acc(logits, y), prog_bar=True)
+        self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
+        self.log('val_acc', self.valid_acc(logits, y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
-        self.log('test_Acc', self.test_acc(logits, y), prog_bar=True)
+        self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
+        self.log('test_acc', self.test_acc(logits, y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
@@ -98,15 +101,18 @@ class RegressionModel(LightningModule):
         x, y = batch
         out = self.forward(x)
         loss = F.mse_loss(out, y)
+        self.log('train_loss', loss, prog_bar=False)
         self.log('train_MSE', self.train_mse(out, y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         out = self.forward(x)
-        self.log('valid_MSE', self.valid_mse(out, y), prog_bar=True)
+        self.log('val_loss', F.mse_loss(out, y), prog_bar=False)
+        self.log('val_MSE', self.valid_mse(out, y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         out = self.forward(x)
+        self.log('test_loss', F.mse_loss(out, y), prog_bar=False)
         self.log('test_MSE', self.test_mse(out, y), prog_bar=True)
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 24ddbd24c4..93a637dda1 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -36,7 +36,7 @@ if _HOROVOD_AVAILABLE:
 else:
     print('You requested to import Horovod which is missing or not supported for your OS.')
 
-from tests.base import EvalModelTemplate  # noqa: E402
+from tests.helpers import BoringModel  # noqa: E402
 from tests.helpers.pipelines import run_prediction  # noqa: E402
 from tests.helpers.utils import reset_seed, set_random_master_port  # noqa: E402
 
@@ -53,7 +53,7 @@ def run_test_from_config(trainer_options):
     ckpt_path = trainer_options['weights_save_path']
     trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])
 
-    model = EvalModelTemplate()
+    model = BoringModel()
     trainer = Trainer(**trainer_options)
     trainer.fit(model)
 
@@ -66,7 +66,7 @@ def run_test_from_config(trainer_options):
         return
 
     # test model loading
-    pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
+    pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
 
     # test new model accuracy
     test_loaders = model.test_dataloader()
diff --git a/tests/models/test_amp.py
b/tests/models/test_amp.py index 8a5b6d005c..8d620bb563 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -24,7 +24,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -41,7 +41,7 @@ def test_amp_single_gpu_dp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -60,7 +60,7 @@ def test_amp_single_gpu_ddp_spawn(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -81,7 +81,7 @@ def test_amp_multi_gpu_dp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -100,7 +100,7 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -108,13 +108,13 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@mock.patch.dict(os.environ, {"SLURM_LOCALID": "0"}) def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" # simulate setting slurm flags tutils.set_random_master_port() - os.environ['SLURM_LOCALID'] = str(0) - model = EvalModelTemplate() + model = BoringModel() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -156,7 +156,7 @@ def test_cpu_model_with_amp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() with pytest.raises((MisconfigurationException, ModuleNotFoundError)): tpipes.run_model_test(trainer_options, model, on_gpu=False) @@ -165,7 +165,7 @@ def test_cpu_model_with_amp(tmpdir): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_amp_without_apex(tmpdir): """Check that even with apex amp type without requesting precision=16 the amp backend is void.""" - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, @@ -190,19 +190,24 @@ def test_amp_without_apex(tmpdir): def test_amp_with_apex(tmpdir): """Check calling apex scaling in training.""" - class CustomModel(EvalModelTemplate): + class CustomModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) def configure_optimizers(self): - optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) - optimizer2 = optim.SGD(self.parameters(), lr=self.learning_rate) + optimizer1 = optim.Adam(self.parameters(), lr=0.01) + optimizer2 = optim.SGD(self.parameters(), lr=0.01) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] model = CustomModel() + model.training_epoch_end = None + trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, + max_steps=5, precision=16, amp_backend='apex', gpus=1, @@ -210,7 +215,7 @@ def test_amp_with_apex(tmpdir): assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert trainer.state == TrainerState.FINISHED, f"Training 
failed with {trainer.state}" - assert trainer.dev_debugger.count_events('AMP') == 20 + assert trainer.dev_debugger.count_events('AMP') == 10 assert isinstance(trainer.lr_schedulers[0]['scheduler'].optimizer, optim.Adam) assert isinstance(trainer.lr_schedulers[1]['scheduler'].optimizer, optim.SGD) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index c3ac26bdeb..e8970c2df4 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -23,7 +23,6 @@ import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.simple_models import ClassificationModel @@ -101,10 +100,12 @@ def test_early_stopping_cpu_model(tmpdir): class ModelTrainVal(BoringModel): - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) + def validation_step(self, *args, **kwargs): + output = super().validation_step(*args, **kwargs) + self.log('val_loss', output['x']) + return output + tutils.reset_seed() stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( callbacks=[stopping], @@ -198,13 +199,15 @@ def test_running_test_after_fitting(tmpdir): class ModelTrainValTest(BoringModel): - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) + def validation_step(self, *args, **kwargs): + output = super().validation_step(*args, **kwargs) + self.log('val_loss', output['x']) + return output - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) + def test_step(self, *args, **kwargs): + output = super().test_step(*args, **kwargs) + self.log('test_loss', output['y']) + return output model = ModelTrainValTest() @@ -244,9 +247,10 @@ def test_running_test_no_val(tmpdir): def val_dataloader(self): pass - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) + def test_step(self, *args, **kwargs): + output = super().test_step(*args, **kwargs) + self.log('test_loss', output['y']) + return output model = ModelTrainTest() @@ -297,15 +301,10 @@ def test_simple_cpu(tmpdir): def test_cpu_model(tmpdir): """Make sure model trains on CPU.""" trainer_options = dict( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - limit_train_batches=0.4, - limit_val_batches=0.4 + default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=4, limit_val_batches=4 ) - model = EvalModelTemplate() - + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 10cfa0cb9a..4d04911ffa 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -20,11 +20,11 @@ import pytest from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel from tests.helpers.utils import reset_seed -class ModelWithManualGradTracker(EvalModelTemplate): +class ModelWithManualGradTracker(BoringModel): def 
__init__(self, norm_type, *args, **kwargs): super().__init__(*args, **kwargs) @@ -36,9 +36,9 @@ class ModelWithManualGradTracker(EvalModelTemplate): def training_step(self, batch, batch_idx, optimizer_idx=None): # just return a loss, no log or progress bar meta - x, y = batch - loss_val = self.loss(y, self(x.flatten(1, -1))) - return {'loss': loss_val} + output = self(batch) + loss = self.loss(batch, output) + return {'loss': loss} def on_after_backward(self): out, norms = {}, [] @@ -102,7 +102,7 @@ def test_grad_tracking_interval(tmpdir, log_every_n_steps): ) with patch.object(trainer.logger, "log_metrics") as mocked: - model = EvalModelTemplate() + model = BoringModel() trainer.fit(model) expected = trainer.global_step // log_every_n_steps grad_norm_dicts = [] diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 8e7615baa7..969597a10f 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -21,14 +21,13 @@ import torch from pytorch_lightning import Callback, Trainer from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate from tests.helpers import BoringModel, RandomDataset @pytest.mark.parametrize('max_steps', [1, 2, 3]) def test_on_before_zero_grad_called(tmpdir, max_steps): - class CurrentTestModel(EvalModelTemplate): + class CurrentTestModel(BoringModel): on_before_zero_grad_called = 0 def on_before_zero_grad(self, optimizer): @@ -40,7 +39,6 @@ def test_on_before_zero_grad_called(tmpdir, max_steps): default_root_dir=tmpdir, max_steps=max_steps, max_epochs=2, - num_sanity_val_steps=5, ) assert 0 == model.on_before_zero_grad_called trainer.fit(model) @@ -55,23 +53,24 @@ def test_training_epoch_end_metrics_collection(tmpdir): """ Test that progress bar metrics also get collected at the end of an epoch. 
""" num_epochs = 3 - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def training_step(self, *args, **kwargs): output = super().training_step(*args, **kwargs) - output['progress_bar'].update({'step_metric': torch.tensor(-1)}) - output['progress_bar'].update({'shared_metric': 100}) + self.log_dict({'step_metric': torch.tensor(-1), 'shared_metric': 100}, logger=False, prog_bar=True) return output def training_epoch_end(self, outputs): epoch = self.current_epoch # both scalar tensors and Python numbers are accepted - return { - 'progress_bar': { - f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch - 'shared_metric': 111, - } - } + self.log_dict( + { + f'epoch_metric_{epoch}': torch.tensor(epoch), + 'shared_metric': 111 + }, + logger=False, + prog_bar=True, + ) model = CurrentModel() trainer = Trainer( @@ -103,7 +102,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir): def on_train_epoch_end(self, trainer, pl_module, outputs): self.len_outputs = len(outputs[0]) - class OverriddenModel(EvalModelTemplate): + class OverriddenModel(BoringModel): def on_train_epoch_start(self): self.num_train_batches = 0 @@ -114,7 +113,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir): def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.num_train_batches += 1 - class NotOverriddenModel(EvalModelTemplate): + class NotOverriddenModel(BoringModel): def on_train_epoch_start(self): self.num_train_batches = 0 @@ -124,6 +123,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir): overridden_model = OverriddenModel() not_overridden_model = NotOverriddenModel() + not_overridden_model.training_epoch_end = None callback = LoggingCallback() trainer = Trainer( @@ -152,7 +152,7 @@ def test_transfer_batch_hook(): self.samples = data[0] self.targets = data[1] - class CurrentTestModel(EvalModelTemplate): + class CurrentTestModel(BoringModel): hook_called = False @@ -166,7 +166,7 @@ def test_transfer_batch_hook(): return data model = CurrentTestModel() - batch = CustomBatch((torch.zeros(5, 28), torch.ones(5, 1, dtype=torch.long))) + batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long))) trainer = Trainer(gpus=1) trainer.accelerator_backend = GPUAccelerator(trainer) @@ -226,7 +226,7 @@ def test_transfer_batch_hook_ddp(tmpdir): @pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)]) def test_on_train_batch_start_hook(max_epochs, batch_idx_): - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def on_train_batch_start(self, batch, batch_idx, dataloader_idx): if batch_idx == batch_idx_: diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 948fb0144d..19f39b3da4 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -30,9 +30,8 @@ from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAcc from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel from tests.helpers.advanced_models import BasicGAN -from tests.helpers.boring_model import BoringModel if _HOROVOD_AVAILABLE: import horovod @@ -173,22 +172,17 @@ def test_horovod_amp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU 
machine") def test_horovod_transfer_batch_to_gpu(tmpdir): - class TestTrainingStepModel(EvalModelTemplate): + class TestTrainingStepModel(BoringModel): def training_step(self, batch, *args, **kwargs): - x, y = batch - assert str(x.device) != 'cpu' - assert str(y.device) != 'cpu' + assert str(batch.device) != 'cpu' return super(TestTrainingStepModel, self).training_step(batch, *args, **kwargs) def validation_step(self, batch, *args, **kwargs): - x, y = batch - assert str(x.device) != 'cpu' - assert str(y.device) != 'cpu' + assert str(batch.device) != 'cpu' return super(TestTrainingStepModel, self).validation_step(batch, *args, **kwargs) - hparams = EvalModelTemplate.get_default_hparams() - model = TestTrainingStepModel(**hparams) + model = TestTrainingStepModel() trainer_options = dict( default_root_dir=str(tmpdir), @@ -205,7 +199,7 @@ def test_horovod_transfer_batch_to_gpu(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") def test_horovod_multi_optimizer(tmpdir): - model = BasicGAN(**EvalModelTemplate.get_default_hparams()) + model = BasicGAN() # fit model trainer = Trainer( @@ -342,8 +336,7 @@ def test_accuracy_metric_horovod(): # @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") # def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): -# hparams = EvalModelTemplate.get_default_hparams() -# model = EvalModelTemplate(**hparams) +# model = BoringModel() # model.configure_optimizers = model.configure_optimizers__multiple_schedulers # # num_workers = 8 diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 229c8128ae..0e32ebea09 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -21,15 +21,13 @@ import pytest import torch from fsspec.implementations.local import LocalFileSystem from omegaconf import Container, OmegaConf -from torch.nn import functional as F from torch.utils.data import DataLoader from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, AttributeDict, is_picklable -from tests.base import EvalModelTemplate -from tests.helpers import BoringModel, TrialMNIST +from tests.helpers import BoringModel, RandomDataset if _HYDRA_EXPERIMENTAL_AVAILABLE: from hydra.experimental import compose, initialize @@ -162,7 +160,7 @@ def test_explicit_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -184,7 +182,7 @@ def test_implicit_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -206,7 +204,7 @@ def test_explicit_missing_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -269,7 +267,14 @@ def test_class_nesting(): A().test() -class SubClassEvalModel(EvalModelTemplate): +class CustomBoringModel(BoringModel): + + def __init__(self, batch_size=64): + super().__init__() + self.save_hyperparameters() + + +class SubClassBoringModel(CustomBoringModel): any_other_loss = torch.nn.CrossEntropyLoss() def __init__(self, *args, subclass_arg=1200, **kwargs): @@ 
-277,18 +282,18 @@ class SubClassEvalModel(EvalModelTemplate): self.save_hyperparameters() -class SubSubClassEvalModel(SubClassEvalModel): +class SubSubClassBoringModel(SubClassBoringModel): pass -class AggSubClassEvalModel(SubClassEvalModel): +class AggSubClassBoringModel(SubClassBoringModel): def __init__(self, *args, my_loss=torch.nn.CrossEntropyLoss(), **kwargs): super().__init__(*args, **kwargs) self.save_hyperparameters() -class UnconventionalArgsEvalModel(EvalModelTemplate): +class UnconventionalArgsBoringModel(CustomBoringModel): """ A model that has unconventional names for "self", "*args" and "**kwargs". """ def __init__(obj, *more_args, other_arg=300, **more_kwargs): @@ -297,7 +302,7 @@ class UnconventionalArgsEvalModel(EvalModelTemplate): obj.save_hyperparameters() -class DictConfSubClassEvalModel(SubClassEvalModel): +class DictConfSubClassBoringModel(SubClassBoringModel): def __init__(self, *args, dict_conf=OmegaConf.create(dict(my_param='something')), **kwargs): super().__init__(*args, **kwargs) @@ -306,31 +311,31 @@ class DictConfSubClassEvalModel(SubClassEvalModel): @pytest.mark.parametrize( "cls", [ - EvalModelTemplate, - SubClassEvalModel, - SubSubClassEvalModel, - AggSubClassEvalModel, - UnconventionalArgsEvalModel, - DictConfSubClassEvalModel, + CustomBoringModel, + SubClassBoringModel, + SubSubClassBoringModel, + AggSubClassBoringModel, + UnconventionalArgsBoringModel, + DictConfSubClassBoringModel, ] ) def test_collect_init_arguments(tmpdir, cls): """ Test that the model automatically saves the arguments passed into the constructor """ extra_args = {} - if cls is AggSubClassEvalModel: + if cls is AggSubClassBoringModel: extra_args.update(my_loss=torch.nn.CosineEmbeddingLoss()) - elif cls is DictConfSubClassEvalModel: + elif cls is DictConfSubClassBoringModel: extra_args.update(dict_conf=OmegaConf.create(dict(my_param='anything'))) model = cls(**extra_args) - assert model.hparams.batch_size == 32 + assert model.hparams.batch_size == 64 model = cls(batch_size=179, **extra_args) assert model.hparams.batch_size == 179 - if isinstance(model, SubClassEvalModel): + if isinstance(model, SubClassBoringModel): assert model.hparams.subclass_arg == 1200 - if isinstance(model, AggSubClassEvalModel): + if isinstance(model, AggSubClassBoringModel): assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss) # verify that the checkpoint saved the correct values @@ -347,10 +352,10 @@ def test_collect_init_arguments(tmpdir, cls): model = cls.load_from_checkpoint(raw_checkpoint_path) assert model.hparams.batch_size == 179 - if isinstance(model, AggSubClassEvalModel): + if isinstance(model, AggSubClassBoringModel): assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss) - if isinstance(model, DictConfSubClassEvalModel): + if isinstance(model, DictConfSubClassBoringModel): assert isinstance(model.hparams.dict_conf, Container) assert model.hparams.dict_conf['my_param'] == 'anything' @@ -368,7 +373,7 @@ def _raw_checkpoint_path(trainer) -> str: return raw_checkpoint_path -class LocalVariableModelSuperLast(EvalModelTemplate): +class LocalVariableModelSuperLast(BoringModel): """ This model has the super().__init__() call at the end. 
""" def __init__(self, arg1, arg2, *args, **kwargs): @@ -378,7 +383,7 @@ class LocalVariableModelSuperLast(EvalModelTemplate): super().__init__(*args, **kwargs) # this is intentionally here at the end -class LocalVariableModelSuperFirst(EvalModelTemplate): +class LocalVariableModelSuperFirst(BoringModel): """ This model has the _auto_collect_arguments() call at the end. """ def __init__(self, arg1, arg2, *args, **kwargs): @@ -429,16 +434,17 @@ def test_collect_init_arguments_with_local_vars(cls): # assert model.hparams.my_arg == 42 -class AnotherArgModel(EvalModelTemplate): +class AnotherArgModel(BoringModel): def __init__(self, arg1): super().__init__() self.save_hyperparameters(arg1) -class OtherArgsModel(EvalModelTemplate): +class OtherArgsModel(BoringModel): def __init__(self, arg1, arg2): + super().__init__() self.save_hyperparameters(arg1, arg2) @@ -457,7 +463,7 @@ def test_single_config_models_fail(tmpdir, cls, config): @pytest.mark.parametrize("past_key", ['module_arguments']) def test_load_past_checkpoint(tmpdir, past_key): - model = EvalModelTemplate() + model = CustomBoringModel() # verify we can train trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) @@ -474,7 +480,7 @@ def test_load_past_checkpoint(tmpdir, past_key): torch.save(raw_checkpoint, raw_checkpoint_path) # verify that model loads correctly - model2 = EvalModelTemplate.load_from_checkpoint(raw_checkpoint_path) + model2 = CustomBoringModel.load_from_checkpoint(raw_checkpoint_path) assert model2.hparams.batch_size == -17 @@ -486,7 +492,7 @@ def test_hparams_pickle(tmpdir): assert ad == pickle.loads(pkl) -class UnpickleableArgsEvalModel(EvalModelTemplate): +class UnpickleableArgsBoringModel(BoringModel): """ A model that has an attribute that cannot be pickled. """ def __init__(self, foo='bar', pickle_me=(lambda x: x + 1), **kwargs): @@ -496,7 +502,7 @@ class UnpickleableArgsEvalModel(EvalModelTemplate): def test_hparams_pickle_warning(tmpdir): - model = UnpickleableArgsEvalModel() + model = UnpickleableArgsBoringModel() trainer = Trainer(default_root_dir=tmpdir, max_steps=1) with pytest.warns(UserWarning, match="attribute 'pickle_me' removed from hparams because it cannot be pickled"): trainer.fit(model) @@ -522,38 +528,15 @@ def test_hparams_save_yaml(tmpdir): assert load_hparams_from_yaml(path_yaml) == hparams -class NoArgsSubClassEvalModel(EvalModelTemplate): +class NoArgsSubClassBoringModel(CustomBoringModel): def __init__(self): super().__init__() -class SimpleNoArgsModel(LightningModule): - - def __init__(self): - super().__init__() - self.l1 = torch.nn.Linear(28 * 28, 10) - - def forward(self, x): - return torch.relu(self.l1(x.view(x.size(0), -1))) - - def training_step(self, batch, batch_nb): - x, y = batch - loss = F.cross_entropy(self(x), y) - return {'loss': loss, 'log': {'train_loss': loss}} - - def test_step(self, batch, batch_nb): - x, y = batch - loss = F.cross_entropy(self(x), y) - return {'loss': loss, 'log': {'train_loss': loss}} - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=0.02) - - @pytest.mark.parametrize("cls", [ - SimpleNoArgsModel, - NoArgsSubClassEvalModel, + BoringModel, + NoArgsSubClassBoringModel, ]) def test_model_nohparams_train_test(tmpdir, cls): """Test models that do not tae any argument in init.""" @@ -564,20 +547,20 @@ def test_model_nohparams_train_test(tmpdir, cls): default_root_dir=tmpdir, ) - train_loader = DataLoader(TrialMNIST(os.getcwd(), train=True, download=True), batch_size=32) + train_loader = DataLoader(RandomDataset(32, 64), 
batch_size=32) trainer.fit(model, train_loader) - test_loader = DataLoader(TrialMNIST(os.getcwd(), train=False, download=True), batch_size=32) + test_loader = DataLoader(RandomDataset(32, 64), batch_size=32) trainer.test(test_dataloaders=test_loader) def test_model_ignores_non_exist_kwargument(tmpdir): """Test that the model takes only valid class arguments.""" - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, batch_size=15): - super().__init__(batch_size=batch_size) + super().__init__() self.save_hyperparameters() model = LocalModel() @@ -593,11 +576,11 @@ def test_model_ignores_non_exist_kwargument(tmpdir): assert 'non_exist_kwarg' not in model.hparams -class SuperClassPositionalArgs(EvalModelTemplate): +class SuperClassPositionalArgs(BoringModel): def __init__(self, hparams): super().__init__() - self._hparams = None # pretend EvalModelTemplate did not call self.save_hyperparameters() + self._hparams = None # pretend BoringModel did not call self.save_hyperparameters() self.hparams = hparams diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index e031494361..2bd3ebf9b6 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -21,14 +21,13 @@ import torch import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from tests.base import EvalModelTemplate from tests.helpers import BoringModel def test_model_saves_with_input_sample(tmpdir): """Test that ONNX model saves with input sample and size is greater than 3 MB""" model = BoringModel() - trainer = Trainer(max_epochs=1) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -42,7 +41,7 @@ def test_model_saves_with_input_sample(tmpdir): def test_model_saves_on_gpu(tmpdir): """Test that model saves on gpu""" model = BoringModel() - trainer = Trainer(gpus=1, max_epochs=1) + trainer = Trainer(gpus=1, fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -55,7 +54,7 @@ def test_model_saves_on_gpu(tmpdir): def test_model_saves_with_example_output(tmpdir): """Test that ONNX model saves when provided with example output""" model = BoringModel() - trainer = Trainer(max_epochs=1) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -92,9 +91,10 @@ def test_model_saves_on_multi_gpu(tmpdir): progress_bar_refresh_rate=0, ) - model = EvalModelTemplate() + model = BoringModel() + model.example_input_array = torch.randn(5, 32) - tpipes.run_model_test(trainer_options, model) + tpipes.run_model_test(trainer_options, model, min_acc=0.08) file_path = os.path.join(tmpdir, "model.onnx") model.to_onnx(file_path) @@ -130,7 +130,7 @@ def test_if_inference_output_is_valid(tmpdir): model = BoringModel() model.example_input_array = torch.randn(5, 32) - trainer = Trainer(max_epochs=2) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) model.eval() diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 9420da74f0..114ebf3368 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -16,18 +16,21 @@ import logging as log import os import pickle from copy import deepcopy +from typing import Generic, TypeVar import cloudpickle import pytest import torch +import torch.nn.functional as F import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks 
import ModelCheckpoint from pytorch_lightning.trainer.states import RunningStage, TrainerState -from tests.base import EvalModelTemplate, GenericEvalModelTemplate from tests.helpers import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel class ModelTrainerPropertyParity(Callback): @@ -52,14 +55,48 @@ class ModelTrainerPropertyParity(Callback): self._check_properties(trainer, pl_module) +class ValTestLossBoringModel(BoringModel): + + def __init__(self, batch_size=4): + super().__init__() + self.save_hyperparameters() + + def validation_step(self, batch, batch_idx): + out = super().validation_step(batch, batch_idx) + self.log('val_loss', out['x']) + return out + + def test_step(self, batch, batch_idx): + out = super().test_step(batch, batch_idx) + self.log('test_loss', out['y']) + return out + + +T = TypeVar('T') + + +class GenericParentValTestLossBoringModel(Generic[T], ValTestLossBoringModel): + + def __init__(self, batch_size: int = 4): + super().__init__(batch_size=batch_size) + + +class GenericValTestLossBoringModel(GenericParentValTestLossBoringModel[int]): + pass + + def test_model_properties_resume_from_checkpoint(tmpdir): - """ Test that properties like `current_epoch` and `global_step` - in model and trainer are always the same. """ - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + """ + Test that properties like `current_epoch` and `global_step` + in model and trainer are always the same. + """ + model = BoringModel() + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer_args = dict( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=False, callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions ) @@ -73,18 +110,19 @@ def test_model_properties_resume_from_checkpoint(tmpdir): def test_try_resume_from_non_existing_checkpoint(tmpdir): """ Test that trying to resume from non-existing `resume_from_checkpoint` fail without error.""" - model = BoringModel() - checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + dm = ClassifDataModule() + model = ClassificationModel() + checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, logger=False, callbacks=[checkpoint_cb], - limit_train_batches=0.1, - limit_val_batches=0.1, + limit_train_batches=2, + limit_val_batches=2, ) # Generate checkpoint `last.ckpt` with BoringModel - trainer.fit(model) + trainer.fit(model, datamodule=dm) # `True` if resume/restore successfully else `False` assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu) assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu) @@ -99,11 +137,12 @@ class CaptureCallbacksBeforeTraining(Callback): def test_callbacks_state_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint restores callbacks that persist state. 
""" - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() callback_capture = CaptureCallbacksBeforeTraining() def get_trainer_args(): - checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer_args = dict( default_root_dir=tmpdir, max_steps=1, logger=False, callbacks=[ checkpoint, @@ -116,12 +155,12 @@ def test_callbacks_state_resume_from_checkpoint(tmpdir): # initial training trainer = Trainer(**get_trainer_args()) - trainer.fit(model) + trainer.fit(model, datamodule=dm) callbacks_before_resume = deepcopy(trainer.callbacks) # resumed training trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt")) - trainer.fit(model) + trainer.fit(model, datamodule=dm) assert len(callbacks_before_resume) == len(callback_capture.callbacks) @@ -133,23 +172,24 @@ def test_callbacks_state_resume_from_checkpoint(tmpdir): def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. """ - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} # initial training - checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer = Trainer(**args, callbacks=[checkpoint]) assert checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback - trainer.fit(model) + trainer.fit(model, datamodule=dm) # resumed training - new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) # pass in a new checkpoint object, which should take # precedence over the one in the last.ckpt file trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt")) assert checkpoint is not new_checkpoint assert new_checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback - trainer.fit(model) + trainer.fit(model, datamodule=dm) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -158,7 +198,30 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): tutils.set_random_master_port() - model = EvalModelTemplate() + class CustomClassificationModelDP(ClassificationModel): + + def _step(self, batch, batch_idx): + x, y = batch + logits = self(x) + return {'logits': logits, 'y': y} + + def training_step(self, batch, batch_idx): + _, y = batch + out = self._step(batch, batch_idx) + out['loss'] = F.cross_entropy(out['logits'], y) + return out + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step_end(self, outputs): + self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) + + dm = ClassifDataModule() + model = CustomClassificationModelDP() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -169,8 +232,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=5, + limit_val_batches=5, callbacks=[checkpoint], logger=logger, gpus=[0, 1], @@ -180,21 +243,17 @@ def 
test_running_test_pretrained_model_distrib_dp(tmpdir): # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # run test set new_trainer = Trainer(**trainer_options) - results = new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model) pretrained_model.cpu() - # test we have good test accuracy - acc = results[0]['test_acc'] - assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}" - dataloaders = model.test_dataloader() if not isinstance(dataloaders, list): dataloaders = [dataloaders] @@ -207,8 +266,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): """Verify `test()` on pretrained model.""" tutils.set_random_master_port() - - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -219,8 +278,8 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=2, + limit_val_batches=2, callbacks=[checkpoint], logger=logger, gpus=[0, 1], @@ -230,33 +289,32 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir))) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # run test set new_trainer = Trainer(**trainer_options) - results = new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model) pretrained_model.cpu() - acc = results[0]['test_acc'] - assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. 
test_acc = {acc}" - - dataloaders = model.test_dataloader() + dataloaders = dm.test_dataloader() if not isinstance(dataloaders, list): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader) + tpipes.run_prediction(pretrained_model, dataloader, min_acc=0.1) def test_running_test_pretrained_model_cpu(tmpdir): """Verify test() on pretrained model.""" - model = EvalModelTemplate() + tutils.reset_seed() + dm = ClassifDataModule() + model = ClassificationModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -266,9 +324,10 @@ def test_running_test_pretrained_model_cpu(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, - max_epochs=3, - limit_train_batches=0.4, - limit_val_batches=0.2, + max_epochs=2, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, callbacks=[checkpoint], logger=logger, default_root_dir=tmpdir, @@ -276,31 +335,32 @@ def test_running_test_pretrained_model_cpu(tmpdir): # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) new_trainer = Trainer(**trainer_options) - new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model, datamodule=dm) # test we have good test accuracy - tutils.assert_ok_model_acc(new_trainer) + tutils.assert_ok_model_acc(new_trainer, key='test_acc', thr=0.45) -@pytest.mark.parametrize('model_template', [EvalModelTemplate, GenericEvalModelTemplate]) +@pytest.mark.parametrize('model_template', [ValTestLossBoringModel, GenericValTestLossBoringModel]) def test_load_model_from_checkpoint(tmpdir, model_template): """Verify test() on pretrained model.""" - hparams = model_template.get_default_hparams() - model = model_template(**hparams) + tutils.reset_seed() + model = model_template() trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, - callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)], + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='val_loss', save_top_k=-1)], default_root_dir=tmpdir, ) @@ -315,7 +375,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template): # load last checkpoint last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1] - # Since `EvalModelTemplate` has `_save_hparams = True` by default, check that ckpt has hparams + # Since `BoringModel` has `_save_hparams = True` by default, check that ckpt has hparams ckpt = torch.load(last_checkpoint) assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints' @@ -323,8 +383,8 @@ def test_load_model_from_checkpoint(tmpdir, model_template): pretrained_model = model_template.load_from_checkpoint(last_checkpoint) # test that hparams loaded correctly - for k, v in hparams.items(): - assert getattr(pretrained_model, k) == v + for k, v in model.hparams.items(): + assert getattr(pretrained_model.hparams, k) == v # assert weights are the same for (old_name, old_p), (new_name, new_p) in zip(model.named_parameters(), 
pretrained_model.named_parameters()): @@ -334,15 +394,11 @@ def test_load_model_from_checkpoint(tmpdir, model_template): new_trainer = Trainer(**trainer_options) new_trainer.test(pretrained_model) - # test we have good test accuracy - tutils.assert_ok_model_acc(new_trainer) - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + model = BoringModel() trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir) @@ -355,7 +411,7 @@ def test_dp_resume(tmpdir): # add these to the trainer options trainer_options['logger'] = logger - trainer_options['checkpoint_callback'] = checkpoint + trainer_options['callbacks'] = [checkpoint] # fit model trainer = Trainer(**trainer_options) @@ -377,31 +433,38 @@ def test_dp_resume(tmpdir): # init new trainer new_logger = tutils.get_default_logger(tmpdir, version=logger.version) trainer_options['logger'] = new_logger - trainer_options['checkpoint_callback'] = ModelCheckpoint(dirpath=tmpdir) + trainer_options['callbacks'] = [ModelCheckpoint(dirpath=tmpdir)] trainer_options['limit_train_batches'] = 0.5 trainer_options['limit_val_batches'] = 0.2 trainer_options['max_epochs'] = 1 new_trainer = Trainer(**trainer_options) - # set the epoch start hook so we can predict before the model does the full training - def assert_good_acc(): - assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0 + class CustomModel(BoringModel): - # if model and state loaded correctly, predictions will be good even though we - # haven't trained with the new loaded model - dp_model = new_trainer.model - dp_model.eval() - dp_model.module.module.running_stage = RunningStage.EVALUATING + def __init__(self): + super().__init__() + self.on_train_start_called = False - dataloader = trainer.train_dataloader - tpipes.run_prediction(dp_model, dataloader, dp=True) + # set the epoch start hook so we can predict before the model does the full training + def on_train_start(self): + assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0 + + # if model and state loaded correctly, predictions will be good even though we + # haven't trained with the new loaded model + dp_model = new_trainer.model + dp_model.eval() + dp_model.module.module.running_stage = RunningStage.EVALUATING + + dataloader = self.train_dataloader() + tpipes.run_prediction(self.trainer.get_model(), dataloader) + self.on_train_start_called = True # new model - model = EvalModelTemplate(**hparams) - model.on_train_start = assert_good_acc + model = CustomModel() # fit new model which should load hpc weights new_trainer.fit(model) + assert model.on_train_start_called # test freeze on gpu model.freeze() @@ -410,7 +473,7 @@ def test_dp_resume(tmpdir): def test_model_saving_loading(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -418,6 +481,8 @@ def test_model_saving_loading(tmpdir): # fit model trainer = Trainer( max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], default_root_dir=tmpdir, @@ -432,16 +497,11 @@ def test_model_saving_loading(tmpdir): if not isinstance(dataloaders, list): dataloaders = 
[dataloaders] - for dataloader in dataloaders: - for batch in dataloader: - break - - x, y = batch - x = x.view(x.size(0), -1) + batch = next(iter(dataloaders[0])) # generate preds before saving model model.eval() - pred_before_saving = model(x) + pred_before_saving = model(batch) # save model new_weights_path = os.path.join(tmpdir, 'save_test.ckpt') @@ -450,7 +510,7 @@ def test_model_saving_loading(tmpdir): # load new model hparams_path = tutils.get_data_path(logger, path_dir=tmpdir) hparams_path = os.path.join(hparams_path, 'hparams.yaml') - model_2 = EvalModelTemplate.load_from_checkpoint( + model_2 = BoringModel.load_from_checkpoint( checkpoint_path=new_weights_path, hparams_file=hparams_path, ) @@ -458,7 +518,7 @@ def test_model_saving_loading(tmpdir): # make prediction # assert that both predictions are the same - new_pred = model_2(x) + new_pred = model_2(batch) assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 @@ -468,9 +528,9 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv('TORCH_HOME', tmpdir) - model = EvalModelTemplate() + model = BoringModel() # Extra layer - model.c_d3 = torch.nn.Linear(model.hidden_dim, model.hidden_dim) + model.c_d3 = torch.nn.Linear(32, 32) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -479,6 +539,8 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) @@ -496,14 +558,14 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' ckpt_path = hparams_url if url_ckpt else new_weights_path - EvalModelTemplate.load_from_checkpoint( + BoringModel.load_from_checkpoint( checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=False, ) with pytest.raises(RuntimeError, match=r'Unexpected key\(s\) in state_dict: "c_d3.weight", "c_d3.bias"'): - EvalModelTemplate.load_from_checkpoint( + BoringModel.load_from_checkpoint( checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=True, @@ -516,7 +578,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv('TORCH_HOME', tmpdir) - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -525,6 +587,8 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) @@ -542,7 +606,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' ckpt_path = hparams_url if url_ckpt else new_weights_path - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def __init__(self): super().__init__() @@ -563,6 +627,6 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c def test_model_pickle(tmpdir): - model = EvalModelTemplate() + model = BoringModel() pickle.dumps(model) cloudpickle.dumps(model) diff --git 
a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 98a02d730e..e5895d98b6 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,14 +19,14 @@ import pytest from torch.utils.data import DataLoader import tests.helpers.pipelines as tpipes +import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate -from tests.helpers.datasets import TrialMNIST +from tests.helpers import BoringModel, RandomDataset from tests.helpers.utils import pl_multi_process_test if _TPU_AVAILABLE: @@ -34,7 +34,7 @@ if _TPU_AVAILABLE: import torch_xla.distributed.xla_multiprocessing as xmp SERIAL_EXEC = xmp.MpSerialExecutor() -_LARGER_DATASET = TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)) +_LARGER_DATASET = RandomDataset(32, 2000) # 8 cores needs a big dataset @@ -42,20 +42,30 @@ def _serial_train_loader(): return DataLoader(_LARGER_DATASET, batch_size=32) +class SerialLoaderBoringModel(BoringModel): + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 2000), batch_size=32) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 2000), batch_size=32) + + @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=1, - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -64,16 +74,17 @@ def test_model_tpu_cores_1(tmpdir): @pl_multi_process_test def test_model_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=[tpu_core], - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' @@ -82,6 +93,7 @@ def test_model_tpu_index(tmpdir, tpu_core): @pl_multi_process_test def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -91,29 +103,27 @@ def test_model_tpu_cores_8(tmpdir): limit_val_batches=0.4, ) - model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _serial_train_loader - model.val_dataloader = _serial_train_loader - - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + model = SerialLoaderBoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05) @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( 
default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=1, - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -123,17 +133,18 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pl_multi_process_test def test_model_16bit_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=[tpu_core], - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=4, + limit_val_batches=2, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -143,6 +154,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): @pl_multi_process_test def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, @@ -153,26 +165,32 @@ def test_model_16bit_tpu_cores_8(tmpdir): limit_val_batches=0.4, ) - model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _serial_train_loader - model.val_dataloader = _serial_train_loader - - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + model = SerialLoaderBoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05) @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_early_stop(tmpdir): """Test if single TPU core training works""" - model = EvalModelTemplate() + + class CustomBoringModel(BoringModel): + + def validation_step(self, *args, **kwargs): + out = super().validation_step(*args, **kwargs) + self.log('val_loss', out['x']) + return out + + tutils.reset_seed() + model = CustomBoringModel() trainer = Trainer( - callbacks=[EarlyStopping()], + callbacks=[EarlyStopping(monitor='val_loss')], default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=50, - limit_train_batches=10, - limit_val_batches=10, + limit_train_batches=4, + limit_val_batches=4, tpu_cores=1, ) trainer.fit(model) @@ -182,6 +200,7 @@ def test_model_tpu_early_stop(tmpdir): @pl_multi_process_test def test_tpu_grad_norm(tmpdir): """Test if grad_norm works on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -192,7 +211,7 @@ def test_tpu_grad_norm(tmpdir): gradient_clip_val=0.1, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -201,7 +220,8 @@ def test_tpu_grad_norm(tmpdir): def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU""" - model = EvalModelTemplate() + tutils.reset_seed() + model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8) trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
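
For reference, the BoringModel-based pattern that this patch standardizes on looks roughly like the sketch below. It relies only on pieces visible in the diff above (tests.helpers.BoringModel, Trainer's default_root_dir and fast_dev_run arguments, trainer.fit); the test name is illustrative and not part of the patch.

    from pytorch_lightning import Trainer
    from tests.helpers import BoringModel


    def test_boring_model_smoke(tmpdir):
        # minimal BoringModel run: fast_dev_run executes a single train/val batch
        model = BoringModel()
        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
        trainer.fit(model)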