[tests/models] refactor with BoringModel (#5507)

* update with BoringModel

* update with BoringModel

* step

* try TPU

* TPU

* update tests

* update tpu tests

* self

* fix

* dp

* update tests

* ref

* update tests

* fix tpu tests

* fix dp and run_prediction

* dp

* only dp

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Rohit Gupta 2021-02-11 20:02:07 +05:30 committed by GitHub
parent b434c479e7
commit 8e9a026bc3
13 changed files with 397 additions and 303 deletions
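
The diffs below replace EvalModelTemplate-based tests with BoringModel, or with ClassificationModel plus ClassifDataModule where a real accuracy is asserted. As rough orientation, the refactored tests follow a pattern along these lines (a minimal sketch; fast_dev_run is an illustrative choice here, not taken from any specific hunk):

    # Sketch only: mirrors the BoringModel test pattern used throughout the diffs below.
    from pytorch_lightning import Trainer
    from tests.helpers import BoringModel

    def test_boring_model_cpu(tmpdir):
        model = BoringModel()
        # fast_dev_run is illustrative; the actual tests pin max_epochs / limit_*_batches
        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
        trainer.fit(model)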

View File

@ -18,13 +18,16 @@ from unittest.mock import MagicMock
import pytest
import torch
import torch.nn.functional as F
from pytorch_lightning import LightningDataModule, Trainer
from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.trainer.states import TrainerState
from tests.helpers import BoringDataModule, BoringModel
from tests.helpers.utils import reset_seed
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.simple_models import ClassificationModel
from tests.helpers.utils import reset_seed, set_random_master_port
def test_can_prepare_data(tmpdir):
@ -190,8 +193,8 @@ def test_dm_pickle_after_init(tmpdir):
def test_train_loop_only(tmpdir):
reset_seed()
dm = BoringDataModule()
model = BoringModel()
dm = ClassifDataModule()
model = ClassificationModel()
model.validation_step = None
model.validation_step_end = None
@ -207,18 +210,17 @@ def test_train_loop_only(tmpdir):
)
# fit model
result = trainer.fit(model, dm)
result = trainer.fit(model, datamodule=dm)
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
assert result
# TODO: add end-to-end test
# assert trainer.callback_metrics['loss'] < 0.6
assert trainer.callback_metrics['train_loss'] < 1.0
def test_train_val_loop_only(tmpdir):
reset_seed()
dm = BoringDataModule()
model = BoringModel()
dm = ClassifDataModule()
model = ClassificationModel()
model.validation_step = None
model.validation_step_end = None
@ -231,11 +233,10 @@ def test_train_val_loop_only(tmpdir):
)
# fit model
result = trainer.fit(model, dm)
result = trainer.fit(model, datamodule=dm)
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
assert result
# TODO: add end-to-end test
# assert trainer.callback_metrics['train_loss'] < 0.6
assert trainer.callback_metrics['train_loss'] < 1.0
def test_dm_checkpoint_save(tmpdir):
@ -294,8 +295,8 @@ def test_test_loop_only(tmpdir):
def test_full_loop(tmpdir):
reset_seed()
dm = BoringDataModule()
model = BoringModel()
dm = ClassifDataModule()
model = ClassificationModel()
trainer = Trainer(
default_root_dir=tmpdir,
@ -311,8 +312,7 @@ def test_full_loop(tmpdir):
# test
result = trainer.test(datamodule=dm)
# TODO: add end-to-end test
# assert result[0]['test_acc'] > 0.8
assert result[0]['test_acc'] > 0.6
def test_trainer_attached_to_dm(tmpdir):
@ -346,8 +346,8 @@ def test_trainer_attached_to_dm(tmpdir):
def test_full_loop_single_gpu(tmpdir):
reset_seed()
dm = BoringDataModule()
model = BoringModel()
dm = ClassifDataModule()
model = ClassificationModel()
trainer = Trainer(
default_root_dir=tmpdir,
@ -364,16 +364,37 @@ def test_full_loop_single_gpu(tmpdir):
# test
result = trainer.test(datamodule=dm)
# TODO: add end-to-end test
# assert result[0]['test_acc'] > 0.8
assert result[0]['test_acc'] > 0.6
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_full_loop_dp(tmpdir):
reset_seed()
set_random_master_port()
dm = BoringDataModule()
model = BoringModel()
class CustomClassificationModelDP(ClassificationModel):
def _step(self, batch, batch_idx):
x, y = batch
logits = self(x)
return {'logits': logits, 'y': y}
def training_step(self, batch, batch_idx):
_, y = batch
out = self._step(batch, batch_idx)
out['loss'] = F.cross_entropy(out['logits'], y)
return out
def validation_step(self, batch, batch_idx):
return self._step(batch, batch_idx)
def test_step(self, batch, batch_idx):
return self._step(batch, batch_idx)
def test_step_end(self, outputs):
self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))
dm = ClassifDataModule()
model = CustomClassificationModelDP()
trainer = Trainer(
default_root_dir=tmpdir,
@ -385,14 +406,13 @@ def test_full_loop_dp(tmpdir):
)
# fit model
result = trainer.fit(model, dm)
result = trainer.fit(model, datamodule=dm)
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
assert result
# test
result = trainer.test(datamodule=dm)
# TODO: add end-to-end test
# assert result[0]['test_acc'] > 0.8
assert result[0]['test_acc'] > 0.6
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")

View File

@ -102,9 +102,9 @@ def run_model_test(
def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
if isinstance(trained_model, BoringModel):
return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc)
return _boring_model_run_prediction(trained_model, dataloader, min_acc)
else:
return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc)
return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc=min_acc)
def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
@ -135,11 +135,15 @@ def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min
assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
# TODO: This test compares a loss value with a min accuracy - complete non-sense!
# create BoringModels that make actual predictions!
def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25):
# run prediction on 1 batch
trained_model.cpu()
batch = next(iter(dataloader))
with torch.no_grad():
output = trained_model(batch)
acc = trained_model.loss(batch, output)
acc = trained_model.loss(batch, output)
assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}"
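
For orientation, other tests in this commit call the reworked helper roughly like this (a sketch only; the model, the trainer options, and the min_acc value are assumptions, although min_acc=0.1 does appear in the restore tests further down):

    # Sketch of invoking run_prediction after this change; only the helper call
    # mirrors usage found later in this commit.
    import tests.helpers.pipelines as tpipes
    from pytorch_lightning import Trainer
    from tests.helpers import BoringModel

    def test_prediction_helper_sketch(tmpdir):
        model = BoringModel()
        Trainer(default_root_dir=tmpdir, fast_dev_run=True).fit(model)
        # dispatches to _boring_model_run_prediction for a BoringModel instance
        tpipes.run_prediction(model, model.train_dataloader(), min_acc=0.1)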

View File

@ -51,18 +51,21 @@ class ClassificationModel(LightningModule):
x, y = batch
logits = self.forward(x)
loss = F.cross_entropy(logits, y)
self.log('train_Acc', self.train_acc(logits, y), prog_bar=True)
self.log('train_loss', loss, prog_bar=True)
self.log('train_acc', self.train_acc(logits, y), prog_bar=True)
return {"loss": loss}
def validation_step(self, batch, batch_idx):
x, y = batch
logits = self.forward(x)
self.log('valid_Acc', self.valid_acc(logits, y), prog_bar=True)
self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
self.log('val_acc', self.valid_acc(logits, y), prog_bar=True)
def test_step(self, batch, batch_idx):
x, y = batch
logits = self.forward(x)
self.log('test_Acc', self.test_acc(logits, y), prog_bar=True)
self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
self.log('test_acc', self.test_acc(logits, y), prog_bar=True)
class RegressionModel(LightningModule):
@ -98,15 +101,18 @@ class RegressionModel(LightningModule):
x, y = batch
out = self.forward(x)
loss = F.mse_loss(out, y)
self.log('train_loss', loss, prog_bar=False)
self.log('train_MSE', self.train_mse(out, y), prog_bar=True)
return {"loss": loss}
def validation_step(self, batch, batch_idx):
x, y = batch
out = self.forward(x)
self.log('valid_MSE', self.valid_mse(out, y), prog_bar=True)
self.log('val_loss', F.mse_loss(out, y), prog_bar=False)
self.log('val_MSE', self.valid_mse(out, y), prog_bar=True)
def test_step(self, batch, batch_idx):
x, y = batch
out = self.forward(x)
self.log('test_loss', F.mse_loss(out, y), prog_bar=False)
self.log('test_MSE', self.test_mse(out, y), prog_bar=True)

View File

@ -36,7 +36,7 @@ if _HOROVOD_AVAILABLE:
else:
print('You requested to import Horovod which is missing or not supported for your OS.')
from tests.base import EvalModelTemplate # noqa: E402
from tests.helpers import BoringModel # noqa: E402
from tests.helpers.pipelines import run_prediction # noqa: E402
from tests.helpers.utils import reset_seed, set_random_master_port # noqa: E402
@ -53,7 +53,7 @@ def run_test_from_config(trainer_options):
ckpt_path = trainer_options['weights_save_path']
trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])
model = EvalModelTemplate()
model = BoringModel()
trainer = Trainer(**trainer_options)
trainer.fit(model)
@ -66,7 +66,7 @@ def run_test_from_config(trainer_options):
return
# test model loading
pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# test new model accuracy
test_loaders = model.test_dataloader()

View File

@ -24,7 +24,7 @@ from pytorch_lightning import Trainer
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _APEX_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel
@pytest.mark.skip(reason='dp + amp not supported currently') # TODO
@ -41,7 +41,7 @@ def test_amp_single_gpu_dp(tmpdir):
precision=16,
)
model = EvalModelTemplate()
model = BoringModel()
# tutils.run_model_test(trainer_options, model)
trainer.fit(model)
@ -60,7 +60,7 @@ def test_amp_single_gpu_ddp_spawn(tmpdir):
precision=16,
)
model = EvalModelTemplate()
model = BoringModel()
# tutils.run_model_test(trainer_options, model)
trainer.fit(model)
@ -81,7 +81,7 @@ def test_amp_multi_gpu_dp(tmpdir):
precision=16,
)
model = EvalModelTemplate()
model = BoringModel()
# tutils.run_model_test(trainer_options, model)
trainer.fit(model)
@ -100,7 +100,7 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir):
precision=16,
)
model = EvalModelTemplate()
model = BoringModel()
# tutils.run_model_test(trainer_options, model)
trainer.fit(model)
@ -108,13 +108,13 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir):
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@mock.patch.dict(os.environ, {"SLURM_LOCALID": "0"})
def test_amp_gpu_ddp_slurm_managed(tmpdir):
"""Make sure DDP + AMP work."""
# simulate setting slurm flags
tutils.set_random_master_port()
os.environ['SLURM_LOCALID'] = str(0)
model = EvalModelTemplate()
model = BoringModel()
# exp file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -156,7 +156,7 @@ def test_cpu_model_with_amp(tmpdir):
precision=16,
)
model = EvalModelTemplate()
model = BoringModel()
with pytest.raises((MisconfigurationException, ModuleNotFoundError)):
tpipes.run_model_test(trainer_options, model, on_gpu=False)
@ -165,7 +165,7 @@ def test_cpu_model_with_amp(tmpdir):
@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
def test_amp_without_apex(tmpdir):
"""Check that even with apex amp type without requesting precision=16 the amp backend is void."""
model = EvalModelTemplate()
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir,
@ -190,19 +190,24 @@ def test_amp_without_apex(tmpdir):
def test_amp_with_apex(tmpdir):
"""Check calling apex scaling in training."""
class CustomModel(EvalModelTemplate):
class CustomModel(BoringModel):
def training_step(self, batch, batch_idx, optimizer_idx):
return super().training_step(batch, batch_idx)
def configure_optimizers(self):
optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate)
optimizer2 = optim.SGD(self.parameters(), lr=self.learning_rate)
optimizer1 = optim.Adam(self.parameters(), lr=0.01)
optimizer2 = optim.SGD(self.parameters(), lr=0.01)
lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1)
lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1)
return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2]
model = CustomModel()
model.training_epoch_end = None
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
max_steps=5,
precision=16,
amp_backend='apex',
gpus=1,
@ -210,7 +215,7 @@ def test_amp_with_apex(tmpdir):
assert str(trainer.amp_backend) == "AMPType.APEX"
trainer.fit(model)
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
assert trainer.dev_debugger.count_events('AMP') == 20
assert trainer.dev_debugger.count_events('AMP') == 10
assert isinstance(trainer.lr_schedulers[0]['scheduler'].optimizer, optim.Adam)
assert isinstance(trainer.lr_schedulers[1]['scheduler'].optimizer, optim.SGD)

View File

@ -23,7 +23,6 @@ import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint
from pytorch_lightning.trainer.states import TrainerState
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.simple_models import ClassificationModel
@ -101,10 +100,12 @@ def test_early_stopping_cpu_model(tmpdir):
class ModelTrainVal(BoringModel):
def validation_epoch_end(self, outputs) -> None:
val_loss = torch.stack([x["x"] for x in outputs]).mean()
self.log('val_loss', val_loss)
def validation_step(self, *args, **kwargs):
output = super().validation_step(*args, **kwargs)
self.log('val_loss', output['x'])
return output
tutils.reset_seed()
stopping = EarlyStopping(monitor="val_loss", min_delta=0.1)
trainer_options = dict(
callbacks=[stopping],
@ -198,13 +199,15 @@ def test_running_test_after_fitting(tmpdir):
class ModelTrainValTest(BoringModel):
def validation_epoch_end(self, outputs) -> None:
val_loss = torch.stack([x["x"] for x in outputs]).mean()
self.log('val_loss', val_loss)
def validation_step(self, *args, **kwargs):
output = super().validation_step(*args, **kwargs)
self.log('val_loss', output['x'])
return output
def test_epoch_end(self, outputs) -> None:
test_loss = torch.stack([x["y"] for x in outputs]).mean()
self.log('test_loss', test_loss)
def test_step(self, *args, **kwargs):
output = super().test_step(*args, **kwargs)
self.log('test_loss', output['y'])
return output
model = ModelTrainValTest()
@ -244,9 +247,10 @@ def test_running_test_no_val(tmpdir):
def val_dataloader(self):
pass
def test_epoch_end(self, outputs) -> None:
test_loss = torch.stack([x["y"] for x in outputs]).mean()
self.log('test_loss', test_loss)
def test_step(self, *args, **kwargs):
output = super().test_step(*args, **kwargs)
self.log('test_loss', output['y'])
return output
model = ModelTrainTest()
@ -297,15 +301,10 @@ def test_simple_cpu(tmpdir):
def test_cpu_model(tmpdir):
"""Make sure model trains on CPU."""
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=1,
limit_train_batches=0.4,
limit_val_batches=0.4
default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=4, limit_val_batches=4
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False)

View File

@ -20,11 +20,11 @@ import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.trainer.states import TrainerState
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel
from tests.helpers.utils import reset_seed
class ModelWithManualGradTracker(EvalModelTemplate):
class ModelWithManualGradTracker(BoringModel):
def __init__(self, norm_type, *args, **kwargs):
super().__init__(*args, **kwargs)
@ -36,9 +36,9 @@ class ModelWithManualGradTracker(EvalModelTemplate):
def training_step(self, batch, batch_idx, optimizer_idx=None):
# just return a loss, no log or progress bar meta
x, y = batch
loss_val = self.loss(y, self(x.flatten(1, -1)))
return {'loss': loss_val}
output = self(batch)
loss = self.loss(batch, output)
return {'loss': loss}
def on_after_backward(self):
out, norms = {}, []
@ -102,7 +102,7 @@ def test_grad_tracking_interval(tmpdir, log_every_n_steps):
)
with patch.object(trainer.logger, "log_metrics") as mocked:
model = EvalModelTemplate()
model = BoringModel()
trainer.fit(model)
expected = trainer.global_step // log_every_n_steps
grad_norm_dicts = []

View File

@ -21,14 +21,13 @@ import torch
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator
from pytorch_lightning.trainer.states import TrainerState
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel, RandomDataset
@pytest.mark.parametrize('max_steps', [1, 2, 3])
def test_on_before_zero_grad_called(tmpdir, max_steps):
class CurrentTestModel(EvalModelTemplate):
class CurrentTestModel(BoringModel):
on_before_zero_grad_called = 0
def on_before_zero_grad(self, optimizer):
@ -40,7 +39,6 @@ def test_on_before_zero_grad_called(tmpdir, max_steps):
default_root_dir=tmpdir,
max_steps=max_steps,
max_epochs=2,
num_sanity_val_steps=5,
)
assert 0 == model.on_before_zero_grad_called
trainer.fit(model)
@ -55,23 +53,24 @@ def test_training_epoch_end_metrics_collection(tmpdir):
""" Test that progress bar metrics also get collected at the end of an epoch. """
num_epochs = 3
class CurrentModel(EvalModelTemplate):
class CurrentModel(BoringModel):
def training_step(self, *args, **kwargs):
output = super().training_step(*args, **kwargs)
output['progress_bar'].update({'step_metric': torch.tensor(-1)})
output['progress_bar'].update({'shared_metric': 100})
self.log_dict({'step_metric': torch.tensor(-1), 'shared_metric': 100}, logger=False, prog_bar=True)
return output
def training_epoch_end(self, outputs):
epoch = self.current_epoch
# both scalar tensors and Python numbers are accepted
return {
'progress_bar': {
f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch
'shared_metric': 111,
}
}
self.log_dict(
{
f'epoch_metric_{epoch}': torch.tensor(epoch),
'shared_metric': 111
},
logger=False,
prog_bar=True,
)
model = CurrentModel()
trainer = Trainer(
@ -103,7 +102,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir):
def on_train_epoch_end(self, trainer, pl_module, outputs):
self.len_outputs = len(outputs[0])
class OverriddenModel(EvalModelTemplate):
class OverriddenModel(BoringModel):
def on_train_epoch_start(self):
self.num_train_batches = 0
@ -114,7 +113,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir):
def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
self.num_train_batches += 1
class NotOverriddenModel(EvalModelTemplate):
class NotOverriddenModel(BoringModel):
def on_train_epoch_start(self):
self.num_train_batches = 0
@ -124,6 +123,7 @@ def test_training_epoch_end_metrics_collection_on_override(tmpdir):
overridden_model = OverriddenModel()
not_overridden_model = NotOverriddenModel()
not_overridden_model.training_epoch_end = None
callback = LoggingCallback()
trainer = Trainer(
@ -152,7 +152,7 @@ def test_transfer_batch_hook():
self.samples = data[0]
self.targets = data[1]
class CurrentTestModel(EvalModelTemplate):
class CurrentTestModel(BoringModel):
hook_called = False
@ -166,7 +166,7 @@ def test_transfer_batch_hook():
return data
model = CurrentTestModel()
batch = CustomBatch((torch.zeros(5, 28), torch.ones(5, 1, dtype=torch.long)))
batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long)))
trainer = Trainer(gpus=1)
trainer.accelerator_backend = GPUAccelerator(trainer)
@ -226,7 +226,7 @@ def test_transfer_batch_hook_ddp(tmpdir):
@pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)])
def test_on_train_batch_start_hook(max_epochs, batch_idx_):
class CurrentModel(EvalModelTemplate):
class CurrentModel(BoringModel):
def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
if batch_idx == batch_idx_:

View File

@ -30,9 +30,8 @@ from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAcc
from pytorch_lightning.metrics.classification.accuracy import Accuracy
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel
from tests.helpers.advanced_models import BasicGAN
from tests.helpers.boring_model import BoringModel
if _HOROVOD_AVAILABLE:
import horovod
@ -173,22 +172,17 @@ def test_horovod_amp(tmpdir):
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_horovod_transfer_batch_to_gpu(tmpdir):
class TestTrainingStepModel(EvalModelTemplate):
class TestTrainingStepModel(BoringModel):
def training_step(self, batch, *args, **kwargs):
x, y = batch
assert str(x.device) != 'cpu'
assert str(y.device) != 'cpu'
assert str(batch.device) != 'cpu'
return super(TestTrainingStepModel, self).training_step(batch, *args, **kwargs)
def validation_step(self, batch, *args, **kwargs):
x, y = batch
assert str(x.device) != 'cpu'
assert str(y.device) != 'cpu'
assert str(batch.device) != 'cpu'
return super(TestTrainingStepModel, self).validation_step(batch, *args, **kwargs)
hparams = EvalModelTemplate.get_default_hparams()
model = TestTrainingStepModel(**hparams)
model = TestTrainingStepModel()
trainer_options = dict(
default_root_dir=str(tmpdir),
@ -205,7 +199,7 @@ def test_horovod_transfer_batch_to_gpu(tmpdir):
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
def test_horovod_multi_optimizer(tmpdir):
model = BasicGAN(**EvalModelTemplate.get_default_hparams())
model = BasicGAN()
# fit model
trainer = Trainer(
@ -342,8 +336,7 @@ def test_accuracy_metric_horovod():
# @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
# def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir):
# hparams = EvalModelTemplate.get_default_hparams()
# model = EvalModelTemplate(**hparams)
# model = BoringModel()
# model.configure_optimizers = model.configure_optimizers__multiple_schedulers
#
# num_workers = 8

View File

@ -21,15 +21,13 @@ import pytest
import torch
from fsspec.implementations.local import LocalFileSystem
from omegaconf import Container, OmegaConf
from torch.nn import functional as F
from torch.utils.data import DataLoader
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml
from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, AttributeDict, is_picklable
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel, TrialMNIST
from tests.helpers import BoringModel, RandomDataset
if _HYDRA_EXPERIMENTAL_AVAILABLE:
from hydra.experimental import compose, initialize
@ -162,7 +160,7 @@ def test_explicit_args_hparams(tmpdir):
"""
# define model
class LocalModel(EvalModelTemplate):
class LocalModel(BoringModel):
def __init__(self, test_arg, test_arg2):
super().__init__()
@ -184,7 +182,7 @@ def test_implicit_args_hparams(tmpdir):
"""
# define model
class LocalModel(EvalModelTemplate):
class LocalModel(BoringModel):
def __init__(self, test_arg, test_arg2):
super().__init__()
@ -206,7 +204,7 @@ def test_explicit_missing_args_hparams(tmpdir):
"""
# define model
class LocalModel(EvalModelTemplate):
class LocalModel(BoringModel):
def __init__(self, test_arg, test_arg2):
super().__init__()
@ -269,7 +267,14 @@ def test_class_nesting():
A().test()
class SubClassEvalModel(EvalModelTemplate):
class CustomBoringModel(BoringModel):
def __init__(self, batch_size=64):
super().__init__()
self.save_hyperparameters()
class SubClassBoringModel(CustomBoringModel):
any_other_loss = torch.nn.CrossEntropyLoss()
def __init__(self, *args, subclass_arg=1200, **kwargs):
@ -277,18 +282,18 @@ class SubClassEvalModel(EvalModelTemplate):
self.save_hyperparameters()
class SubSubClassEvalModel(SubClassEvalModel):
class SubSubClassBoringModel(SubClassBoringModel):
pass
class AggSubClassEvalModel(SubClassEvalModel):
class AggSubClassBoringModel(SubClassBoringModel):
def __init__(self, *args, my_loss=torch.nn.CrossEntropyLoss(), **kwargs):
super().__init__(*args, **kwargs)
self.save_hyperparameters()
class UnconventionalArgsEvalModel(EvalModelTemplate):
class UnconventionalArgsBoringModel(CustomBoringModel):
""" A model that has unconventional names for "self", "*args" and "**kwargs". """
def __init__(obj, *more_args, other_arg=300, **more_kwargs):
@ -297,7 +302,7 @@ class UnconventionalArgsEvalModel(EvalModelTemplate):
obj.save_hyperparameters()
class DictConfSubClassEvalModel(SubClassEvalModel):
class DictConfSubClassBoringModel(SubClassBoringModel):
def __init__(self, *args, dict_conf=OmegaConf.create(dict(my_param='something')), **kwargs):
super().__init__(*args, **kwargs)
@ -306,31 +311,31 @@ class DictConfSubClassEvalModel(SubClassEvalModel):
@pytest.mark.parametrize(
"cls", [
EvalModelTemplate,
SubClassEvalModel,
SubSubClassEvalModel,
AggSubClassEvalModel,
UnconventionalArgsEvalModel,
DictConfSubClassEvalModel,
CustomBoringModel,
SubClassBoringModel,
SubSubClassBoringModel,
AggSubClassBoringModel,
UnconventionalArgsBoringModel,
DictConfSubClassBoringModel,
]
)
def test_collect_init_arguments(tmpdir, cls):
""" Test that the model automatically saves the arguments passed into the constructor """
extra_args = {}
if cls is AggSubClassEvalModel:
if cls is AggSubClassBoringModel:
extra_args.update(my_loss=torch.nn.CosineEmbeddingLoss())
elif cls is DictConfSubClassEvalModel:
elif cls is DictConfSubClassBoringModel:
extra_args.update(dict_conf=OmegaConf.create(dict(my_param='anything')))
model = cls(**extra_args)
assert model.hparams.batch_size == 32
assert model.hparams.batch_size == 64
model = cls(batch_size=179, **extra_args)
assert model.hparams.batch_size == 179
if isinstance(model, SubClassEvalModel):
if isinstance(model, SubClassBoringModel):
assert model.hparams.subclass_arg == 1200
if isinstance(model, AggSubClassEvalModel):
if isinstance(model, AggSubClassBoringModel):
assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss)
# verify that the checkpoint saved the correct values
@ -347,10 +352,10 @@ def test_collect_init_arguments(tmpdir, cls):
model = cls.load_from_checkpoint(raw_checkpoint_path)
assert model.hparams.batch_size == 179
if isinstance(model, AggSubClassEvalModel):
if isinstance(model, AggSubClassBoringModel):
assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss)
if isinstance(model, DictConfSubClassEvalModel):
if isinstance(model, DictConfSubClassBoringModel):
assert isinstance(model.hparams.dict_conf, Container)
assert model.hparams.dict_conf['my_param'] == 'anything'
@ -368,7 +373,7 @@ def _raw_checkpoint_path(trainer) -> str:
return raw_checkpoint_path
class LocalVariableModelSuperLast(EvalModelTemplate):
class LocalVariableModelSuperLast(BoringModel):
""" This model has the super().__init__() call at the end. """
def __init__(self, arg1, arg2, *args, **kwargs):
@ -378,7 +383,7 @@ class LocalVariableModelSuperLast(EvalModelTemplate):
super().__init__(*args, **kwargs) # this is intentionally here at the end
class LocalVariableModelSuperFirst(EvalModelTemplate):
class LocalVariableModelSuperFirst(BoringModel):
""" This model has the _auto_collect_arguments() call at the end. """
def __init__(self, arg1, arg2, *args, **kwargs):
@ -429,16 +434,17 @@ def test_collect_init_arguments_with_local_vars(cls):
# assert model.hparams.my_arg == 42
class AnotherArgModel(EvalModelTemplate):
class AnotherArgModel(BoringModel):
def __init__(self, arg1):
super().__init__()
self.save_hyperparameters(arg1)
class OtherArgsModel(EvalModelTemplate):
class OtherArgsModel(BoringModel):
def __init__(self, arg1, arg2):
super().__init__()
self.save_hyperparameters(arg1, arg2)
@ -457,7 +463,7 @@ def test_single_config_models_fail(tmpdir, cls, config):
@pytest.mark.parametrize("past_key", ['module_arguments'])
def test_load_past_checkpoint(tmpdir, past_key):
model = EvalModelTemplate()
model = CustomBoringModel()
# verify we can train
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)
@ -474,7 +480,7 @@ def test_load_past_checkpoint(tmpdir, past_key):
torch.save(raw_checkpoint, raw_checkpoint_path)
# verify that model loads correctly
model2 = EvalModelTemplate.load_from_checkpoint(raw_checkpoint_path)
model2 = CustomBoringModel.load_from_checkpoint(raw_checkpoint_path)
assert model2.hparams.batch_size == -17
@ -486,7 +492,7 @@ def test_hparams_pickle(tmpdir):
assert ad == pickle.loads(pkl)
class UnpickleableArgsEvalModel(EvalModelTemplate):
class UnpickleableArgsBoringModel(BoringModel):
""" A model that has an attribute that cannot be pickled. """
def __init__(self, foo='bar', pickle_me=(lambda x: x + 1), **kwargs):
@ -496,7 +502,7 @@ class UnpickleableArgsEvalModel(EvalModelTemplate):
def test_hparams_pickle_warning(tmpdir):
model = UnpickleableArgsEvalModel()
model = UnpickleableArgsBoringModel()
trainer = Trainer(default_root_dir=tmpdir, max_steps=1)
with pytest.warns(UserWarning, match="attribute 'pickle_me' removed from hparams because it cannot be pickled"):
trainer.fit(model)
@ -522,38 +528,15 @@ def test_hparams_save_yaml(tmpdir):
assert load_hparams_from_yaml(path_yaml) == hparams
class NoArgsSubClassEvalModel(EvalModelTemplate):
class NoArgsSubClassBoringModel(CustomBoringModel):
def __init__(self):
super().__init__()
class SimpleNoArgsModel(LightningModule):
def __init__(self):
super().__init__()
self.l1 = torch.nn.Linear(28 * 28, 10)
def forward(self, x):
return torch.relu(self.l1(x.view(x.size(0), -1)))
def training_step(self, batch, batch_nb):
x, y = batch
loss = F.cross_entropy(self(x), y)
return {'loss': loss, 'log': {'train_loss': loss}}
def test_step(self, batch, batch_nb):
x, y = batch
loss = F.cross_entropy(self(x), y)
return {'loss': loss, 'log': {'train_loss': loss}}
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=0.02)
@pytest.mark.parametrize("cls", [
SimpleNoArgsModel,
NoArgsSubClassEvalModel,
BoringModel,
NoArgsSubClassBoringModel,
])
def test_model_nohparams_train_test(tmpdir, cls):
"""Test models that do not tae any argument in init."""
@ -564,20 +547,20 @@ def test_model_nohparams_train_test(tmpdir, cls):
default_root_dir=tmpdir,
)
train_loader = DataLoader(TrialMNIST(os.getcwd(), train=True, download=True), batch_size=32)
train_loader = DataLoader(RandomDataset(32, 64), batch_size=32)
trainer.fit(model, train_loader)
test_loader = DataLoader(TrialMNIST(os.getcwd(), train=False, download=True), batch_size=32)
test_loader = DataLoader(RandomDataset(32, 64), batch_size=32)
trainer.test(test_dataloaders=test_loader)
def test_model_ignores_non_exist_kwargument(tmpdir):
"""Test that the model takes only valid class arguments."""
class LocalModel(EvalModelTemplate):
class LocalModel(BoringModel):
def __init__(self, batch_size=15):
super().__init__(batch_size=batch_size)
super().__init__()
self.save_hyperparameters()
model = LocalModel()
@ -593,11 +576,11 @@ def test_model_ignores_non_exist_kwargument(tmpdir):
assert 'non_exist_kwarg' not in model.hparams
class SuperClassPositionalArgs(EvalModelTemplate):
class SuperClassPositionalArgs(BoringModel):
def __init__(self, hparams):
super().__init__()
self._hparams = None # pretend EvalModelTemplate did not call self.save_hyperparameters()
self._hparams = None # pretend BoringModel did not call self.save_hyperparameters()
self.hparams = hparams

View File

@ -21,14 +21,13 @@ import torch
import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from tests.base import EvalModelTemplate
from tests.helpers import BoringModel
def test_model_saves_with_input_sample(tmpdir):
"""Test that ONNX model saves with input sample and size is greater than 3 MB"""
model = BoringModel()
trainer = Trainer(max_epochs=1)
trainer = Trainer(fast_dev_run=True)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
@ -42,7 +41,7 @@ def test_model_saves_with_input_sample(tmpdir):
def test_model_saves_on_gpu(tmpdir):
"""Test that model saves on gpu"""
model = BoringModel()
trainer = Trainer(gpus=1, max_epochs=1)
trainer = Trainer(gpus=1, fast_dev_run=True)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
@ -55,7 +54,7 @@ def test_model_saves_on_gpu(tmpdir):
def test_model_saves_with_example_output(tmpdir):
"""Test that ONNX model saves when provided with example output"""
model = BoringModel()
trainer = Trainer(max_epochs=1)
trainer = Trainer(fast_dev_run=True)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
@ -92,9 +91,10 @@ def test_model_saves_on_multi_gpu(tmpdir):
progress_bar_refresh_rate=0,
)
model = EvalModelTemplate()
model = BoringModel()
model.example_input_array = torch.randn(5, 32)
tpipes.run_model_test(trainer_options, model)
tpipes.run_model_test(trainer_options, model, min_acc=0.08)
file_path = os.path.join(tmpdir, "model.onnx")
model.to_onnx(file_path)
@ -130,7 +130,7 @@ def test_if_inference_output_is_valid(tmpdir):
model = BoringModel()
model.example_input_array = torch.randn(5, 32)
trainer = Trainer(max_epochs=2)
trainer = Trainer(fast_dev_run=True)
trainer.fit(model)
model.eval()

View File

@ -16,18 +16,21 @@ import logging as log
import os
import pickle
from copy import deepcopy
from typing import Generic, TypeVar
import cloudpickle
import pytest
import torch
import torch.nn.functional as F
import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.trainer.states import RunningStage, TrainerState
from tests.base import EvalModelTemplate, GenericEvalModelTemplate
from tests.helpers import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.simple_models import ClassificationModel
class ModelTrainerPropertyParity(Callback):
@ -52,14 +55,48 @@ class ModelTrainerPropertyParity(Callback):
self._check_properties(trainer, pl_module)
class ValTestLossBoringModel(BoringModel):
def __init__(self, batch_size=4):
super().__init__()
self.save_hyperparameters()
def validation_step(self, batch, batch_idx):
out = super().validation_step(batch, batch_idx)
self.log('val_loss', out['x'])
return out
def test_step(self, batch, batch_idx):
out = super().test_step(batch, batch_idx)
self.log('test_loss', out['y'])
return out
T = TypeVar('T')
class GenericParentValTestLossBoringModel(Generic[T], ValTestLossBoringModel):
def __init__(self, batch_size: int = 4):
super().__init__(batch_size=batch_size)
class GenericValTestLossBoringModel(GenericParentValTestLossBoringModel[int]):
pass
def test_model_properties_resume_from_checkpoint(tmpdir):
""" Test that properties like `current_epoch` and `global_step`
in model and trainer are always the same. """
model = EvalModelTemplate()
checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
"""
Test that properties like `current_epoch` and `global_step`
in model and trainer are always the same.
"""
model = BoringModel()
checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
trainer_args = dict(
default_root_dir=tmpdir,
max_epochs=1,
limit_train_batches=2,
limit_val_batches=2,
logger=False,
callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions
)
@ -73,18 +110,19 @@ def test_model_properties_resume_from_checkpoint(tmpdir):
def test_try_resume_from_non_existing_checkpoint(tmpdir):
""" Test that trying to resume from non-existing `resume_from_checkpoint` fail without error."""
model = BoringModel()
checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
dm = ClassifDataModule()
model = ClassificationModel()
checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
logger=False,
callbacks=[checkpoint_cb],
limit_train_batches=0.1,
limit_val_batches=0.1,
limit_train_batches=2,
limit_val_batches=2,
)
# Generate checkpoint `last.ckpt` with BoringModel
trainer.fit(model)
trainer.fit(model, datamodule=dm)
# `True` if resume/restore successfully else `False`
assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu)
assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu)
@ -99,11 +137,12 @@ class CaptureCallbacksBeforeTraining(Callback):
def test_callbacks_state_resume_from_checkpoint(tmpdir):
""" Test that resuming from a checkpoint restores callbacks that persist state. """
model = EvalModelTemplate()
dm = ClassifDataModule()
model = ClassificationModel()
callback_capture = CaptureCallbacksBeforeTraining()
def get_trainer_args():
checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
trainer_args = dict(
default_root_dir=tmpdir, max_steps=1, logger=False, callbacks=[
checkpoint,
@ -116,12 +155,12 @@ def test_callbacks_state_resume_from_checkpoint(tmpdir):
# initial training
trainer = Trainer(**get_trainer_args())
trainer.fit(model)
trainer.fit(model, datamodule=dm)
callbacks_before_resume = deepcopy(trainer.callbacks)
# resumed training
trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt"))
trainer.fit(model)
trainer.fit(model, datamodule=dm)
assert len(callbacks_before_resume) == len(callback_capture.callbacks)
@ -133,23 +172,24 @@ def test_callbacks_state_resume_from_checkpoint(tmpdir):
def test_callbacks_references_resume_from_checkpoint(tmpdir):
""" Test that resuming from a checkpoint sets references as expected. """
model = EvalModelTemplate()
dm = ClassifDataModule()
model = ClassificationModel()
args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False}
# initial training
checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
trainer = Trainer(**args, callbacks=[checkpoint])
assert checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
trainer.fit(model)
trainer.fit(model, datamodule=dm)
# resumed training
new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True)
# pass in a new checkpoint object, which should take
# precedence over the one in the last.ckpt file
trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt"))
assert checkpoint is not new_checkpoint
assert new_checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback
trainer.fit(model)
trainer.fit(model, datamodule=dm)
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@ -158,7 +198,30 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
tutils.set_random_master_port()
model = EvalModelTemplate()
class CustomClassificationModelDP(ClassificationModel):
def _step(self, batch, batch_idx):
x, y = batch
logits = self(x)
return {'logits': logits, 'y': y}
def training_step(self, batch, batch_idx):
_, y = batch
out = self._step(batch, batch_idx)
out['loss'] = F.cross_entropy(out['logits'], y)
return out
def validation_step(self, batch, batch_idx):
return self._step(batch, batch_idx)
def test_step(self, batch, batch_idx):
return self._step(batch, batch_idx)
def validation_step_end(self, outputs):
self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))
dm = ClassifDataModule()
model = CustomClassificationModelDP()
# exp file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -169,8 +232,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
trainer_options = dict(
progress_bar_refresh_rate=0,
max_epochs=2,
limit_train_batches=0.4,
limit_val_batches=0.2,
limit_train_batches=5,
limit_val_batches=5,
callbacks=[checkpoint],
logger=logger,
gpus=[0, 1],
@ -180,21 +243,17 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
# fit model
trainer = Trainer(**trainer_options)
trainer.fit(model)
trainer.fit(model, datamodule=dm)
# correct result and ok accuracy
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# run test set
new_trainer = Trainer(**trainer_options)
results = new_trainer.test(pretrained_model)
new_trainer.test(pretrained_model)
pretrained_model.cpu()
# test we have good test accuracy
acc = results[0]['test_acc']
assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}"
dataloaders = model.test_dataloader()
if not isinstance(dataloaders, list):
dataloaders = [dataloaders]
@ -207,8 +266,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
"""Verify `test()` on pretrained model."""
tutils.set_random_master_port()
model = EvalModelTemplate()
dm = ClassifDataModule()
model = ClassificationModel()
# exp file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -219,8 +278,8 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
trainer_options = dict(
progress_bar_refresh_rate=0,
max_epochs=2,
limit_train_batches=0.4,
limit_val_batches=0.2,
limit_train_batches=2,
limit_val_batches=2,
callbacks=[checkpoint],
logger=logger,
gpus=[0, 1],
@ -230,33 +289,32 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
# fit model
trainer = Trainer(**trainer_options)
trainer.fit(model)
trainer.fit(model, datamodule=dm)
log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir)))
# correct result and ok accuracy
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# run test set
new_trainer = Trainer(**trainer_options)
results = new_trainer.test(pretrained_model)
new_trainer.test(pretrained_model)
pretrained_model.cpu()
acc = results[0]['test_acc']
assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}"
dataloaders = model.test_dataloader()
dataloaders = dm.test_dataloader()
if not isinstance(dataloaders, list):
dataloaders = [dataloaders]
for dataloader in dataloaders:
tpipes.run_prediction(pretrained_model, dataloader)
tpipes.run_prediction(pretrained_model, dataloader, min_acc=0.1)
def test_running_test_pretrained_model_cpu(tmpdir):
"""Verify test() on pretrained model."""
model = EvalModelTemplate()
tutils.reset_seed()
dm = ClassifDataModule()
model = ClassificationModel()
# logger file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -266,9 +324,10 @@ def test_running_test_pretrained_model_cpu(tmpdir):
trainer_options = dict(
progress_bar_refresh_rate=0,
max_epochs=3,
limit_train_batches=0.4,
limit_val_batches=0.2,
max_epochs=2,
limit_train_batches=2,
limit_val_batches=2,
limit_test_batches=2,
callbacks=[checkpoint],
logger=logger,
default_root_dir=tmpdir,
@ -276,31 +335,32 @@ def test_running_test_pretrained_model_cpu(tmpdir):
# fit model
trainer = Trainer(**trainer_options)
trainer.fit(model)
trainer.fit(model, datamodule=dm)
# correct result and ok accuracy
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
new_trainer = Trainer(**trainer_options)
new_trainer.test(pretrained_model)
new_trainer.test(pretrained_model, datamodule=dm)
# test we have good test accuracy
tutils.assert_ok_model_acc(new_trainer)
tutils.assert_ok_model_acc(new_trainer, key='test_acc', thr=0.45)
@pytest.mark.parametrize('model_template', [EvalModelTemplate, GenericEvalModelTemplate])
@pytest.mark.parametrize('model_template', [ValTestLossBoringModel, GenericValTestLossBoringModel])
def test_load_model_from_checkpoint(tmpdir, model_template):
"""Verify test() on pretrained model."""
hparams = model_template.get_default_hparams()
model = model_template(**hparams)
tutils.reset_seed()
model = model_template()
trainer_options = dict(
progress_bar_refresh_rate=0,
max_epochs=2,
limit_train_batches=0.4,
limit_val_batches=0.2,
callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)],
limit_train_batches=2,
limit_val_batches=2,
limit_test_batches=2,
callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='val_loss', save_top_k=-1)],
default_root_dir=tmpdir,
)
@ -315,7 +375,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
# load last checkpoint
last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1]
# Since `EvalModelTemplate` has `_save_hparams = True` by default, check that ckpt has hparams
# Since `BoringModel` has `_save_hparams = True` by default, check that ckpt has hparams
ckpt = torch.load(last_checkpoint)
assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints'
@ -323,8 +383,8 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
pretrained_model = model_template.load_from_checkpoint(last_checkpoint)
# test that hparams loaded correctly
for k, v in hparams.items():
assert getattr(pretrained_model, k) == v
for k, v in model.hparams.items():
assert getattr(pretrained_model.hparams, k) == v
# assert weights are the same
for (old_name, old_p), (new_name, new_p) in zip(model.named_parameters(), pretrained_model.named_parameters()):
@ -334,15 +394,11 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
new_trainer = Trainer(**trainer_options)
new_trainer.test(pretrained_model)
# test we have good test accuracy
tutils.assert_ok_model_acc(new_trainer)
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_dp_resume(tmpdir):
"""Make sure DP continues training correctly."""
hparams = EvalModelTemplate.get_default_hparams()
model = EvalModelTemplate(**hparams)
model = BoringModel()
trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir)
@ -355,7 +411,7 @@ def test_dp_resume(tmpdir):
# add these to the trainer options
trainer_options['logger'] = logger
trainer_options['checkpoint_callback'] = checkpoint
trainer_options['callbacks'] = [checkpoint]
# fit model
trainer = Trainer(**trainer_options)
@ -377,31 +433,38 @@ def test_dp_resume(tmpdir):
# init new trainer
new_logger = tutils.get_default_logger(tmpdir, version=logger.version)
trainer_options['logger'] = new_logger
trainer_options['checkpoint_callback'] = ModelCheckpoint(dirpath=tmpdir)
trainer_options['callbacks'] = [ModelCheckpoint(dirpath=tmpdir)]
trainer_options['limit_train_batches'] = 0.5
trainer_options['limit_val_batches'] = 0.2
trainer_options['max_epochs'] = 1
new_trainer = Trainer(**trainer_options)
# set the epoch start hook so we can predict before the model does the full training
def assert_good_acc():
assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0
class CustomModel(BoringModel):
# if model and state loaded correctly, predictions will be good even though we
# haven't trained with the new loaded model
dp_model = new_trainer.model
dp_model.eval()
dp_model.module.module.running_stage = RunningStage.EVALUATING
def __init__(self):
super().__init__()
self.on_train_start_called = False
dataloader = trainer.train_dataloader
tpipes.run_prediction(dp_model, dataloader, dp=True)
# set the epoch start hook so we can predict before the model does the full training
def on_train_start(self):
assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0
# if model and state loaded correctly, predictions will be good even though we
# haven't trained with the new loaded model
dp_model = new_trainer.model
dp_model.eval()
dp_model.module.module.running_stage = RunningStage.EVALUATING
dataloader = self.train_dataloader()
tpipes.run_prediction(self.trainer.get_model(), dataloader)
self.on_train_start_called = True
# new model
model = EvalModelTemplate(**hparams)
model.on_train_start = assert_good_acc
model = CustomModel()
# fit new model which should load hpc weights
new_trainer.fit(model)
assert model.on_train_start_called
# test freeze on gpu
model.freeze()
@ -410,7 +473,7 @@ def test_dp_resume(tmpdir):
def test_model_saving_loading(tmpdir):
"""Tests use case where trainer saves the model, and user loads it from tags independently."""
model = EvalModelTemplate()
model = BoringModel()
# logger file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -418,6 +481,8 @@ def test_model_saving_loading(tmpdir):
# fit model
trainer = Trainer(
max_epochs=1,
limit_train_batches=2,
limit_val_batches=2,
logger=logger,
callbacks=[ModelCheckpoint(dirpath=tmpdir)],
default_root_dir=tmpdir,
@ -432,16 +497,11 @@ def test_model_saving_loading(tmpdir):
if not isinstance(dataloaders, list):
dataloaders = [dataloaders]
for dataloader in dataloaders:
for batch in dataloader:
break
x, y = batch
x = x.view(x.size(0), -1)
batch = next(iter(dataloaders[0]))
# generate preds before saving model
model.eval()
pred_before_saving = model(x)
pred_before_saving = model(batch)
# save model
new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
@ -450,7 +510,7 @@ def test_model_saving_loading(tmpdir):
# load new model
hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
hparams_path = os.path.join(hparams_path, 'hparams.yaml')
model_2 = EvalModelTemplate.load_from_checkpoint(
model_2 = BoringModel.load_from_checkpoint(
checkpoint_path=new_weights_path,
hparams_file=hparams_path,
)
@ -458,7 +518,7 @@ def test_model_saving_loading(tmpdir):
# make prediction
# assert that both predictions are the same
new_pred = model_2(x)
new_pred = model_2(batch)
assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1
@ -468,9 +528,9 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c
# set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
monkeypatch.setenv('TORCH_HOME', tmpdir)
model = EvalModelTemplate()
model = BoringModel()
# Extra layer
model.c_d3 = torch.nn.Linear(model.hidden_dim, model.hidden_dim)
model.c_d3 = torch.nn.Linear(32, 32)
# logger file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -479,6 +539,8 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
limit_train_batches=2,
limit_val_batches=2,
logger=logger,
callbacks=[ModelCheckpoint(dirpath=tmpdir)],
)
@ -496,14 +558,14 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c
hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}'
ckpt_path = hparams_url if url_ckpt else new_weights_path
EvalModelTemplate.load_from_checkpoint(
BoringModel.load_from_checkpoint(
checkpoint_path=ckpt_path,
hparams_file=hparams_path,
strict=False,
)
with pytest.raises(RuntimeError, match=r'Unexpected key\(s\) in state_dict: "c_d3.weight", "c_d3.bias"'):
EvalModelTemplate.load_from_checkpoint(
BoringModel.load_from_checkpoint(
checkpoint_path=ckpt_path,
hparams_file=hparams_path,
strict=True,
@ -516,7 +578,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c
# set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
monkeypatch.setenv('TORCH_HOME', tmpdir)
model = EvalModelTemplate()
model = BoringModel()
# logger file to get meta
logger = tutils.get_default_logger(tmpdir)
@ -525,6 +587,8 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
limit_train_batches=2,
limit_val_batches=2,
logger=logger,
callbacks=[ModelCheckpoint(dirpath=tmpdir)],
)
@ -542,7 +606,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c
hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}'
ckpt_path = hparams_url if url_ckpt else new_weights_path
class CurrentModel(EvalModelTemplate):
class CurrentModel(BoringModel):
def __init__(self):
super().__init__()
@ -563,6 +627,6 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c
def test_model_pickle(tmpdir):
model = EvalModelTemplate()
model = BoringModel()
pickle.dumps(model)
cloudpickle.dumps(model)

View File

@ -19,14 +19,14 @@ import pytest
from torch.utils.data import DataLoader
import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import TPUAccelerator
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _TPU_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import EvalModelTemplate
from tests.helpers.datasets import TrialMNIST
from tests.helpers import BoringModel, RandomDataset
from tests.helpers.utils import pl_multi_process_test
if _TPU_AVAILABLE:
@ -34,7 +34,7 @@ if _TPU_AVAILABLE:
import torch_xla.distributed.xla_multiprocessing as xmp
SERIAL_EXEC = xmp.MpSerialExecutor()
_LARGER_DATASET = TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8))
_LARGER_DATASET = RandomDataset(32, 2000)
# 8 cores needs a big dataset
@ -42,20 +42,30 @@ def _serial_train_loader():
return DataLoader(_LARGER_DATASET, batch_size=32)
class SerialLoaderBoringModel(BoringModel):
def train_dataloader(self):
return DataLoader(RandomDataset(32, 2000), batch_size=32)
def val_dataloader(self):
return DataLoader(RandomDataset(32, 2000), batch_size=32)
@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
def test_model_tpu_cores_1(tmpdir):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=1,
tpu_cores=1,
limit_train_batches=0.4,
limit_val_batches=0.4,
limit_train_batches=4,
limit_val_batches=4,
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
@ -64,16 +74,17 @@ def test_model_tpu_cores_1(tmpdir):
@pl_multi_process_test
def test_model_tpu_index(tmpdir, tpu_core):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=1,
tpu_cores=[tpu_core],
limit_train_batches=0.4,
limit_val_batches=0.4,
limit_train_batches=4,
limit_val_batches=4,
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
@ -82,6 +93,7 @@ def test_model_tpu_index(tmpdir, tpu_core):
@pl_multi_process_test
def test_model_tpu_cores_8(tmpdir):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
@ -91,29 +103,27 @@ def test_model_tpu_cores_8(tmpdir):
limit_val_batches=0.4,
)
model = EvalModelTemplate()
# 8 cores needs a big dataset
model.train_dataloader = _serial_train_loader
model.val_dataloader = _serial_train_loader
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
model = SerialLoaderBoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05)
@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
def test_model_16bit_tpu_cores_1(tmpdir):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
precision=16,
progress_bar_refresh_rate=0,
max_epochs=1,
tpu_cores=1,
limit_train_batches=0.4,
limit_val_batches=0.4,
limit_train_batches=4,
limit_val_batches=4,
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False)
assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
@ -123,17 +133,18 @@ def test_model_16bit_tpu_cores_1(tmpdir):
@pl_multi_process_test
def test_model_16bit_tpu_index(tmpdir, tpu_core):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
precision=16,
progress_bar_refresh_rate=0,
max_epochs=1,
tpu_cores=[tpu_core],
limit_train_batches=0.4,
limit_val_batches=0.2,
limit_train_batches=4,
limit_val_batches=2,
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False)
assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
@ -143,6 +154,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core):
@pl_multi_process_test
def test_model_16bit_tpu_cores_8(tmpdir):
"""Make sure model trains on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
precision=16,
@ -153,26 +165,32 @@ def test_model_16bit_tpu_cores_8(tmpdir):
limit_val_batches=0.4,
)
model = EvalModelTemplate()
# 8 cores needs a big dataset
model.train_dataloader = _serial_train_loader
model.val_dataloader = _serial_train_loader
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
model = SerialLoaderBoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05)
@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
def test_model_tpu_early_stop(tmpdir):
"""Test if single TPU core training works"""
model = EvalModelTemplate()
class CustomBoringModel(BoringModel):
def validation_step(self, *args, **kwargs):
out = super().validation_step(*args, **kwargs)
self.log('val_loss', out['x'])
return out
tutils.reset_seed()
model = CustomBoringModel()
trainer = Trainer(
callbacks=[EarlyStopping()],
callbacks=[EarlyStopping(monitor='val_loss')],
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=50,
limit_train_batches=10,
limit_val_batches=10,
limit_train_batches=4,
limit_val_batches=4,
tpu_cores=1,
)
trainer.fit(model)
@ -182,6 +200,7 @@ def test_model_tpu_early_stop(tmpdir):
@pl_multi_process_test
def test_tpu_grad_norm(tmpdir):
"""Test if grad_norm works on TPU."""
tutils.reset_seed()
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
@ -192,7 +211,7 @@ def test_tpu_grad_norm(tmpdir):
gradient_clip_val=0.1,
)
model = EvalModelTemplate()
model = BoringModel()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
@ -201,7 +220,8 @@ def test_tpu_grad_norm(tmpdir):
def test_dataloaders_passed_to_fit(tmpdir):
"""Test if dataloaders passed to trainer works on TPU"""
model = EvalModelTemplate()
tutils.reset_seed()
model = BoringModel()
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8)
trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())