From 031274c25dedc92e383d2715e283a55a2b102d29 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 23 Sep 2020 00:19:46 -0400 Subject: [PATCH] fix dp issues + update examples and test examples (#3618) * fix dp * fix dp * fix dp * fix dp * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples * fix examples --- pl_examples/README.md | 72 ++----- pl_examples/__init__.py | 147 -------------- pl_examples/basic_examples/README.md | 69 +++---- pl_examples/basic_examples/autoencoder.py | 99 ++++++++++ pl_examples/basic_examples/cpu_template.py | 51 ----- pl_examples/basic_examples/gpu_template.py | 52 ----- .../basic_examples/image_classifier.py | 123 ++++++++++++ pl_examples/basic_examples/mnist.py | 111 +++++++++++ .../basic_examples/multi_node_ddp2_demo.py | 53 ------ .../basic_examples/multi_node_ddp_demo.py | 53 ------ pl_examples/basic_examples/submit_ddp2_job.sh | 2 +- pl_examples/basic_examples/submit_ddp_job.sh | 2 +- .../domain_templates/semantic_segmentation.py | 2 +- .../{models => domain_templates}/unet.py | 0 pl_examples/models/lightning_template.py | 180 ------------------ pl_examples/test_examples.py | 79 -------- pytorch_lightning/accelerators/dp_backend.py | 6 + pytorch_lightning/overrides/data_parallel.py | 45 ++++- pytorch_lightning/utilities/warning_utils.py | 12 ++ tests/base/develop_utils.py | 1 - .../models => tests/examples}/__init__.py | 0 tests/examples/test_examples.py | 94 +++++++++ 22 files changed, 538 insertions(+), 715 deletions(-) create mode 100644 pl_examples/basic_examples/autoencoder.py delete mode 100644 pl_examples/basic_examples/cpu_template.py delete mode 100644 pl_examples/basic_examples/gpu_template.py create mode 100644 pl_examples/basic_examples/image_classifier.py create mode 100644 pl_examples/basic_examples/mnist.py delete mode 100644 pl_examples/basic_examples/multi_node_ddp2_demo.py delete mode 100644 pl_examples/basic_examples/multi_node_ddp_demo.py rename pl_examples/{models => domain_templates}/unet.py (100%) delete mode 100644 pl_examples/models/lightning_template.py delete mode 100644 pl_examples/test_examples.py create mode 100644 pytorch_lightning/utilities/warning_utils.py rename {pl_examples/models => tests/examples}/__init__.py (100%) create mode 100644 tests/examples/test_examples.py diff --git a/pl_examples/README.md b/pl_examples/README.md index 93715b0e44..f47132999e 100644 --- a/pl_examples/README.md +++ b/pl_examples/README.md @@ -1,67 +1,19 @@ # Examples -This folder has 3 sections: - -## Basic Examples -Use these examples to test how lightning works. - -#### Test on CPU -```bash -python cpu_template.py -``` - ---- -#### Train on a single GPU -```bash -python gpu_template.py --gpus 1 -``` - ---- -#### DataParallel (dp) -Train on multiple GPUs using DataParallel. - -```bash -python gpu_template.py --gpus 2 --distributed_backend dp -``` +Our most robust examples showing all sorts of implementations +can be found in our sister library [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2). 
 ---
-#### DistributedDataParallel (ddp)
-Train on multiple GPUs using DistributedDataParallel
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp
-```
+## Basic examples
+In this folder we add 3 simple examples:
+
+* [Image Classifier]() (trains arbitrary datasets with arbitrary backbones).
+* [MNIST classifier]() (defines the model inside the LightningModule).
+* [Autoencoder]() (shows how the LightningModule is meant to be used as a system).
 
 ---
-#### DistributedDataParallel+DP (ddp2)
-Train on multiple GPUs using DistributedDataParallel + dataparallel.
-On a single node, uses all GPUs for 1 model. Then shares gradient information
-across nodes.
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp2
-```
-
-## Multi-node example
-
-This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
-To run this demo do the following:
-
-1. Log into the jumphost node of your SLURM-managed cluster.
-2. Create a conda environment with Lightning and a GPU PyTorch version.
-3. Choose a script to submit
-
-### DDP
-Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
-```bash
-sbatch ddp_job_submit.sh YourEnv
-```
-
-### DDP2
-Submit this job to run with a different implementation of DistributedDataParallel.
-In this version, each node acts like DataParallel but syncs across nodes like DDP.
-```bash
-sbatch ddp2_job_submit.sh YourEnv
-```
-
-## Domain templates
-These are templates to show common approaches such as GANs and RL.
+## Domain examples
+This folder contains older examples. You should instead use the examples
+in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2)
+for advanced use cases.
diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index 1c5908539c..e69de29bb2 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -1,147 +0,0 @@
-"""
-Template model definition
--------------------------
-
-In 99% of cases you want to just copy `one of the examples
-`_
-to start a new lightningModule and change the core of what your model is actually trying to do.
-
-.. code-block:: bash
-
-    # get a copy of the module template
-    wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/pl_examples/new_project_templates/lightning_module_template.py  # noqa: E501
-
-
-Trainer Example
----------------
-
-**`__main__` function**
-
-Normally, we want to let the `__main__` function start the training.
- Inside the main we parse training arguments with whatever hyperparameters we want.
- Your LightningModule will have a chance to add hyperparameters.
-
-.. code-block:: python
-
-    from test_tube import HyperOptArgumentParser
-
-    if __name__ == '__main__':
-
-        # use default args given by lightning
-        root_dir = os.path.split(os.path.dirname(sys.modules['__main__'].__file__))[0]
-        parent_parser = HyperOptArgumentParser(strategy='random_search', add_help=False)
-        add_default_args(parent_parser, root_dir)
-
-        # allow model to overwrite or extend args
-        parser = ExampleModel.add_model_specific_args(parent_parser)
-        hyperparams = parser.parse_args()
-
-        # train model
-        main(hyperparams)
-
-**Main Function**
-
-The main function is your entry into the program. This is where you init your model, checkpoint directory,
- and launch the training. The main function should have 3 arguments:
-
-- hparams: a configuration of hyperparameters.
-- slurm_manager: Slurm cluster manager object (can be None)
-- dict: for you to return any values you want (useful in meta-learning, otherwise set to)
-
-.. code-block:: python
-
-    def main(hparams, cluster, results_dict):
-        # build model
-        model = MyLightningModule(hparams)
-
-        # configure trainer
-        trainer = Trainer()
-
-        # train model
-        trainer.fit(model)
-
-
-The `__main__` function will start training on your **main** function.
- If you use the HyperParameterOptimizer in hyper parameter optimization mode,
- this main function will get one set of hyperparameters. If you use it as a simple
- argument parser you get the default arguments in the argument parser.
-
-So, calling main(hyperparams) runs the model with the default argparse arguments.::
-
-    main(hyperparams)
-
-
-CPU hyperparameter search
--------------------------
-
-.. code-block:: python
-
-    # run a grid search over 20 hyperparameter combinations.
-    hyperparams.optimize_parallel_cpu(
-        main_local,
-        nb_trials=20,
-        nb_workers=1
-    )
-
-
-Hyperparameter search on a single or multiple GPUs
---------------------------------------------------
-
-.. code-block:: python
-
-    # run a grid search over 20 hyperparameter combinations.
-    hyperparams.optimize_parallel_gpu(
-        main_local,
-        nb_trials=20,
-        nb_workers=1,
-        gpus=[0,1,2,3]
-    )
-
-
-Hyperparameter search on a SLURM HPC cluster
--------------------------------------------- 
-
-.. code-block:: python
-
-    def optimize_on_cluster(hyperparams):
-        # enable cluster training
-        cluster = SlurmCluster(
-            hyperparam_optimizer=hyperparams,
-            log_path=hyperparams.tt_save_path,
-            test_tube_exp_name=hyperparams.tt_name
-        )
-
-        # email for cluster coms
-        cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)
-
-        # configure cluster
-        cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
-        cluster.job_time = '48:00:00'
-        cluster.gpu_type = '1080ti'
-        cluster.memory_mb_per_node = 48000
-
-        # any modules for code to run in env
-        cluster.add_command('source activate pytorch_lightning')
-
-        # name of exp
-        job_display_name = hyperparams.tt_name.split('_')[0]
-        job_display_name = job_display_name[0:3]
-
-        # run hopt
-        logging.info('submitting jobs...')
-        cluster.optimize_parallel_cluster_gpu(
-            main,
-            nb_trials=hyperparams.nb_hopt_trials,
-            job_name=job_display_name
-        )
-
-    # run cluster hyperparameter search
-    optimize_on_cluster(hyperparams)
-
-"""
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-
-__all__ = [
-    'LightningTemplateModel'
-]
diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md
index 63fdc7f8c4..4dcf06a74b 100644
--- a/pl_examples/basic_examples/README.md
+++ b/pl_examples/basic_examples/README.md
@@ -1,44 +1,47 @@
 ## Basic Examples
 Use these examples to test how lightning works.
 
-#### Test on CPU
+#### MNIST
+Trains MNIST where the model is defined inside the LightningModule.
 ```bash
-python cpu_template.py
+# cpu
+python mnist.py
+
+# gpus (any number)
+python mnist.py --gpus 2
+
+# dataparallel
+python mnist.py --gpus 2 --distributed_backend 'dp'
 ```
 
 ---
-#### Train on a single GPU
+#### Image classifier
+Generic image classifier with an arbitrary backbone (i.e. a simple system)
 ```bash
-python gpu_template.py --gpus 1
-```
+# cpu
+python image_classifier.py
 
+# gpus (any number)
+python image_classifier.py --gpus 2
+
+# dataparallel
+python image_classifier.py --gpus 2 --distributed_backend 'dp'
+```
+
+---
+#### Autoencoder
+Showing the power of a system... 
arbitrarily complex training loops +```bash +# cpu +python autoencoder.py + +# gpus (any number) +python autoencoder.py --gpus 2 + +# dataparallel +python autoencoder.py --gpus 2 --distributed_backend 'dp' +``` --- -#### DataParallel (dp) -Train on multiple GPUs using DataParallel. - -```bash -python gpu_template.py --gpus 2 --distributed_backend dp -``` - ---- -#### DistributedDataParallel (ddp) - -Train on multiple GPUs using DistributedDataParallel -```bash -python gpu_template.py --gpus 2 --distributed_backend ddp -``` - ---- -#### DistributedDataParallel+DP (ddp2) - -Train on multiple GPUs using DistributedDataParallel + DataParallel. -On a single node, uses all GPUs for 1 model. Then shares gradient information -across nodes. -```bash -python gpu_template.py --gpus 2 --distributed_backend ddp2 -``` - - # Multi-node example This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total). @@ -51,12 +54,12 @@ To run this demo do the following: #### DDP Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each) ```bash -sbatch ddp_job_submit.sh YourEnv +sbatch submit_ddp_job.sh YourEnv ``` #### DDP2 Submit this job to run with a different implementation of DistributedDataParallel. In this version, each node acts like DataParallel but syncs across nodes like DDP. ```bash -sbatch ddp2_job_submit.sh YourEnv +sbatch submit_ddp2_job.sh YourEnv ``` diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py new file mode 100644 index 0000000000..9fe1588617 --- /dev/null +++ b/pl_examples/basic_examples/autoencoder.py @@ -0,0 +1,99 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from argparse import ArgumentParser +import torch +from torch import nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +import pytorch_lightning as pl +from torch.utils.data import random_split +from tests.base.datasets import MNIST + + +class LitAutoEncoder(pl.LightningModule): + + def __init__(self): + super().__init__() + self.encoder = nn.Sequential( + nn.Linear(28 * 28, 64), + nn.ReLU(), + nn.Linear(64, 3) + ) + self.decoder = nn.Sequential( + nn.Linear(3, 64), + nn.ReLU(), + nn.Linear(64, 28 * 28) + ) + + def forward(self, x): + # in lightning, forward defines the prediction/inference actions + embedding = self.encoder(x) + return embedding + + def training_step(self, batch, batch_idx): + x, y = batch + x = x.view(x.size(0), -1) + z = self.encoder(x) + x_hat = self.decoder(z) + loss = F.mse_loss(x_hat, x) + return pl.TrainResult(loss, checkpoint_on=loss) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + return optimizer + + +def cli_main(): + pl.seed_everything(1234) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--batch_size', default=32, type=int) + parser.add_argument('--hidden_dim', type=int, default=128) + parser = pl.Trainer.add_argparse_args(parser) + args = parser.parse_args() + + # ------------ + # data + # ------------ + dataset = MNIST('', train=True, download=True) + mnist_test = MNIST('', train=False, download=True) + mnist_train, mnist_val = random_split(dataset, [55000, 5000]) + + train_loader = DataLoader(mnist_train, batch_size=args.batch_size) + val_loader = DataLoader(mnist_val, batch_size=args.batch_size) + test_loader = DataLoader(mnist_test, batch_size=args.batch_size) + + # ------------ + # model + # ------------ + model = LitAutoEncoder() + + # ------------ + # training + # ------------ + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, train_loader, val_loader) + + # ------------ + # testing + # ------------ + trainer.test(test_dataloaders=test_loader) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_examples/basic_examples/cpu_template.py b/pl_examples/basic_examples/cpu_template.py deleted file mode 100644 index 9d1fb52495..0000000000 --- a/pl_examples/basic_examples/cpu_template.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Runs a model on the CPU on a single node. -""" -import os -from argparse import ArgumentParser - -from pl_examples.models.lightning_template import LightningTemplateModel -from pytorch_lightning import Trainer, seed_everything - -seed_everything(234) - - -def main(args): - """ Main training routine specific for this project. 
""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = LightningTemplateModel(**vars(args)) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = Trainer.from_argparse_args(args) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model) - - -def run_cli(): - # ------------------------ - # TRAINING ARGUMENTS - # ------------------------ - # these are project-wide arguments - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # each LightningModule defines arguments relevant to it - parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py deleted file mode 100644 index 64cbabb10b..0000000000 --- a/pl_examples/basic_examples/gpu_template.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Runs a model on a single node across multiple gpus. -""" -import os -from argparse import ArgumentParser - -from pl_examples.models.lightning_template import LightningTemplateModel -from pytorch_lightning import Trainer, seed_everything - -seed_everything(234) - - -def main(args): - """ Main training routine specific for this project. """ - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = LightningTemplateModel(**vars(args)) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = Trainer.from_argparse_args(args) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model) - - -def run_cli(): - # ------------------------ - # TRAINING ARGUMENTS - # ------------------------ - # these are project-wide arguments - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # each LightningModule defines arguments relevant to it - parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) - parser = Trainer.add_argparse_args(parser) - parser.set_defaults(gpus=2) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/basic_examples/image_classifier.py b/pl_examples/basic_examples/image_classifier.py new file mode 100644 index 0000000000..a06183a64d --- /dev/null +++ b/pl_examples/basic_examples/image_classifier.py @@ -0,0 +1,123 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from argparse import ArgumentParser + +import torch +import pytorch_lightning as pl +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from tests.base.datasets import MNIST + + +class Backbone(torch.nn.Module): + def __init__(self, hidden_dim=128): + super().__init__() + self.l1 = torch.nn.Linear(28 * 28, hidden_dim) + self.l2 = torch.nn.Linear(hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + +class LitClassifier(pl.LightningModule): + def __init__(self, backbone, learning_rate=1e-3): + super().__init__() + self.save_hyperparameters() + self.backbone = backbone + + def forward(self, x): + # use forward for inference/predictions + embedding = self.backbone(x) + return embedding + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + result.log('valid_loss', loss) + return result + + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self.backbone(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + result.log('test_loss', loss) + return result + + def configure_optimizers(self): + # self.hparams available because we called self.save_hyperparameters() + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--learning_rate', type=float, default=0.0001) + return parser + + +def cli_main(): + pl.seed_everything(1234) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--batch_size', default=32, type=int) + parser.add_argument('--hidden_dim', type=int, default=128) + parser = pl.Trainer.add_argparse_args(parser) + parser = LitClassifier.add_model_specific_args(parser) + args = parser.parse_args() + + # ------------ + # data + # ------------ + dataset = MNIST('', train=True, download=True) + mnist_test = MNIST('', train=False, download=True) + mnist_train, mnist_val = random_split(dataset, [55000, 5000]) + + train_loader = DataLoader(mnist_train, batch_size=args.batch_size) + val_loader = DataLoader(mnist_val, batch_size=args.batch_size) + test_loader = DataLoader(mnist_test, batch_size=args.batch_size) + + # ------------ + # model + # ------------ + model = LitClassifier(Backbone(hidden_dim=args.hidden_dim), args.learning_rate) + + # ------------ + # training + # ------------ + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, train_loader, val_loader) + + # ------------ + # testing + # ------------ + trainer.test(test_dataloaders=test_loader) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_examples/basic_examples/mnist.py b/pl_examples/basic_examples/mnist.py new file mode 100644 index 0000000000..2613b89d2d --- /dev/null +++ b/pl_examples/basic_examples/mnist.py @@ -0,0 +1,111 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from argparse import ArgumentParser + +import torch +import pytorch_lightning as pl +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from tests.base.datasets import MNIST + + +class LitClassifier(pl.LightningModule): + def __init__(self, hidden_dim=128, learning_rate=1e-3): + super().__init__() + self.save_hyperparameters() + + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + result.log('valid_loss', loss) + return result + + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + result.log('test_loss', loss) + return result + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--hidden_dim', type=int, default=128) + parser.add_argument('--learning_rate', type=float, default=0.0001) + return parser + + +def cli_main(): + pl.seed_everything(1234) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--batch_size', default=32, type=int) + parser = pl.Trainer.add_argparse_args(parser) + parser = LitClassifier.add_model_specific_args(parser) + args = parser.parse_args() + + # ------------ + # data + # ------------ + dataset = MNIST('', train=True, download=True) + mnist_test = MNIST('', train=False, download=True) + mnist_train, mnist_val = random_split(dataset, [55000, 5000]) + + train_loader = DataLoader(mnist_train, batch_size=args.batch_size) + val_loader = DataLoader(mnist_val, batch_size=args.batch_size) + test_loader = DataLoader(mnist_test, batch_size=args.batch_size) + + # ------------ + # model + # ------------ + model = LitClassifier(args.hidden_dim, args.learning_rate) + + # ------------ + # training + # ------------ + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, train_loader, val_loader) + + # ------------ + # testing + # ------------ + trainer.test(test_dataloaders=test_loader) + + +if __name__ == '__main__': + cli_main() diff --git a/pl_examples/basic_examples/multi_node_ddp2_demo.py b/pl_examples/basic_examples/multi_node_ddp2_demo.py deleted file mode 100644 index aead1fba1e..0000000000 --- a/pl_examples/basic_examples/multi_node_ddp2_demo.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Multi-node example (GPU) -""" -import os -from argparse import ArgumentParser - -from pl_examples.models.lightning_template import LightningTemplateModel -from pytorch_lightning import Trainer, 
seed_everything - -seed_everything(234) - - -def main(args): - """Main training routine specific for this project.""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = LightningTemplateModel(args) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = Trainer( - gpus=args.gpus, - num_nodes=args.num_nodes, - distributed_backend='ddp2', - max_epochs=args.max_epochs, - max_steps=args.max_steps, - ) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model) - - -def run_cli(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # each LightningModule defines arguments relevant to it - parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/basic_examples/multi_node_ddp_demo.py b/pl_examples/basic_examples/multi_node_ddp_demo.py deleted file mode 100644 index 84f241391e..0000000000 --- a/pl_examples/basic_examples/multi_node_ddp_demo.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Multi-node example (GPU) -""" -import os -from argparse import ArgumentParser - -from pl_examples.models.lightning_template import LightningTemplateModel -from pytorch_lightning import Trainer, seed_everything - -seed_everything(234) - - -def main(args): - """Main training routine specific for this project.""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = LightningTemplateModel(args) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = Trainer( - gpus=args.gpus, - num_nodes=args.num_nodes, - distributed_backend='ddp', - max_epochs=args.max_epochs, - max_steps=args.max_steps, - ) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model) - - -def run_cli(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # each LightningModule defines arguments relevant to it - parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/basic_examples/submit_ddp2_job.sh b/pl_examples/basic_examples/submit_ddp2_job.sh index 6e433f5fcd..e5d5801ac7 100755 --- a/pl_examples/basic_examples/submit_ddp2_job.sh +++ b/pl_examples/basic_examples/submit_ddp2_job.sh @@ -24,4 +24,4 @@ source activate $1 # ------------------------- # run script from above -srun python3 multi_node_ddp2_demo.py +srun python3 image_classifier.py --distributed_backend 'ddp2' --gpus 2 --num_nodes 2 diff --git a/pl_examples/basic_examples/submit_ddp_job.sh b/pl_examples/basic_examples/submit_ddp_job.sh index bf53a65368..8c8e6d6313 100755 --- a/pl_examples/basic_examples/submit_ddp_job.sh +++ b/pl_examples/basic_examples/submit_ddp_job.sh @@ -24,4 +24,4 @@ source activate $1 # ------------------------- # run script from above -srun python3 multi_node_ddp_demo.py +srun python3 image_classifier.py --distributed_backend 'ddp' --gpus 2 --num_nodes 2 diff --git a/pl_examples/domain_templates/semantic_segmentation.py 
b/pl_examples/domain_templates/semantic_segmentation.py index fff3d1cf98..af37575bbf 100644 --- a/pl_examples/domain_templates/semantic_segmentation.py +++ b/pl_examples/domain_templates/semantic_segmentation.py @@ -10,7 +10,7 @@ from PIL import Image from torch.utils.data import DataLoader, Dataset import pytorch_lightning as pl -from pl_examples.models.unet import UNet +from pl_examples.domain_templates.unet import UNet from pytorch_lightning.loggers import WandbLogger DEFAULT_VOID_LABELS = (0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1) diff --git a/pl_examples/models/unet.py b/pl_examples/domain_templates/unet.py similarity index 100% rename from pl_examples/models/unet.py rename to pl_examples/domain_templates/unet.py diff --git a/pl_examples/models/lightning_template.py b/pl_examples/models/lightning_template.py deleted file mode 100644 index 099d6b5a65..0000000000 --- a/pl_examples/models/lightning_template.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Example template for defining a system. -""" -import os -from argparse import ArgumentParser - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchvision.transforms as transforms -from torch import optim -from torch.utils.data import DataLoader -from torchvision.datasets import MNIST - -from pytorch_lightning.core import LightningModule - - -class LightningTemplateModel(LightningModule): - """ - Sample model to show how to define a template. - - Example: - - >>> # define simple Net for MNIST dataset - >>> params = dict( - ... in_features=28 * 28, - ... hidden_dim=1000, - ... out_features=10, - ... drop_prob=0.2, - ... learning_rate=0.001 * 8, - ... batch_size=2, - ... data_root='./datasets', - ... num_workers=4, - ... ) - >>> model = LightningTemplateModel(**params) - """ - - def __init__(self, - in_features: int = 28 * 28, - hidden_dim: int = 1000, - out_features: int = 10, - drop_prob: float = 0.2, - learning_rate: float = 0.001 * 8, - batch_size: int = 2, - data_root: str = './datasets', - num_workers: int = 4, - **kwargs - ): - # init superclass - super().__init__() - # save all variables in __init__ signature to self.hparams - self.save_hyperparameters() - - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, - out_features=self.hparams.hidden_dim) - self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) - self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, - - out_features=self.hparams.out_features) - - self.example_input_array = torch.zeros(2, 1, 28, 28) - - def forward(self, x): - """ - No special modification required for Lightning, define it as you normally would - in the `nn.Module` in vanilla PyTorch. - """ - x = self.c_d1(x.view(x.size(0), -1)) - x = torch.tanh(x) - x = self.c_d1_bn(x) - x = self.c_d1_drop(x) - x = self.c_d2(x) - return x - - def training_step(self, batch, batch_idx): - """ - Lightning calls this inside the training loop with the data from the training dataloader - passed in as `batch`. - """ - # forward pass - x, y = batch - y_hat = self(x) - loss = F.cross_entropy(y_hat, y) - tensorboard_logs = {'train_loss': loss} - return {'loss': loss, 'log': tensorboard_logs} - - def validation_step(self, batch, batch_idx): - """ - Lightning calls this inside the validation loop with the data from the validation dataloader - passed in as `batch`. 
- """ - x, y = batch - y_hat = self(x) - val_loss = F.cross_entropy(y_hat, y) - labels_hat = torch.argmax(y_hat, dim=1) - n_correct_pred = torch.sum(y == labels_hat).item() - return {'val_loss': val_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} - - def test_step(self, batch, batch_idx): - x, y = batch - y_hat = self(x) - test_loss = F.cross_entropy(y_hat, y) - labels_hat = torch.argmax(y_hat, dim=1) - n_correct_pred = torch.sum(y == labels_hat).item() - return {'test_loss': test_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} - - def validation_epoch_end(self, outputs): - """ - Called at the end of validation to aggregate outputs. - :param outputs: list of individual outputs of each validation step. - """ - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - val_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) - tensorboard_logs = {'val_loss': avg_loss, 'val_acc': val_acc} - return {'val_loss': avg_loss, 'log': tensorboard_logs} - - def test_epoch_end(self, outputs): - avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() - test_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) - tensorboard_logs = {'test_loss': avg_loss, 'test_acc': test_acc} - return {'test_loss': avg_loss, 'log': tensorboard_logs} - - # --------------------- - # TRAINING SETUP - # --------------------- - def configure_optimizers(self): - """ - Return whatever optimizers and learning rate schedulers you want here. - At least one optimizer is required. - """ - optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) - return [optimizer], [scheduler] - - def prepare_data(self): - MNIST(self.hparams.data_root, train=True, download=True, transform=transforms.ToTensor()) - MNIST(self.hparams.data_root, train=False, download=True, transform=transforms.ToTensor()) - - def setup(self, stage): - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - self.mnist_train = MNIST(self.hparams.data_root, train=True, download=False, transform=transform) - self.mnist_test = MNIST(self.hparams.data_root, train=False, download=False, transform=transform) - - def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers) - - def val_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers) - - def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers) - - @staticmethod - def add_model_specific_args(parent_parser, root_dir): # pragma: no-cover - """ - Define parameters that only apply to this model - """ - parser = ArgumentParser(parents=[parent_parser]) - - # param overwrites - # parser.set_defaults(gradient_clip_val=5.0) - - # network params - parser.add_argument('--in_features', default=28 * 28, type=int) - parser.add_argument('--hidden_dim', default=50000, type=int) - # use 500 for CPU, 50000 for GPU to see speed difference - parser.add_argument('--out_features', default=10, type=int) - parser.add_argument('--drop_prob', default=0.2, type=float) - - # data - parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) - parser.add_argument('--num_workers', default=4, type=int) - - # training params (opt) - parser.add_argument('--epochs', 
default=20, type=int) - parser.add_argument('--batch_size', default=64, type=int) - parser.add_argument('--learning_rate', default=0.001, type=float) - return parser diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py deleted file mode 100644 index 67f69663a3..0000000000 --- a/pl_examples/test_examples.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from unittest import mock - -import numpy as np -import pytest -import torch -from PIL import Image - - -@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3']) -def test_cpu_template(cli_args): - """Test running CLI for an example with default params.""" - from pl_examples.basic_examples.cpu_template import run_cli - - cli_args = cli_args.split(' ') if cli_args else [] - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - run_cli() - - -@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --gpus 1']) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_gpu_template(cli_args): - """Test running CLI for an example with default params.""" - from pl_examples.basic_examples.gpu_template import run_cli - - cli_args = cli_args.split(' ') if cli_args else [] - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - run_cli() - - -@pytest.mark.parametrize('cli_args', [ - '--max_epochs 1 --gpus 1', - '--max_epochs 1 --gpus 1 --evaluate', -]) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_imagenet(tmpdir, cli_args): - """Test running CLI for the ImageNet example with default params.""" - - from pl_examples.domain_templates.imagenet import run_cli - - # https://github.com/pytorch/vision/blob/master/test/fakedata_generation.py#L105 - def _make_image(file_path): - Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8)).save(file_path) - - for split in ['train', 'val']: - for class_id in ['a', 'b']: - os.makedirs(os.path.join(tmpdir, split, class_id)) - # Generate 5 black images - for image_id in range(5): - _make_image(os.path.join(tmpdir, split, class_id, str(image_id) + '.JPEG')) - - cli_args = cli_args.split(' ') if cli_args else [] - cli_args += ['--data-path', str(tmpdir)] - cli_args += ['--default_root_dir', str(tmpdir)] - - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - run_cli() - - -# @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) -# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -# def test_multi_node_ddp(cli_args): -# """Test running CLI for an example with default params.""" -# from pl_examples.basic_examples.multi_node_ddp_demo import run_cli -# -# cli_args = cli_args.split(' ') if cli_args else [] -# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): -# run_cli() - - -# @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) -# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -# def test_multi_node_ddp2(cli_args): -# """Test running CLI for an example with default params.""" -# from pl_examples.basic_examples.multi_node_ddp2_demo import run_cli -# -# cli_args = cli_args.split(' ') if cli_args else [] -# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): -# run_cli() diff --git a/pytorch_lightning/accelerators/dp_backend.py b/pytorch_lightning/accelerators/dp_backend.py index 4bced57e94..0c1229781e 100644 --- a/pytorch_lightning/accelerators/dp_backend.py +++ 
b/pytorch_lightning/accelerators/dp_backend.py
@@ -121,16 +121,22 @@ class DataParallelBackend(Accelerator):
     def training_step_end(self, output):
         if isinstance(output, Result):
             output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def validation_step_end(self, output):
         if isinstance(output, Result):
             output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def test_step_end(self, output):
         if isinstance(output, Result):
             output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def reinit_scheduler_properties(self, optimizers: list, schedulers: list):
diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py
index 921c5ef082..78f16e8bc1 100644
--- a/pytorch_lightning/overrides/data_parallel.py
+++ b/pytorch_lightning/overrides/data_parallel.py
@@ -24,6 +24,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.nn.parallel._functions import Gather
 
 from pytorch_lightning.core.step_result import Result
+from pytorch_lightning.utilities.warning_utils import WarningCache
 
 
 def _find_tensors(obj):  # pragma: no-cover
@@ -54,6 +55,9 @@ def get_a_var(obj):  # pragma: no-cover
     return None
 
 
+warning_cache = WarningCache()
+
+
 class LightningDataParallel(DataParallel):
     """
     Override the forward call in lightning so it goes to training and validation step respectively
@@ -157,6 +161,8 @@ class LightningDistributedDataParallel(DistributedDataParallel):
 
     def forward(self, *inputs, **kwargs):  # pragma: no-cover
         self._sync_params()
+        fx_called: str = ''
+
         if self.device_ids:
             inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
             if len(self.device_ids) == 1:
@@ -168,10 +174,13 @@ class LightningDistributedDataParallel(DistributedDataParallel):
                 # lightning
                 if self.module.training:
                     output = self.module.training_step(*inputs[0], **kwargs[0])
+                    fx_called = 'training_step'
                 elif self.module.testing:
                     output = self.module.test_step(*inputs[0], **kwargs[0])
+                    fx_called = 'test_step'
                 else:
                     output = self.module.validation_step(*inputs[0], **kwargs[0])
+                    fx_called = 'validation_step'
             else:
                 outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs)
                 output = self.gather(outputs, self.output_device)
@@ -195,9 +204,29 @@ class LightningDistributedDataParallel(DistributedDataParallel):
                 self.reducer.prepare_for_backward(list(_find_tensors(output)))
             else:
                 self.reducer.prepare_for_backward([])
+
+        if output is None:
+            warn_missing_output(fx_called)
+
         return output
 
 
+def warn_missing_output(fx_called):
+    if fx_called == 'training_step':
+        m = """
+        Your training_step returned None. You should instead do:
+            return loss
+        or
+            return TrainResult
+        """
+    elif fx_called in ['validation_step', 'test_step']:
+        m = f"""
+            Your {fx_called} returned None. 
You should instead do: + return EvalResult + """ + warning_cache.warn(m) + + def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): # pragma: no-cover r"""Applies each `module` in :attr:`modules` in parallel on arguments contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) @@ -229,6 +259,7 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): # pragma: n def _worker(i, module, input, kwargs, device=None): torch.set_grad_enabled(grad_enabled) + fx_called: str = '' if device is None: device = get_a_var(input).get_device() try: @@ -243,14 +274,18 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): # pragma: n # CHANGE if module.training: output = module.training_step(*input, **kwargs) - + fx_called = 'training_step' elif module.testing: output = module.test_step(*input, **kwargs) - + fx_called = 'test_step' else: output = module.validation_step(*input, **kwargs) + fx_called = 'validation_step' - if module.use_dp or module.use_ddp2: + if output is None: + warn_missing_output(fx_called) + + if output is not None and (module.use_dp or module.use_ddp2): auto_squeeze_dim_zeros(output) # --------------- @@ -296,6 +331,10 @@ def auto_squeeze_dim_zeros(output): :param output: :return: """ + if isinstance(output, torch.Tensor): + output = output.unsqueeze(0) + return output + for k, v in output.items(): if not isinstance(v, torch.Tensor): continue diff --git a/pytorch_lightning/utilities/warning_utils.py b/pytorch_lightning/utilities/warning_utils.py new file mode 100644 index 0000000000..242d5c0695 --- /dev/null +++ b/pytorch_lightning/utilities/warning_utils.py @@ -0,0 +1,12 @@ +from pytorch_lightning.utilities.distributed import rank_zero_warn + + +class WarningCache: + + def __init__(self): + self.warnings = set() + + def warn(self, m): + if m not in self.warnings: + self.warnings.add(m) + rank_zero_warn(m) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 9b3e84fe94..22b581d4eb 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -3,7 +3,6 @@ import os import numpy as np -# from pl_examples import LightningTemplateModel from pytorch_lightning import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger diff --git a/pl_examples/models/__init__.py b/tests/examples/__init__.py similarity index 100% rename from pl_examples/models/__init__.py rename to tests/examples/__init__.py diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py new file mode 100644 index 0000000000..7fe5d4ed60 --- /dev/null +++ b/tests/examples/test_examples.py @@ -0,0 +1,94 @@ +from unittest import mock +import torch +import pytest + +dp_16_args = """ +--max_epochs 1 \ +--batch_size 32 \ +--limit_train_batches 2 \ +--limit_val_batches 2 \ +--gpus 2 \ +--distributed_backend dp \ +--precision 16 \ +""" + +cpu_args = """ +--max_epochs 1 \ +--batch_size 32 \ +--limit_train_batches 2 \ +--limit_val_batches 2 \ +""" + +ddp_args = """ +--max_epochs 1 \ +--batch_size 32 \ +--limit_train_batches 2 \ +--limit_val_batches 2 \ +--gpus 2 \ +--precision 16 \ +""" + + +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [dp_16_args]) +# def test_examples_dp_mnist(cli_args): +# from pl_examples.basic_examples.mnist import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() + + 
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [dp_16_args]) +# def test_examples_dp_image_classifier(cli_args): +# from pl_examples.basic_examples.image_classifier import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() +# +# +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [dp_16_args]) +# def test_examples_dp_autoencoder(cli_args): +# from pl_examples.basic_examples.autoencoder import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() + + +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [ddp_args]) +# def test_examples_ddp_mnist(cli_args): +# from pl_examples.basic_examples.mnist import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() +# +# +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [ddp_args]) +# def test_examples_ddp_image_classifier(cli_args): +# from pl_examples.basic_examples.image_classifier import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() +# +# +# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +# @pytest.mark.parametrize('cli_args', [ddp_args]) +# def test_examples_ddp_autoencoder(cli_args): +# from pl_examples.basic_examples.autoencoder import cli_main +# +# with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): +# cli_main() +# + +@pytest.mark.parametrize('cli_args', [cpu_args]) +def test_examples_cpu(cli_args): + from pl_examples.basic_examples.mnist import cli_main as mnist_cli + from pl_examples.basic_examples.image_classifier import cli_main as ic_cli + from pl_examples.basic_examples.autoencoder import cli_main as ae_cli + + for cli_cmd in [mnist_cli, ic_cli, ae_cli]: + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): + cli_cmd()
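
The core dp fix above is easiest to see in isolation. With `--distributed_backend 'dp'`, each GPU replica runs `training_step` on its slice of the batch, and the gathered output is a tensor with one entry per GPU rather than a scalar, so `DataParallelBackend.training_step_end` now mean-reduces plain tensors before backprop. A minimal CPU-only sketch (the standalone function and toy losses are illustrative, not part of the patch):

```python
import torch


def training_step_end(output):
    # mirrors the patched DataParallelBackend.training_step_end for the
    # plain-tensor case (Result objects call output.dp_reduce() instead)
    if isinstance(output, torch.Tensor):
        output = output.mean()
    return output


# pretend two replicas each returned a scalar loss; dp's gather step
# stacks them into a shape-[2] tensor on the root device
per_gpu_losses = torch.stack([torch.tensor(0.7), torch.tensor(0.9)])
loss = training_step_end(per_gpu_losses)
assert loss.dim() == 0  # reduced back to a single scalar
print(loss)  # tensor(0.8000)
```

The new `WarningCache` supports the other half of the fix: `warn_missing_output` fires once per unique message instead of on every forward pass. A short usage sketch, assuming the patched package is installed:

```python
from pytorch_lightning.utilities.warning_utils import WarningCache

cache = WarningCache()
cache.warn('training_step returned None')  # emitted via rank_zero_warn
cache.warn('training_step returned None')  # duplicate message, skipped
```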