diff --git a/docs/source/_images/mnist_imgs/pt_to_pl.jpg b/docs/source/_images/mnist_imgs/pt_to_pl.jpg
index 6bd8b3f2d0..4bad788502 100644
Binary files a/docs/source/_images/mnist_imgs/pt_to_pl.jpg and b/docs/source/_images/mnist_imgs/pt_to_pl.jpg differ
diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst
index 6360171c3c..a31fb245ba 100644
--- a/docs/source/child_modules.rst
+++ b/docs/source/child_modules.rst
@@ -4,7 +4,7 @@ Research projects tend to test different approaches to the same dataset.
 This is very easy to do in Lightning with inheritance. For example, imagine we now want to
 train an Autoencoder to use as a feature extractor for MNIST images.
-Recall that `CoolMNIST` already defines all the dataloading etc... The only things
+Recall that `LitMNIST` already defines all the dataloading etc... The only things
 that change in the `Autoencoder` model are the init, forward, training, validation and test step.

 .. code-block:: python
@@ -12,7 +12,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati

     class Encoder(torch.nn.Module):
         ...

-    class AutoEncoder(CoolMNIST):
+    class AutoEncoder(LitMNIST):
         def __init__(self):
             self.encoder = Encoder()
             self.decoder = Decoder()
diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst
index d298ca9e90..e802473f7b 100644
--- a/docs/source/hyperparameters.rst
+++ b/docs/source/hyperparameters.rst
@@ -26,9 +26,9 @@ Now we can parametrize the LightningModule.
 .. code-block:: python
    :emphasize-lines: 5,6,7,12,14

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.hparams = hparams
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
@@ -46,7 +46,7 @@ Now we can parametrize the LightningModule.
             return Adam(self.parameters(), lr=self.hparams.learning_rate)

     hparams = parse_args()
-    model = CoolMNIST(hparams)
+    model = LitMNIST(hparams)

 .. note:: Bonus! If `hparams` is in your module, Lightning will save it into the checkpoint and restore your model using those hparams exactly.

@@ -69,7 +69,7 @@ We set up the main training entry point file like this:
 .. code-block:: python

     def main(args):
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)
         trainer = Trainer(max_epochs=args.max_epochs)
         trainer.fit(model)
@@ -100,7 +100,7 @@ We can do it by changing how we init the trainer.
 .. code-block:: python

     def main(args):
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)

         # makes all trainer options available from the command line
         trainer = Trainer.from_argparse_args(args)
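The ``parse_args()`` helper used in the snippets above is not shown in these hunks. A minimal sketch of what such a helper could look like, using only the standard library ``argparse``; the flag names match the ones referenced above (``layer_1_dim``, ``learning_rate``, ``max_epochs``), but the defaults are illustrative assumptions:

.. code-block:: python

    from argparse import ArgumentParser

    def parse_args():
        # hypothetical helper assumed by the snippets above
        parser = ArgumentParser()
        parser.add_argument('--layer_1_dim', type=int, default=128)
        parser.add_argument('--learning_rate', type=float, default=1e-3)
        parser.add_argument('--max_epochs', type=int, default=10)
        return parser.parse_args()

With a helper like this, ``hparams.layer_1_dim``, ``hparams.learning_rate`` and ``args.max_epochs`` resolve exactly as used in the surrounding examples.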
@@ -119,9 +119,9 @@ polluting the main.py file, the LightningModule lets you define arguments for ea
 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)

         @staticmethod
@@ -151,9 +151,9 @@ Now we can allow each model to inject the arguments it needs in the main.py
     if args.model_name == 'gan':
         model = GoodGAN(hparams=args)
     elif args.model_name == 'mnist':
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)

-    model = CoolMNIST(hparams=args)
+    model = LitMNIST(hparams=args)
     trainer = Trainer(max_epochs=args.max_epochs)
     trainer.fit(model)
@@ -169,7 +169,7 @@ Now we can allow each model to inject the arguments it needs in the main.py
     if temp_args.model_name == 'gan':
         parser = GoodGAN.add_model_specific_args(parser)
     elif temp_args.model_name == 'mnist':
-        parser = CoolMNIST.add_model_specific_args(parser)
+        parser = LitMNIST.add_model_specific_args(parser)

     args = parser.parse_args()
diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst
index 81be134acd..c3dbb57e7f 100644
--- a/docs/source/introduction_guide.rst
+++ b/docs/source/introduction_guide.rst
@@ -116,10 +116,10 @@ a 3-layer neural network.
     from torch import nn
     import pytorch_lightning as pl

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def __init__(self):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()

             # mnist images are (1, 28, 28) (channels, width, height)
             self.layer_1 = torch.nn.Linear(28 * 28, 128)
@@ -154,7 +154,7 @@ EXACTLY the same as you would a PyTorch Module.

 .. code-block:: default

-    net = CoolMNIST()
+    net = LitMNIST()
     x = torch.Tensor(1, 1, 28, 28)
     out = net(x)
@@ -198,7 +198,7 @@ the LightningModule
     import os
     from torchvision import datasets, transforms

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def train_dataloader(self):
             transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
@@ -218,7 +218,7 @@ In PyTorch we do it as follows:
 .. code-block:: python

     from torch.optim import Adam
-    optimizer = Adam(CoolMNIST().parameters(), lr=1e-3)
+    optimizer = Adam(LitMNIST().parameters(), lr=1e-3)

 In Lightning we do the same but organize it under the configure_optimizers method.
@@ -226,7 +226,7 @@ If you don't define this, Lightning will automatically use `Adam(self.parameters

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def configure_optimizers(self):
             return Adam(self.parameters(), lr=1e-3)
@@ -268,7 +268,7 @@ in the LightningModule

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def training_step(self, batch, batch_idx):
             x, y = batch
@@ -295,9 +295,9 @@ For clarity, we'll recall that the full LightningModule now looks like this.

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.layer_1 = torch.nn.Linear(28 * 28, 128)
             self.layer_2 = torch.nn.Linear(128, 256)
             self.layer_3 = torch.nn.Linear(256, 10)
@@ -340,7 +340,7 @@ Train on CPU

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer()
     trainer.fit(model)
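Putting the fragments above together (model, data, optimizer, training step), a complete ``LitMNIST`` might look like the following sketch. The layer sizes, transforms and loss follow the snippets shown in this guide; everything else is a reasonable assumption rather than the exact file content:

.. code-block:: python

    import os
    import torch
    from torch.nn import functional as F
    from torch.optim import Adam
    from torch.utils.data import DataLoader
    from torchvision import datasets, transforms
    import pytorch_lightning as pl

    class LitMNIST(pl.LightningModule):
        def __init__(self):
            super(LitMNIST, self).__init__()
            self.layer_1 = torch.nn.Linear(28 * 28, 128)
            self.layer_2 = torch.nn.Linear(128, 256)
            self.layer_3 = torch.nn.Linear(256, 10)

        def forward(self, x):
            batch_size, channels, width, height = x.size()
            x = x.view(batch_size, -1)
            x = F.relu(self.layer_1(x))
            x = F.relu(self.layer_2(x))
            return F.log_softmax(self.layer_3(x), dim=1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self.forward(x)
            loss = F.nll_loss(logits, y)
            return {'loss': loss}

        def train_dataloader(self):
            transform = transforms.Compose([transforms.ToTensor(),
                                            transforms.Normalize((0.1307,), (0.3081,))])
            mnist_train = datasets.MNIST(os.getcwd(), train=True, download=True, transform=transform)
            return DataLoader(mnist_train, batch_size=64)

        def configure_optimizers(self):
            return Adam(self.parameters(), lr=1e-3)

With this in place, the ``Trainer()`` / ``trainer.fit(model)`` call shown above trains the module end to end on CPU.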
@@ -376,7 +376,7 @@ But the beauty is all the magic you can do with the trainer flags. For instance,
 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=1)
     trainer.fit(model)
@@ -391,7 +391,7 @@ Or you can also train on multiple GPUs.

 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=8)
     trainer.fit(model)
@@ -400,7 +400,7 @@ Or multiple nodes

 .. code-block:: python

     # (32 GPUs)
-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp')
     trainer.fit(model)
@@ -471,29 +471,47 @@ In distributed training (multiple GPUs and multiple TPU cores) each GPU or TPU c
 of this program. This means that without taking any care you will download the dataset N times which
 will cause all sorts of issues.

-To solve this problem, move the download code to the `prepare_data` method in the LightningModule
+To solve this problem, move the download code to the `prepare_data` method in the LightningModule.
+In this method we do all the preparation we need to do once (instead of on every GPU).

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def prepare_data(self):
-            MNIST(os.getcwd(), train=True, download=True, transform=transform)
+            # transform
+            transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+
+            # download
+            mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
+            mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)
+
+            # train/val split
+            mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
+
+            # assign to use in dataloaders
+            self.train_dataset = mnist_train
+            self.val_dataset = mnist_val
+            self.test_dataset = mnist_test

         def train_dataloader(self):
-            transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
-            mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform)
-            return DataLoader(mnist_train, batch_size=64)
+            return DataLoader(self.train_dataset, batch_size=64)
+
+        def val_dataloader(self):
+            return DataLoader(self.val_dataset, batch_size=64)
+
+        def test_dataloader(self):
+            return DataLoader(self.test_dataset, batch_size=64)

 The `prepare_data` method is also a good place to do any data processing that needs to be done only
 once (ie: download or tokenize, etc...).

 .. note:: Lightning inserts the correct DistributedSampler for distributed training. No need to add yourself!

-Now we can train the LightningModule on a TPU wihout doing anything else!
+Now we can train the LightningModule on a TPU without doing anything else!

 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -531,9 +549,9 @@ Now we can parametrize the LightningModule.
 .. code-block:: python
    :emphasize-lines: 5,6,7,12,14

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.hparams = hparams
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
@@ -551,7 +569,7 @@ Now we can parametrize the LightningModule.
             return Adam(self.parameters(), lr=self.hparams.learning_rate)

     hparams = parse_args()
-    model = CoolMNIST(hparams)
+    model = LitMNIST(hparams)

 .. note:: Bonus! If `hparams` is in your module, Lightning will save it into the checkpoint and restore your model using those hparams exactly.
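The refactored ``prepare_data`` shown earlier in this file relies on ``random_split`` and ``DataLoader``, which need to be imported (``from torch.utils.data import DataLoader, random_split``). A small standalone sketch of the same 55000/5000 split, useful for checking the data pipeline outside of Lightning:

.. code-block:: python

    import os
    from torch.utils.data import random_split
    from torchvision import transforms
    from torchvision.datasets import MNIST

    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])

    # same download + split performed inside prepare_data above
    mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
    mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
    print(len(mnist_train), len(mnist_val))  # 55000 5000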
@@ -596,7 +614,7 @@ sample split in the `train_dataloader` method.
 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
             logits = self.forward(x)
@@ -625,7 +643,7 @@ while checking the validation set.

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -650,7 +668,7 @@ Just like the validation loop, we define exactly the same steps for testing:

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
             logits = self.forward(x)
@@ -676,7 +694,7 @@ Once you train your model simply call `.test()`.

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -687,7 +705,7 @@ You can also run the test from a saved lightning model

 .. code-block:: python

-    model = CoolMNIST.load_from_checkpoint(PATH)
+    model = LitMNIST.load_from_checkpoint(PATH)
     trainer = Trainer(num_tpu_cores=8)
     trainer.test(model)
@@ -704,7 +722,7 @@ and use it for prediction.

 .. code-block:: python

-    model = CoolMNIST.load_from_checkpoint(PATH)
+    model = LitMNIST.load_from_checkpoint(PATH)
     x = torch.Tensor(1, 1, 28, 28)
     out = model(x)
@@ -773,7 +791,7 @@ Or maybe we have a model that we use to do generation

 .. code-block:: python

-    class CoolMNISTDreamer(pl.LightningModule):
+    class LitMNISTDreamer(pl.LightningModule):

         def forward(self, z):
             imgs = self.decoder(z)
@@ -789,7 +807,7 @@ Or maybe we have a model that we use to do generation

 .. code-block:: python

-    model = CoolMNISTDreamer.load_from_checkpoint(PATH)
+    model = LitMNISTDreamer.load_from_checkpoint(PATH)
     z = sample_noise()
     generated_imgs = model(z)
@@ -823,7 +841,7 @@ With your own

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def backward(self, use_amp, loss, optimizer):
             # do a custom way of backward
@@ -846,7 +864,7 @@ you could do your own:

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def configure_ddp(self, model, device_ids):
diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py
index 8cf7e9ade8..39479cf28d 100644
--- a/pytorch_lightning/core/__init__.py
+++ b/pytorch_lightning/core/__init__.py
@@ -71,10 +71,10 @@ Here are the only required methods.

     import pytorch_lightning as pl

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):

         def __init__(self):
-            super(CoolModel, self).__init__()
+            super(LitModel, self).__init__()
             self.l1 = torch.nn.Linear(28 * 28, 10)

         def forward(self, x):
@@ -97,7 +97,7 @@ Which you can train by doing:
 .. code-block:: python

     trainer = pl.Trainer()
-    model = CoolModel()
+    model = LitModel()
     trainer.fit(model)
@@ -133,7 +133,7 @@ Thus, if we wanted to add a validation loop you would add this to your Lightning

 .. code-block:: python

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.forward(x)
@@ -152,7 +152,7 @@ Add test loop

 .. code-block:: python

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.forward(x)
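The ``validation_step`` shown above returns one dict per batch. Aggregating those dicts at the end of the epoch is typically done in a separate hook (``validation_epoch_end`` in this generation of the API; earlier releases used ``validation_end``). A hedged sketch, assuming ``validation_step`` returns a ``'val_loss'`` key:

.. code-block:: python

    import torch

    class LitModel(pl.LightningModule):
        # ... __init__, forward, training_step, validation_step as above ...

        def validation_epoch_end(self, outputs):
            # `outputs` is the list of dicts returned by validation_step
            avg_loss = torch.stack([out['val_loss'] for out in outputs]).mean()
            return {'val_loss': avg_loss, 'log': {'val_loss': avg_loss}}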
@@ -250,11 +250,31 @@ allow for this
 .. code-block:: python

     def prepare_data(self):
-        # do stuff that writes to disk or should be done once
-        # this will only happen from the master GPU or TPU core
+        # download
+        mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
+        mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())
+
+        # train/val split
+        mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
+
+        # assign to use in dataloaders
+        self.train_dataset = mnist_train
+        self.val_dataset = mnist_val
+        self.test_dataset = mnist_test
+
+    def train_dataloader(self):
+        return DataLoader(self.train_dataset, batch_size=64)
+
+    def val_dataloader(self):
+        return DataLoader(self.val_dataset, batch_size=64)
+
+    def test_dataloader(self):
+        return DataLoader(self.test_dataset, batch_size=64)

 .. note:: ``prepare_data`` is called once.

+.. note:: Do anything with data that needs to happen ONLY once here, like download, tokenize, etc...
+
 Lifecycle
 ---------
 The methods in the LightningModule are called in this order:
@@ -262,16 +282,15 @@ The methods in the LightningModule are called in this order:

 1. ```__init__```
 2. ```prepare_data```
 3. ```configure_optimizers```
-4. ```prepare_data```
-5. ```train_dataloader```
+4. ```train_dataloader```

 If you define a validation loop then

-6. ```val_dataloader```
+5. ```val_dataloader```

 And if you define a test loop:

-7. ```test_dataloader```
+6. ```test_dataloader```

 .. note:: ``test_dataloader`` is only called with ``.test()``
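Step 3 in the lifecycle above, ``configure_optimizers``, most commonly returns a single optimizer, optionally paired with a learning rate scheduler. A minimal sketch of that case; the two-list return format mirrors the multi-optimizer examples documented below, and ``CosineAnnealingLR`` is just one example scheduler from ``torch.optim.lr_scheduler``:

.. code-block:: python

    from torch.optim import Adam
    from torch.optim.lr_scheduler import CosineAnnealingLR

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        scheduler = CosineAnnealingLR(optimizer, T_max=10)
        # one list of optimizers, one list of schedulers (stepped once per epoch by default)
        return [optimizer], [scheduler]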
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index d8d5f1d5e5..18e038fd86 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -914,24 +914,26 @@ class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
                 dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called every epoch
                 return [gen_opt, dis_opt], [gen_sched, dis_sched]

-        .. note:: Lightning calls ``.backward()`` and ``.step()`` on each optimizer
+        Some things to know
+
+        - Lightning calls ``.backward()`` and ``.step()`` on each optimizer
           and learning rate scheduler as needed.

-        .. note:: If you use 16-bit precision (``use_amp=True``), Lightning will automatically
+        - If you use 16-bit precision (``precision=16``), Lightning will automatically
           handle the optimizers for you.

-        .. note:: If you use multiple optimizers, training_step will have an additional
+        - If you use multiple optimizers, training_step will have an additional
          ``optimizer_idx`` parameter.

-        .. note:: If you use LBFGS lightning handles the closure function automatically for you
+        - If you use LBFGS, Lightning handles the closure function automatically for you.

-        .. note:: If you use multiple optimizers, gradients will be calculated only
+        - If you use multiple optimizers, gradients will be calculated only
          for the parameters of the current optimizer at each training step.

-        .. note:: If you need to control how often those optimizers step or override the
+        - If you need to control how often those optimizers step or override the
          default .step() schedule, override the `optimizer_step` hook.

-        .. note:: If you only want to call a learning rate scheduler every `x` step or epoch,
+        - If you only want to call a learning rate scheduler every `x` step or epoch,
          you can input this as 'frequency' key: dict(scheduler=lr_scheduler, interval='step' or 'epoch', frequency=x)
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 92521ac00f..70cc4da76f 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -251,8 +251,6 @@ early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`)
 - ``None``: The default callback monitoring ``'val_loss'`` is created.
 - Default: ``None``.

-.. note:: If ``'val_loss'`` is not found will work as if early stopping is disabled.
-
 .. code-block:: python

     trainer = Trainer(early_stop_callback=early_stop_callback)
@@ -270,6 +268,8 @@ Example::
         mode='min'
     )

+.. note:: If ``'val_loss'`` is not found, this will work as if early stopping is disabled.
+
 fast_dev_run
 ^^^^^^^^^^^^
@@ -354,8 +354,6 @@ Options:
 - 'min_max'
 - 'all'

-.. note:: Might slow performance because it uses the output of nvidia-smi.
-
 Example::

     # default used by the Trainer
     trainer = Trainer(log_gpu_memory=None)

     # log all the GPUs (on master node only)
     trainer = Trainer(log_gpu_memory='all')

     # log only the min and max memory on the master node
     trainer = Trainer(log_gpu_memory='min_max')

+.. note:: Might slow performance because it uses the output of nvidia-smi.
+
 log_save_interval
 ^^^^^^^^^^^^^^^^^
@@ -773,8 +773,6 @@ and the trainer will apply Truncated Backprop to it.
   recurrent network trajectories."
   `_)

-.. note:: Make sure your batches have a sequence dimension.
-
 Example::

     # default used by the Trainer (ie: disabled)
     trainer = Trainer(truncated_bptt_steps=None)

     # backprop every 5 steps in a batch
     trainer = Trainer(truncated_bptt_steps=5)
@@ -783,6 +781,7 @@ Example::

     # backprop every 5 steps in a batch
     trainer = Trainer(truncated_bptt_steps=5)

+.. note:: Make sure your batches have a sequence dimension. Lightning takes care to split your batch along the time-dimension.
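When ``truncated_bptt_steps`` is enabled, each batch carries an explicit time dimension and the hidden state is passed between the chunks Lightning creates. A hedged sketch of the shape a sequence model's ``training_step`` takes in this setup; the layer sizes are arbitrary, and the extra ``hiddens`` argument plus the returned ``'hiddens'`` key follow the truncated-BPTT convention described in these docs:

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn import functional as F
    import pytorch_lightning as pl

    class LitSequenceModel(pl.LightningModule):
        def __init__(self):
            super(LitSequenceModel, self).__init__()
            self.rnn = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)
            self.head = nn.Linear(20, 1)

        def training_step(self, batch, batch_idx, hiddens):
            # batch is (batch_size, time_steps, features); Lightning splits it along
            # the time dimension into chunks of length `truncated_bptt_steps`
            x, y = batch
            out, hiddens = self.rnn(x, hiddens)
            loss = F.mse_loss(self.head(out), y)
            # return the hidden state so the next chunk continues from it
            return {'loss': loss, 'hiddens': hiddens}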