diff --git a/docs/source/_images/mnist_imgs/pt_to_pl.jpg b/docs/source/_images/mnist_imgs/pt_to_pl.jpg
index 6bd8b3f2d0..4bad788502 100644
Binary files a/docs/source/_images/mnist_imgs/pt_to_pl.jpg and b/docs/source/_images/mnist_imgs/pt_to_pl.jpg differ
diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst
index 6360171c3c..a31fb245ba 100644
--- a/docs/source/child_modules.rst
+++ b/docs/source/child_modules.rst
@@ -4,7 +4,7 @@ Research projects tend to test different approaches to the same dataset.
 This is very easy to do in Lightning with inheritance. For example, imagine we now want to
 train an Autoencoder to use as a feature extractor for MNIST images.
-Recall that `CoolMNIST` already defines all the dataloading etc... The only things
+Recall that `LitMNIST` already defines all the dataloading etc... The only things
 that change in the `Autoencoder` model are the init, forward, training, validation and test step.

 .. code-block:: python
@@ -12,7 +12,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati

     class Encoder(torch.nn.Module):
         ...

-    class AutoEncoder(CoolMNIST):
+    class AutoEncoder(LitMNIST):
         def __init__(self):
             self.encoder = Encoder()
             self.decoder = Decoder()
diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst
index d298ca9e90..e802473f7b 100644
--- a/docs/source/hyperparameters.rst
+++ b/docs/source/hyperparameters.rst
@@ -26,9 +26,9 @@ Now we can parametrize the LightningModule.
 .. code-block:: python
    :emphasize-lines: 5,6,7,12,14

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.hparams = hparams
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
@@ -46,7 +46,7 @@ Now we can parametrize the LightningModule.
             return Adam(self.parameters(), lr=self.hparams.learning_rate)

     hparams = parse_args()
-    model = CoolMNIST(hparams)
+    model = LitMNIST(hparams)

 .. note:: Bonus! If `hparams` is in your module, Lightning will save it into the checkpoint and restore your model using those hparams exactly.

@@ -69,7 +69,7 @@ We set up the main training entry point file like this:
 .. code-block:: python

     def main(args):
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)
         trainer = Trainer(max_epochs=args.max_epochs)
         trainer.fit(model)
@@ -100,7 +100,7 @@ We can do it by changing how we init the trainer.
 .. code-block:: python

     def main(args):
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)

         # makes all trainer options available from the command line
         trainer = Trainer.from_argparse_args(args)
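The ``parse_args()`` helper used in the snippets above is not shown in these hunks. A minimal sketch of what such a helper could look like, using only the standard library ``argparse``; the flag names match the ones referenced above (``layer_1_dim``, ``learning_rate``, ``max_epochs``), but the defaults are illustrative assumptions:

.. code-block:: python

    from argparse import ArgumentParser

    def parse_args():
        # hypothetical helper assumed by the snippets above
        parser = ArgumentParser()
        parser.add_argument('--layer_1_dim', type=int, default=128)
        parser.add_argument('--learning_rate', type=float, default=1e-3)
        parser.add_argument('--max_epochs', type=int, default=10)
        return parser.parse_args()

With a helper like this, ``hparams.layer_1_dim``, ``hparams.learning_rate`` and ``args.max_epochs`` resolve exactly as used in the surrounding examples.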
@@ -119,9 +119,9 @@ polluting the main.py file, the LightningModule lets you define arguments for ea
 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)

         @staticmethod
@@ -151,9 +151,9 @@ Now we can allow each model to inject the arguments it needs in the main.py
     if args.model_name == 'gan':
         model = GoodGAN(hparams=args)
     elif args.model_name == 'mnist':
-        model = CoolMNIST(hparams=args)
+        model = LitMNIST(hparams=args)

-    model = CoolMNIST(hparams=args)
+    model = LitMNIST(hparams=args)
     trainer = Trainer(max_epochs=args.max_epochs)
     trainer.fit(model)
@@ -169,7 +169,7 @@ Now we can allow each model to inject the arguments it needs in the main.py
     if temp_args.model_name == 'gan':
         parser = GoodGAN.add_model_specific_args(parser)
     elif temp_args.model_name == 'mnist':
-        parser = CoolMNIST.add_model_specific_args(parser)
+        parser = LitMNIST.add_model_specific_args(parser)

     args = parser.parse_args()
diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst
index 81be134acd..c3dbb57e7f 100644
--- a/docs/source/introduction_guide.rst
+++ b/docs/source/introduction_guide.rst
@@ -116,10 +116,10 @@ a 3-layer neural network.
     from torch import nn
     import pytorch_lightning as pl

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def __init__(self):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()

             # mnist images are (1, 28, 28) (channels, width, height)
             self.layer_1 = torch.nn.Linear(28 * 28, 128)
@@ -154,7 +154,7 @@ EXACTLY the same as you would a PyTorch Module.

 .. code-block:: default

-    net = CoolMNIST()
+    net = LitMNIST()
     x = torch.Tensor(1, 1, 28, 28)
     out = net(x)
@@ -198,7 +198,7 @@ the LightningModule
     import os
     from torchvision import datasets, transforms

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def train_dataloader(self):
             transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
@@ -218,7 +218,7 @@ In PyTorch we do it as follows:
 .. code-block:: python

     from torch.optim import Adam
-    optimizer = Adam(CoolMNIST().parameters(), lr=1e-3)
+    optimizer = Adam(LitMNIST().parameters(), lr=1e-3)

 In Lightning we do the same but organize it under the configure_optimizers method.
@@ -226,7 +226,7 @@ If you don't define this, Lightning will automatically use `Adam(self.parameters

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def configure_optimizers(self):
             return Adam(self.parameters(), lr=1e-3)
@@ -268,7 +268,7 @@ in the LightningModule

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def training_step(self, batch, batch_idx):
             x, y = batch
@@ -295,9 +295,9 @@ For clarity, we'll recall that the full LightningModule now looks like this.

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.layer_1 = torch.nn.Linear(28 * 28, 128)
             self.layer_2 = torch.nn.Linear(128, 256)
             self.layer_3 = torch.nn.Linear(256, 10)
@@ -340,7 +340,7 @@ Train on CPU

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer()
     trainer.fit(model)
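Putting the fragments above together (model, data, optimizer, training step), a complete ``LitMNIST`` might look like the following sketch. The layer sizes, transforms and loss follow the snippets shown in this guide; everything else is a reasonable assumption rather than the exact file content:

.. code-block:: python

    import os
    import torch
    from torch.nn import functional as F
    from torch.optim import Adam
    from torch.utils.data import DataLoader
    from torchvision import datasets, transforms
    import pytorch_lightning as pl

    class LitMNIST(pl.LightningModule):
        def __init__(self):
            super(LitMNIST, self).__init__()
            self.layer_1 = torch.nn.Linear(28 * 28, 128)
            self.layer_2 = torch.nn.Linear(128, 256)
            self.layer_3 = torch.nn.Linear(256, 10)

        def forward(self, x):
            batch_size, channels, width, height = x.size()
            x = x.view(batch_size, -1)
            x = F.relu(self.layer_1(x))
            x = F.relu(self.layer_2(x))
            return F.log_softmax(self.layer_3(x), dim=1)

        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self.forward(x)
            loss = F.nll_loss(logits, y)
            return {'loss': loss}

        def train_dataloader(self):
            transform = transforms.Compose([transforms.ToTensor(),
                                            transforms.Normalize((0.1307,), (0.3081,))])
            mnist_train = datasets.MNIST(os.getcwd(), train=True, download=True, transform=transform)
            return DataLoader(mnist_train, batch_size=64)

        def configure_optimizers(self):
            return Adam(self.parameters(), lr=1e-3)

With this in place, the ``Trainer()`` / ``trainer.fit(model)`` call shown above trains the module end to end on CPU.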
@@ -376,7 +376,7 @@ But the beauty is all the magic you can do with the trainer flags. For instance,
 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=1)
     trainer.fit(model)
@@ -391,7 +391,7 @@ Or you can also train on multiple GPUs.

 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=8)
     trainer.fit(model)
@@ -400,7 +400,7 @@ Or multiple nodes

 .. code-block:: python

     # (32 GPUs)
-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp')
     trainer.fit(model)
@@ -471,29 +471,47 @@ In distributed training (multiple GPUs and multiple TPU cores) each GPU or TPU c
 of this program. This means that without taking any care you will download the dataset N times which
 will cause all sorts of issues.

-To solve this problem, move the download code to the `prepare_data` method in the LightningModule
+To solve this problem, move the download code to the `prepare_data` method in the LightningModule.
+In this method we do all the preparation we need to do once (instead of on every GPU).

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def prepare_data(self):
-            MNIST(os.getcwd(), train=True, download=True, transform=transform)
+            # transform
+            transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+
+            # download
+            mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
+            mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)
+
+            # train/val split
+            mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
+
+            # assign to use in dataloaders
+            self.train_dataset = mnist_train
+            self.val_dataset = mnist_val
+            self.test_dataset = mnist_test

         def train_dataloader(self):
-            transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
-            mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform)
-            return DataLoader(mnist_train, batch_size=64)
+            return DataLoader(self.train_dataset, batch_size=64)
+
+        def val_dataloader(self):
+            return DataLoader(self.val_dataset, batch_size=64)
+
+        def test_dataloader(self):
+            return DataLoader(self.test_dataset, batch_size=64)

 The `prepare_data` method is also a good place to do any data processing that needs to be done only
 once (ie: download or tokenize, etc...).

 .. note:: Lightning inserts the correct DistributedSampler for distributed training. No need to add yourself!

-Now we can train the LightningModule on a TPU wihout doing anything else!
+Now we can train the LightningModule on a TPU without doing anything else!

 .. code-block:: python

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -531,9 +549,9 @@ Now we can parametrize the LightningModule.
 .. code-block:: python
    :emphasize-lines: 5,6,7,12,14

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def __init__(self, hparams):
-            super(CoolMNIST, self).__init__()
+            super(LitMNIST, self).__init__()
             self.hparams = hparams
             self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim)
@@ -551,7 +569,7 @@ Now we can parametrize the LightningModule.
             return Adam(self.parameters(), lr=self.hparams.learning_rate)

     hparams = parse_args()
-    model = CoolMNIST(hparams)
+    model = LitMNIST(hparams)

 .. note:: Bonus! If `hparams` is in your module, Lightning will save it into the checkpoint and restore your model using those hparams exactly.
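The refactored ``prepare_data`` shown earlier in this file relies on ``random_split`` and ``DataLoader``, which need to be imported (``from torch.utils.data import DataLoader, random_split``). A small standalone sketch of the same 55000/5000 split, useful for checking the data pipeline outside of Lightning:

.. code-block:: python

    import os
    from torch.utils.data import random_split
    from torchvision import transforms
    from torchvision.datasets import MNIST

    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])

    # same download + split performed inside prepare_data above
    mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
    mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
    print(len(mnist_train), len(mnist_val))  # 55000 5000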
@@ -596,7 +614,7 @@ sample split in the `train_dataloader` method.
 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
             logits = self.forward(x)
@@ -625,7 +643,7 @@ while checking the validation set.

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -650,7 +668,7 @@ Just like the validation loop, we define exactly the same steps for testing:

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
             logits = self.forward(x)
@@ -676,7 +694,7 @@ Once you train your model simply call `.test()`.

     from pytorch_lightning import Trainer

-    model = CoolMNIST()
+    model = LitMNIST()
     trainer = Trainer(num_tpu_cores=8)
     trainer.fit(model)
@@ -687,7 +705,7 @@ You can also run the test from a saved lightning model

 .. code-block:: python

-    model = CoolMNIST.load_from_checkpoint(PATH)
+    model = LitMNIST.load_from_checkpoint(PATH)
     trainer = Trainer(num_tpu_cores=8)
     trainer.test(model)
@@ -704,7 +722,7 @@ and use it for prediction.

 .. code-block:: python

-    model = CoolMNIST.load_from_checkpoint(PATH)
+    model = LitMNIST.load_from_checkpoint(PATH)
     x = torch.Tensor(1, 1, 28, 28)
     out = model(x)
@@ -773,7 +791,7 @@ Or maybe we have a model that we use to do generation

 .. code-block:: python

-    class CoolMNISTDreamer(pl.LightningModule):
+    class LitMNISTDreamer(pl.LightningModule):

         def forward(self, z):
             imgs = self.decoder(z)
@@ -789,7 +807,7 @@ Or maybe we have a model that we use to do generation

 .. code-block:: python

-    model = CoolMNISTDreamer.load_from_checkpoint(PATH)
+    model = LitMNISTDreamer.load_from_checkpoint(PATH)
     z = sample_noise()
     generated_imgs = model(z)
@@ -823,7 +841,7 @@ With your own

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def backward(self, use_amp, loss, optimizer):
             # do a custom way of backward
@@ -846,7 +864,7 @@ you could do your own:

 .. code-block:: python

-    class CoolMNIST(pl.LightningModule):
+    class LitMNIST(pl.LightningModule):

         def configure_ddp(self, model, device_ids):
diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py
index 8cf7e9ade8..39479cf28d 100644
--- a/pytorch_lightning/core/__init__.py
+++ b/pytorch_lightning/core/__init__.py
@@ -71,10 +71,10 @@ Here are the only required methods.

     import pytorch_lightning as pl

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):

         def __init__(self):
-            super(CoolModel, self).__init__()
+            super(LitModel, self).__init__()
             self.l1 = torch.nn.Linear(28 * 28, 10)

         def forward(self, x):
@@ -97,7 +97,7 @@ Which you can train by doing:
 .. code-block:: python

     trainer = pl.Trainer()
-    model = CoolModel()
+    model = LitModel()
     trainer.fit(model)
@@ -133,7 +133,7 @@ Thus, if we wanted to add a validation loop you would add this to your Lightning

 .. code-block:: python

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):
         def validation_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.forward(x)
@@ -152,7 +152,7 @@ Add test loop

 .. code-block:: python

-    class CoolModel(pl.LightningModule):
+    class LitModel(pl.LightningModule):
         def test_step(self, batch, batch_idx):
             x, y = batch
             y_hat = self.forward(x)
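The ``validation_step`` shown above returns one dict per batch. Aggregating those dicts at the end of the epoch is typically done in a separate hook (``validation_epoch_end`` in this generation of the API; earlier releases used ``validation_end``). A hedged sketch, assuming ``validation_step`` returns a ``'val_loss'`` key:

.. code-block:: python

    import torch

    class LitModel(pl.LightningModule):
        # ... __init__, forward, training_step, validation_step as above ...

        def validation_epoch_end(self, outputs):
            # `outputs` is the list of dicts returned by validation_step
            avg_loss = torch.stack([out['val_loss'] for out in outputs]).mean()
            return {'val_loss': avg_loss, 'log': {'val_loss': avg_loss}}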
@@ -250,11 +250,31 @@ allow for this
 .. code-block:: python

     def prepare_data(self):
-        # do stuff that writes to disk or should be done once
-        # this will only happen from the master GPU or TPU core
+        # download
+        mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
+        mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())
+
+        # train/val split
+        mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
+
+        # assign to use in dataloaders
+        self.train_dataset = mnist_train
+        self.val_dataset = mnist_val
+        self.test_dataset = mnist_test
+
+    def train_dataloader(self):
+        return DataLoader(self.train_dataset, batch_size=64)
+
+    def val_dataloader(self):
+        return DataLoader(self.val_dataset, batch_size=64)
+
+    def test_dataloader(self):
+        return DataLoader(self.test_dataset, batch_size=64)

 .. note:: ``prepare_data`` is called once.

+.. note:: Do anything with data that needs to happen ONLY once here, like download, tokenize, etc...
+
 Lifecycle
 ---------
 The methods in the LightningModule are called in this order:
@@ -262,16 +282,15 @@ The methods in the LightningModule are called in this order:

 1. ```__init__```
 2. ```prepare_data```
 3. ```configure_optimizers```
-4. ```prepare_data```
-5. ```train_dataloader```
+4. ```train_dataloader```

 If you define a validation loop then

-6. ```val_dataloader```
+5. ```val_dataloader```

 And if you define a test loop:

-7. ```test_dataloader```
+6. ```test_dataloader```

 .. note:: ``test_dataloader`` is only called with ``.test()``
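Step 3 in the lifecycle above, ``configure_optimizers``, most commonly returns a single optimizer, optionally paired with a learning rate scheduler. A minimal sketch of that case; the two-list return format mirrors the multi-optimizer examples documented below, and ``CosineAnnealingLR`` is just one example scheduler from ``torch.optim.lr_scheduler``:

.. code-block:: python

    from torch.optim import Adam
    from torch.optim.lr_scheduler import CosineAnnealingLR

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        scheduler = CosineAnnealingLR(optimizer, T_max=10)
        # one list of optimizers, one list of schedulers (stepped once per epoch by default)
        return [optimizer], [scheduler]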
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index d8d5f1d5e5..18e038fd86 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -914,24 +914,26 @@ class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
                 dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called every epoch
                 return [gen_opt, dis_opt], [gen_sched, dis_sched]

-        .. note:: Lightning calls ``.backward()`` and ``.step()`` on each optimizer
+        Some things to know
+
+        - Lightning calls ``.backward()`` and ``.step()`` on each optimizer
           and learning rate scheduler as needed.

-        .. note:: If you use 16-bit precision (``use_amp=True``), Lightning will automatically
+        - If you use 16-bit precision (``precision=16``), Lightning will automatically
           handle the optimizers for you.

-        .. note:: If you use multiple optimizers, training_step will have an additional
+        - If you use multiple optimizers, training_step will have an additional
          ``optimizer_idx`` parameter.

-        .. note:: If you use LBFGS lightning handles the closure function automatically for you
+        - If you use LBFGS, Lightning handles the closure function automatically for you.

-        .. note:: If you use multiple optimizers, gradients will be calculated only
+        - If you use multiple optimizers, gradients will be calculated only
          for the parameters of the current optimizer at each training step.

-        .. note:: If you need to control how often those optimizers step or override the
+        - If you need to control how often those optimizers step or override the
          default .step() schedule, override the `optimizer_step` hook.

-        .. note:: If you only want to call a learning rate scheduler every `x` step or epoch,
+        - If you only want to call a learning rate scheduler every `x` step or epoch,
          you can input this as 'frequency' key: dict(scheduler=lr_scheduler, interval='step' or 'epoch', frequency=x)
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 92521ac00f..70cc4da76f 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -251,8 +251,6 @@ early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`)
 - ``None``: The default callback monitoring ``'val_loss'`` is created.
 - Default: ``None``.

-.. note:: If ``'val_loss'`` is not found will work as if early stopping is disabled.
-
 .. code-block:: python

     trainer = Trainer(early_stop_callback=early_stop_callback)
@@ -270,6 +268,8 @@ Example::
         mode='min'
     )

+.. note:: If ``'val_loss'`` is not found, this will work as if early stopping is disabled.
+
 fast_dev_run
 ^^^^^^^^^^^^
@@ -354,8 +354,6 @@ Options:
 - 'min_max'
 - 'all'

-.. note:: Might slow performance because it uses the output of nvidia-smi.
-
 Example::

     # default used by the Trainer
     trainer = Trainer(log_gpu_memory=None)

     # log all the GPUs (on master node only)
     trainer = Trainer(log_gpu_memory='all')

     # log only the min and max memory on the master node
     trainer = Trainer(log_gpu_memory='min_max')

+.. note:: Might slow performance because it uses the output of nvidia-smi.
+
 log_save_interval
 ^^^^^^^^^^^^^^^^^
@@ -773,8 +773,6 @@ and the trainer will apply Truncated Backprop to it.
   recurrent network trajectories."
   `_)

-.. note:: Make sure your batches have a sequence dimension.
-
 Example::

     # default used by the Trainer (ie: disabled)
     trainer = Trainer(truncated_bptt_steps=None)

     # backprop every 5 steps in a batch
     trainer = Trainer(truncated_bptt_steps=5)
@@ -783,6 +781,7 @@ Example::

     # backprop every 5 steps in a batch
     trainer = Trainer(truncated_bptt_steps=5)

+.. note:: Make sure your batches have a sequence dimension. Lightning takes care to split your batch along the time-dimension.
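When ``truncated_bptt_steps`` is enabled, each batch carries an explicit time dimension and the hidden state is passed between the chunks Lightning creates. A hedged sketch of the shape a sequence model's ``training_step`` takes in this setup; the layer sizes are arbitrary, and the extra ``hiddens`` argument plus the returned ``'hiddens'`` key follow the truncated-BPTT convention described in these docs:

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn import functional as F
    import pytorch_lightning as pl

    class LitSequenceModel(pl.LightningModule):
        def __init__(self):
            super(LitSequenceModel, self).__init__()
            self.rnn = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)
            self.head = nn.Linear(20, 1)

        def training_step(self, batch, batch_idx, hiddens):
            # batch is (batch_size, time_steps, features); Lightning splits it along
            # the time dimension into chunks of length `truncated_bptt_steps`
            x, y = batch
            out, hiddens = self.rnn(x, hiddens)
            loss = F.mse_loss(self.head(out), y)
            # return the hidden state so the next chunk continues from it
            return {'loss': loss, 'hiddens': hiddens}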