diff --git a/pl_examples/README.md b/pl_examples/README.md
index 93715b0e44..f47132999e 100644
--- a/pl_examples/README.md
+++ b/pl_examples/README.md
@@ -1,67 +1,19 @@
 # Examples
-This folder has 3 sections:
-
-## Basic Examples
-Use these examples to test how lightning works.
-
-#### Test on CPU
-```bash
-python cpu_template.py
-```
-
----
-#### Train on a single GPU
-```bash
-python gpu_template.py --gpus 1
-```
-
----
-#### DataParallel (dp)
-Train on multiple GPUs using DataParallel.
-
-```bash
-python gpu_template.py --gpus 2 --distributed_backend dp
-```
+Our most robust examples showing all sorts of implementations
+can be found in our sister library [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2).
 
 ---
-#### DistributedDataParallel (ddp)
-Train on multiple GPUs using DistributedDataParallel
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp
-```
+## Basic examples
+In this folder we add 3 simple examples:
+
+* [Image Classifier]() (trains arbitrary datasets with arbitrary backbones).
+* [MNIST classifier]() (defines the model inside the LightningModule).
+* [Autoencoder]() (shows how the LightningModule is meant to be used as a system).
 
 ---
-#### DistributedDataParallel+DP (ddp2)
-Train on multiple GPUs using DistributedDataParallel + dataparallel.
-On a single node, uses all GPUs for 1 model. Then shares gradient information
-across nodes.
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp2
-```
-
-## Multi-node example
-
-This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
-To run this demo do the following:
-
-1. Log into the jumphost node of your SLURM-managed cluster.
-2. Create a conda environment with Lightning and a GPU PyTorch version.
-3. Choose a script to submit
-
-### DDP
-Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
-```bash
-sbatch ddp_job_submit.sh YourEnv
-```
-
-### DDP2
-Submit this job to run with a different implementation of DistributedDataParallel.
-In this version, each node acts like DataParallel but syncs across nodes like DDP.
-```bash
-sbatch ddp2_job_submit.sh YourEnv
-```
-
-## Domain templates
-These are templates to show common approaches such as GANs and RL.
+## Domain examples
+This folder contains older examples. You should instead use the examples
+in [PyTorch-Lightning-Bolts](https://pytorch-lightning-bolts.readthedocs.io/en/latest/convolutional.html#gpt-2)
+for advanced use cases.
diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index 1c5908539c..e69de29bb2 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -1,147 +0,0 @@
-"""
-Template model definition
--------------------------
-
-In 99% of cases you want to just copy `one of the examples
-`_
-to start a new lightningModule and change the core of what your model is actually trying to do.
-
-.. code-block:: bash
-
-    # get a copy of the module template
-    wget https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/pl_examples/new_project_templates/lightning_module_template.py  # noqa: E501
-
-
-Trainer Example
----------------
-
-**`__main__` function**
-
-Normally, we want to let the `__main__` function start the training.
- Inside the main we parse training arguments with whatever hyperparameters we want.
- Your LightningModule will have a chance to add hyperparameters.
-
-.. code-block:: python
-
-    from test_tube import HyperOptArgumentParser
-
-    if __name__ == '__main__':
-
-        # use default args given by lightning
-        root_dir = os.path.split(os.path.dirname(sys.modules['__main__'].__file__))[0]
-        parent_parser = HyperOptArgumentParser(strategy='random_search', add_help=False)
-        add_default_args(parent_parser, root_dir)
-
-        # allow model to overwrite or extend args
-        parser = ExampleModel.add_model_specific_args(parent_parser)
-        hyperparams = parser.parse_args()
-
-        # train model
-        main(hyperparams)
-
-**Main Function**
-
-The main function is your entry into the program. This is where you init your model, checkpoint directory,
- and launch the training. The main function should have 3 arguments:
-
-- hparams: a configuration of hyperparameters.
-- slurm_manager: Slurm cluster manager object (can be None)
-- dict: for you to return any values you want (useful in meta-learning, otherwise set to)
-
-.. code-block:: python
-
-    def main(hparams, cluster, results_dict):
-        # build model
-        model = MyLightningModule(hparams)
-
-        # configure trainer
-        trainer = Trainer()
-
-        # train model
-        trainer.fit(model)
-
-
-The `__main__` function will start training on your **main** function.
- If you use the HyperParameterOptimizer in hyper parameter optimization mode,
- this main function will get one set of hyperparameters. If you use it as a simple
- argument parser you get the default arguments in the argument parser.
-
-So, calling main(hyperparams) runs the model with the default argparse arguments.::
-
-    main(hyperparams)
-
-
-CPU hyperparameter search
--------------------------
-
-.. code-block:: python
-
-    # run a grid search over 20 hyperparameter combinations.
-    hyperparams.optimize_parallel_cpu(
-        main_local,
-        nb_trials=20,
-        nb_workers=1
-    )
-
-
-Hyperparameter search on a single or multiple GPUs
---------------------------------------------------
-
-.. code-block:: python
-
-    # run a grid search over 20 hyperparameter combinations.
-    hyperparams.optimize_parallel_gpu(
-        main_local,
-        nb_trials=20,
-        nb_workers=1,
-        gpus=[0,1,2,3]
-    )
-
-
-Hyperparameter search on a SLURM HPC cluster
---------------------------------------------
-
-.. code-block:: python
-
-    def optimize_on_cluster(hyperparams):
-        # enable cluster training
-        cluster = SlurmCluster(
-            hyperparam_optimizer=hyperparams,
-            log_path=hyperparams.tt_save_path,
-            test_tube_exp_name=hyperparams.tt_name
-        )
-
-        # email for cluster coms
-        cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)
-
-        # configure cluster
-        cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
-        cluster.job_time = '48:00:00'
-        cluster.gpu_type = '1080ti'
-        cluster.memory_mb_per_node = 48000
-
-        # any modules for code to run in env
-        cluster.add_command('source activate pytorch_lightning')
-
-        # name of exp
-        job_display_name = hyperparams.tt_name.split('_')[0]
-        job_display_name = job_display_name[0:3]
-
-        # run hopt
-        logging.info('submitting jobs...')
-        cluster.optimize_parallel_cluster_gpu(
-            main,
-            nb_trials=hyperparams.nb_hopt_trials,
-            job_name=job_display_name
-        )
-
-    # run cluster hyperparameter search
-    optimize_on_cluster(hyperparams)
-
-"""
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-
-__all__ = [
-    'LightningTemplateModel'
-]
diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md
index 63fdc7f8c4..4dcf06a74b 100644
--- a/pl_examples/basic_examples/README.md
+++ b/pl_examples/basic_examples/README.md
@@ -1,44 +1,47 @@
 ## Basic Examples
 Use these examples to test how lightning works.
 
-#### Test on CPU
+#### MNIST
+Trains MNIST where the model is defined inside the LightningModule.
 ```bash
-python cpu_template.py
+# cpu
+python mnist.py
+
+# gpus (any number)
+python mnist.py --gpus 2
+
+# dataparallel
+python mnist.py --gpus 2 --distributed_backend 'dp'
 ```
 
 ---
-#### Train on a single GPU
+#### Image classifier
+Generic image classifier with an arbitrary backbone (i.e. a simple system)
 ```bash
-python gpu_template.py --gpus 1
-```
+# cpu
+python image_classifier.py
 
+# gpus (any number)
+python image_classifier.py --gpus 2
+
+# dataparallel
+python image_classifier.py --gpus 2 --distributed_backend 'dp'
+```
+
+---
+#### Autoencoder
+Shows the power of a system: an arbitrarily complex training loop
+```bash
+# cpu
+python autoencoder.py
+
+# gpus (any number)
+python autoencoder.py --gpus 2
+
+# dataparallel
+python autoencoder.py --gpus 2 --distributed_backend 'dp'
+```
 ---
-#### DataParallel (dp)
-Train on multiple GPUs using DataParallel.
-
-```bash
-python gpu_template.py --gpus 2 --distributed_backend dp
-```
-
----
-#### DistributedDataParallel (ddp)
-
-Train on multiple GPUs using DistributedDataParallel
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp
-```
-
----
-#### DistributedDataParallel+DP (ddp2)
-
-Train on multiple GPUs using DistributedDataParallel + DataParallel.
-On a single node, uses all GPUs for 1 model. Then shares gradient information
-across nodes.
-```bash
-python gpu_template.py --gpus 2 --distributed_backend ddp2
-```
-
-
 # Multi-node example
 
 This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
@@ -51,12 +54,12 @@ To run this demo do the following:
 #### DDP
 Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
 ```bash
-sbatch ddp_job_submit.sh YourEnv
+sbatch submit_ddp_job.sh YourEnv
 ```
 
 #### DDP2
 Submit this job to run with a different implementation of DistributedDataParallel.
 In this version, each node acts like DataParallel but syncs across nodes like DDP.
 ```bash
-sbatch ddp2_job_submit.sh YourEnv
+sbatch submit_ddp2_job.sh YourEnv
 ```
diff --git a/pl_examples/basic_examples/autoencoder.py b/pl_examples/basic_examples/autoencoder.py
new file mode 100644
index 0000000000..9fe1588617
--- /dev/null
+++ b/pl_examples/basic_examples/autoencoder.py
@@ -0,0 +1,99 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+import pytorch_lightning as pl
+from torch.utils.data import random_split
+from tests.base.datasets import MNIST
+
+
+class LitAutoEncoder(pl.LightningModule):
+
+    def __init__(self):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            nn.Linear(28 * 28, 64),
+            nn.ReLU(),
+            nn.Linear(64, 3)
+        )
+        self.decoder = nn.Sequential(
+            nn.Linear(3, 64),
+            nn.ReLU(),
+            nn.Linear(64, 28 * 28)
+        )
+
+    def forward(self, x):
+        # in lightning, forward defines the prediction/inference actions
+        embedding = self.encoder(x)
+        return embedding
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        x = x.view(x.size(0), -1)
+        z = self.encoder(x)
+        x_hat = self.decoder(z)
+        loss = F.mse_loss(x_hat, x)
+        return pl.TrainResult(loss, checkpoint_on=loss)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
+        return optimizer
+
+
+def cli_main():
+    pl.seed_everything(1234)
+
+    # ------------
+    # args
+    # ------------
+    parser = ArgumentParser()
+    parser.add_argument('--batch_size', default=32, type=int)
+    parser.add_argument('--hidden_dim', type=int, default=128)
+    parser = pl.Trainer.add_argparse_args(parser)
+    args = parser.parse_args()
+
+    # ------------
+    # data
+    # ------------
+    dataset = MNIST('', train=True, download=True)
+    mnist_test = MNIST('', train=False, download=True)
+    mnist_train, mnist_val = random_split(dataset, [55000, 5000])
+
+    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
+    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
+    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)
+
+    # ------------
+    # model
+    # ------------
+    model = LitAutoEncoder()
+
+    # ------------
+    # training
+    # ------------
+    trainer = pl.Trainer.from_argparse_args(args)
+    trainer.fit(model, train_loader, val_loader)
+
+    # ------------
+    # testing
+    # ------------
+    trainer.test(test_dataloaders=test_loader)
+
+
+if __name__ == '__main__':
+    cli_main()
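A minimal usage sketch for the `LitAutoEncoder` above: since `forward` returns the learned embedding, inference is a plain call on a flattened batch. The checkpoint path here is a placeholder.

```python
import torch

from pl_examples.basic_examples.autoencoder import LitAutoEncoder

# 'example.ckpt' is a hypothetical checkpoint written by the Trainer above
model = LitAutoEncoder.load_from_checkpoint('example.ckpt')
model.eval()

x = torch.rand(4, 1, 28, 28)              # fake batch of MNIST-sized images
with torch.no_grad():
    z = model(x.view(x.size(0), -1))      # forward() returns the 3-dim embedding
print(z.shape)                            # torch.Size([4, 3])
```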
diff --git a/pl_examples/basic_examples/cpu_template.py b/pl_examples/basic_examples/cpu_template.py
deleted file mode 100644
index 9d1fb52495..0000000000
--- a/pl_examples/basic_examples/cpu_template.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""
-Runs a model on the CPU on a single node.
-"""
-import os
-from argparse import ArgumentParser
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-from pytorch_lightning import Trainer, seed_everything
-
-seed_everything(234)
-
-
-def main(args):
-    """ Main training routine specific for this project. """
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    model = LightningTemplateModel(**vars(args))
-
-    # ------------------------
-    # 2 INIT TRAINER
-    # ------------------------
-    trainer = Trainer.from_argparse_args(args)
-
-    # ------------------------
-    # 3 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def run_cli():
-    # ------------------------
-    # TRAINING ARGUMENTS
-    # ------------------------
-    # these are project-wide arguments
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    parent_parser = ArgumentParser(add_help=False)
-
-    # each LightningModule defines arguments relevant to it
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    parser = Trainer.add_argparse_args(parser)
-    args = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    main(args)
-
-
-if __name__ == '__main__':
-    run_cli()
diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py
deleted file mode 100644
index 64cbabb10b..0000000000
--- a/pl_examples/basic_examples/gpu_template.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-Runs a model on a single node across multiple gpus.
-"""
-import os
-from argparse import ArgumentParser
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-from pytorch_lightning import Trainer, seed_everything
-
-seed_everything(234)
-
-
-def main(args):
-    """ Main training routine specific for this project. """
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    model = LightningTemplateModel(**vars(args))
-
-    # ------------------------
-    # 2 INIT TRAINER
-    # ------------------------
-    trainer = Trainer.from_argparse_args(args)
-
-    # ------------------------
-    # 3 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def run_cli():
-    # ------------------------
-    # TRAINING ARGUMENTS
-    # ------------------------
-    # these are project-wide arguments
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    parent_parser = ArgumentParser(add_help=False)
-
-    # each LightningModule defines arguments relevant to it
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    parser = Trainer.add_argparse_args(parser)
-    parser.set_defaults(gpus=2)
-    args = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    main(args)
-
-
-if __name__ == '__main__':
-    run_cli()
diff --git a/pl_examples/basic_examples/image_classifier.py b/pl_examples/basic_examples/image_classifier.py
new file mode 100644
index 0000000000..a06183a64d
--- /dev/null
+++ b/pl_examples/basic_examples/image_classifier.py
@@ -0,0 +1,123 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser
+
+import torch
+import pytorch_lightning as pl
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from tests.base.datasets import MNIST
+
+
+class Backbone(torch.nn.Module):
+    def __init__(self, hidden_dim=128):
+        super().__init__()
+        self.l1 = torch.nn.Linear(28 * 28, hidden_dim)
+        self.l2 = torch.nn.Linear(hidden_dim, 10)
+
+    def forward(self, x):
+        x = x.view(x.size(0), -1)
+        x = torch.relu(self.l1(x))
+        x = torch.relu(self.l2(x))
+        return x
+
+
+class LitClassifier(pl.LightningModule):
+    def __init__(self, backbone, learning_rate=1e-3):
+        super().__init__()
+        self.save_hyperparameters()
+        self.backbone = backbone
+
+    def forward(self, x):
+        # use forward for inference/predictions
+        embedding = self.backbone(x)
+        return embedding
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        loss = F.cross_entropy(y_hat, y)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        loss = F.cross_entropy(y_hat, y)
+        result = pl.EvalResult(checkpoint_on=loss)
+        result.log('valid_loss', loss)
+        return result
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        loss = F.cross_entropy(y_hat, y)
+        result = pl.EvalResult(checkpoint_on=loss)
+        result.log('test_loss', loss)
+        return result
+
+    def configure_optimizers(self):
+        # self.hparams available because we called self.save_hyperparameters()
+        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
+
+    @staticmethod
+    def add_model_specific_args(parent_parser):
+        parser = ArgumentParser(parents=[parent_parser], add_help=False)
+        parser.add_argument('--learning_rate', type=float, default=0.0001)
+        return parser
+
+
+def cli_main():
+    pl.seed_everything(1234)
+
+    # ------------
+    # args
+    # ------------
+    parser = ArgumentParser()
+    parser.add_argument('--batch_size', default=32, type=int)
+    parser.add_argument('--hidden_dim', type=int, default=128)
+    parser = pl.Trainer.add_argparse_args(parser)
+    parser = LitClassifier.add_model_specific_args(parser)
+    args = parser.parse_args()
+
+    # ------------
+    # data
+    # ------------
+    dataset = MNIST('', train=True, download=True)
+    mnist_test = MNIST('', train=False, download=True)
+    mnist_train, mnist_val = random_split(dataset, [55000, 5000])
+
+    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
+    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
+    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)
+
+    # ------------
+    # model
+    # ------------
+    model = LitClassifier(Backbone(hidden_dim=args.hidden_dim), args.learning_rate)
+
+    # ------------
+    # training
+    # ------------
+    trainer = pl.Trainer.from_argparse_args(args)
+    trainer.fit(model, train_loader, val_loader)
+
+    # ------------
+    # testing
+    # ------------
+    trainer.test(test_dataloaders=test_loader)
+
+
+if __name__ == '__main__':
+    cli_main()
diff --git a/pl_examples/basic_examples/mnist.py b/pl_examples/basic_examples/mnist.py
new file mode 100644
index 0000000000..2613b89d2d
--- /dev/null
+++ b/pl_examples/basic_examples/mnist.py
@@ -0,0 +1,111 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from argparse import ArgumentParser
+
+import torch
+import pytorch_lightning as pl
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from tests.base.datasets import MNIST
+
+
+class LitClassifier(pl.LightningModule):
+    def __init__(self, hidden_dim=128, learning_rate=1e-3):
+        super().__init__()
+        self.save_hyperparameters()
+
+        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
+        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)
+
+    def forward(self, x):
+        x = x.view(x.size(0), -1)
+        x = torch.relu(self.l1(x))
+        x = torch.relu(self.l2(x))
+        return x
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = F.cross_entropy(y_hat, y)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = F.cross_entropy(y_hat, y)
+        result = pl.EvalResult(checkpoint_on=loss)
+        result.log('valid_loss', loss)
+        return result
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = F.cross_entropy(y_hat, y)
+        result = pl.EvalResult(checkpoint_on=loss)
+        result.log('test_loss', loss)
+        return result
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
+
+    @staticmethod
+    def add_model_specific_args(parent_parser):
+        parser = ArgumentParser(parents=[parent_parser], add_help=False)
+        parser.add_argument('--hidden_dim', type=int, default=128)
+        parser.add_argument('--learning_rate', type=float, default=0.0001)
+        return parser
+
+
+def cli_main():
+    pl.seed_everything(1234)
+
+    # ------------
+    # args
+    # ------------
+    parser = ArgumentParser()
+    parser.add_argument('--batch_size', default=32, type=int)
+    parser = pl.Trainer.add_argparse_args(parser)
+    parser = LitClassifier.add_model_specific_args(parser)
+    args = parser.parse_args()
+
+    # ------------
+    # data
+    # ------------
+    dataset = MNIST('', train=True, download=True)
+    mnist_test = MNIST('', train=False, download=True)
+    mnist_train, mnist_val = random_split(dataset, [55000, 5000])
+
+    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
+    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
+    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)
+
+    # ------------
+    # model
+    # ------------
+    model = LitClassifier(args.hidden_dim, args.learning_rate)
+
+    # ------------
+    # training
+    # ------------
+    trainer = pl.Trainer.from_argparse_args(args)
+    trainer.fit(model, train_loader, val_loader)
+
+    # ------------
+    # testing
+    # ------------
+    trainer.test(test_dataloaders=test_loader)
+
+
+if __name__ == '__main__':
+    cli_main()
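Because `save_hyperparameters()` is called in `__init__`, the checkpoint stores `hidden_dim` and `learning_rate`, so the classifier can be restored without repeating the constructor arguments. A minimal sketch, with a placeholder checkpoint path:

```python
from pl_examples.basic_examples.mnist import LitClassifier

# 'example.ckpt' is a placeholder for a checkpoint produced by the Trainer above
model = LitClassifier.load_from_checkpoint('example.ckpt')
print(model.hparams.hidden_dim, model.hparams.learning_rate)
```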
diff --git a/pl_examples/basic_examples/multi_node_ddp2_demo.py b/pl_examples/basic_examples/multi_node_ddp2_demo.py
deleted file mode 100644
index aead1fba1e..0000000000
--- a/pl_examples/basic_examples/multi_node_ddp2_demo.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""
-Multi-node example (GPU)
-"""
-import os
-from argparse import ArgumentParser
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-from pytorch_lightning import Trainer, seed_everything
-
-seed_everything(234)
-
-
-def main(args):
-    """Main training routine specific for this project."""
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    model = LightningTemplateModel(args)
-
-    # ------------------------
-    # 2 INIT TRAINER
-    # ------------------------
-    trainer = Trainer(
-        gpus=args.gpus,
-        num_nodes=args.num_nodes,
-        distributed_backend='ddp2',
-        max_epochs=args.max_epochs,
-        max_steps=args.max_steps,
-    )
-
-    # ------------------------
-    # 3 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def run_cli():
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    parent_parser = ArgumentParser(add_help=False)
-
-    # each LightningModule defines arguments relevant to it
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    parser = Trainer.add_argparse_args(parser)
-    args = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    main(args)
-
-
-if __name__ == '__main__':
-    run_cli()
diff --git a/pl_examples/basic_examples/multi_node_ddp_demo.py b/pl_examples/basic_examples/multi_node_ddp_demo.py
deleted file mode 100644
index 84f241391e..0000000000
--- a/pl_examples/basic_examples/multi_node_ddp_demo.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""
-Multi-node example (GPU)
-"""
-import os
-from argparse import ArgumentParser
-
-from pl_examples.models.lightning_template import LightningTemplateModel
-from pytorch_lightning import Trainer, seed_everything
-
-seed_everything(234)
-
-
-def main(args):
-    """Main training routine specific for this project."""
-    # ------------------------
-    # 1 INIT LIGHTNING MODEL
-    # ------------------------
-    model = LightningTemplateModel(args)
-
-    # ------------------------
-    # 2 INIT TRAINER
-    # ------------------------
-    trainer = Trainer(
-        gpus=args.gpus,
-        num_nodes=args.num_nodes,
-        distributed_backend='ddp',
-        max_epochs=args.max_epochs,
-        max_steps=args.max_steps,
-    )
-
-    # ------------------------
-    # 3 START TRAINING
-    # ------------------------
-    trainer.fit(model)
-
-
-def run_cli():
-    root_dir = os.path.dirname(os.path.realpath(__file__))
-    parent_parser = ArgumentParser(add_help=False)
-
-    # each LightningModule defines arguments relevant to it
-    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
-    parser = Trainer.add_argparse_args(parser)
-    args = parser.parse_args()
-
-    # ---------------------
-    # RUN TRAINING
-    # ---------------------
-    main(args)
-
-
-if __name__ == '__main__':
-    run_cli()
diff --git a/pl_examples/basic_examples/submit_ddp2_job.sh b/pl_examples/basic_examples/submit_ddp2_job.sh
index 6e433f5fcd..e5d5801ac7 100755
--- a/pl_examples/basic_examples/submit_ddp2_job.sh
+++ b/pl_examples/basic_examples/submit_ddp2_job.sh
@@ -24,4 +24,4 @@ source activate $1
 
 # -------------------------
 # run script from above
-srun python3 multi_node_ddp2_demo.py
+srun python3 image_classifier.py --distributed_backend 'ddp2' --gpus 2 --num_nodes 2
diff --git a/pl_examples/basic_examples/submit_ddp_job.sh b/pl_examples/basic_examples/submit_ddp_job.sh
index bf53a65368..8c8e6d6313 100755
--- a/pl_examples/basic_examples/submit_ddp_job.sh
+++ b/pl_examples/basic_examples/submit_ddp_job.sh
@@ -24,4 +24,4 @@ source activate $1
 
 # -------------------------
 # run script from above
-srun python3 multi_node_ddp_demo.py
+srun python3 image_classifier.py --distributed_backend 'ddp' --gpus 2 --num_nodes 2
diff --git a/pl_examples/domain_templates/semantic_segmentation.py b/pl_examples/domain_templates/semantic_segmentation.py
index fff3d1cf98..af37575bbf 100644
--- a/pl_examples/domain_templates/semantic_segmentation.py
+++ b/pl_examples/domain_templates/semantic_segmentation.py
@@ -10,7 +10,7 @@ from PIL import Image
 from torch.utils.data import DataLoader, Dataset
 
 import pytorch_lightning as pl
-from pl_examples.models.unet import UNet
+from pl_examples.domain_templates.unet import UNet
 from pytorch_lightning.loggers import WandbLogger
 
 DEFAULT_VOID_LABELS = (0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1)
diff --git a/pl_examples/models/unet.py b/pl_examples/domain_templates/unet.py
similarity index 100%
rename from pl_examples/models/unet.py
rename to pl_examples/domain_templates/unet.py
diff --git a/pl_examples/models/lightning_template.py b/pl_examples/models/lightning_template.py
deleted file mode 100644
index 099d6b5a65..0000000000
--- a/pl_examples/models/lightning_template.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""
-Example template for defining a system.
-"""
-import os
-from argparse import ArgumentParser
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchvision.transforms as transforms
-from torch import optim
-from torch.utils.data import DataLoader
-from torchvision.datasets import MNIST
-
-from pytorch_lightning.core import LightningModule
-
-
-class LightningTemplateModel(LightningModule):
-    """
-    Sample model to show how to define a template.
-
-    Example:
-
-        >>> # define simple Net for MNIST dataset
-        >>> params = dict(
-        ...     in_features=28 * 28,
-        ...     hidden_dim=1000,
-        ...     out_features=10,
-        ...     drop_prob=0.2,
-        ...     learning_rate=0.001 * 8,
-        ...     batch_size=2,
-        ...     data_root='./datasets',
-        ...     num_workers=4,
-        ... )
-        >>> model = LightningTemplateModel(**params)
-    """
-
-    def __init__(self,
-                 in_features: int = 28 * 28,
-                 hidden_dim: int = 1000,
-                 out_features: int = 10,
-                 drop_prob: float = 0.2,
-                 learning_rate: float = 0.001 * 8,
-                 batch_size: int = 2,
-                 data_root: str = './datasets',
-                 num_workers: int = 4,
-                 **kwargs
-                 ):
-        # init superclass
-        super().__init__()
-        # save all variables in __init__ signature to self.hparams
-        self.save_hyperparameters()
-
-        self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
-                              out_features=self.hparams.hidden_dim)
-        self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
-        self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)
-
-        self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
-                              out_features=self.hparams.out_features)
-
-        self.example_input_array = torch.zeros(2, 1, 28, 28)
-
-    def forward(self, x):
-        """
-        No special modification required for Lightning, define it as you normally would
-        in the `nn.Module` in vanilla PyTorch.
-        """
-        x = self.c_d1(x.view(x.size(0), -1))
-        x = torch.tanh(x)
-        x = self.c_d1_bn(x)
-        x = self.c_d1_drop(x)
-        x = self.c_d2(x)
-        return x
-
-    def training_step(self, batch, batch_idx):
-        """
-        Lightning calls this inside the training loop with the data from the training dataloader
-        passed in as `batch`.
-        """
-        # forward pass
-        x, y = batch
-        y_hat = self(x)
-        loss = F.cross_entropy(y_hat, y)
-        tensorboard_logs = {'train_loss': loss}
-        return {'loss': loss, 'log': tensorboard_logs}
-
-    def validation_step(self, batch, batch_idx):
-        """
-        Lightning calls this inside the validation loop with the data from the validation dataloader
-        passed in as `batch`.
-        """
-        x, y = batch
-        y_hat = self(x)
-        val_loss = F.cross_entropy(y_hat, y)
-        labels_hat = torch.argmax(y_hat, dim=1)
-        n_correct_pred = torch.sum(y == labels_hat).item()
-        return {'val_loss': val_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)}
-
-    def test_step(self, batch, batch_idx):
-        x, y = batch
-        y_hat = self(x)
-        test_loss = F.cross_entropy(y_hat, y)
-        labels_hat = torch.argmax(y_hat, dim=1)
-        n_correct_pred = torch.sum(y == labels_hat).item()
-        return {'test_loss': test_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)}
-
-    def validation_epoch_end(self, outputs):
-        """
-        Called at the end of validation to aggregate outputs.
-        :param outputs: list of individual outputs of each validation step.
-        """
-        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
-        val_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs)
-        tensorboard_logs = {'val_loss': avg_loss, 'val_acc': val_acc}
-        return {'val_loss': avg_loss, 'log': tensorboard_logs}
-
-    def test_epoch_end(self, outputs):
-        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
-        test_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs)
-        tensorboard_logs = {'test_loss': avg_loss, 'test_acc': test_acc}
-        return {'test_loss': avg_loss, 'log': tensorboard_logs}
-
-    # ---------------------
-    # TRAINING SETUP
-    # ---------------------
-    def configure_optimizers(self):
-        """
-        Return whatever optimizers and learning rate schedulers you want here.
-        At least one optimizer is required.
-        """
-        optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
-        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
-        return [optimizer], [scheduler]
-
-    def prepare_data(self):
-        MNIST(self.hparams.data_root, train=True, download=True, transform=transforms.ToTensor())
-        MNIST(self.hparams.data_root, train=False, download=True, transform=transforms.ToTensor())
-
-    def setup(self, stage):
-        transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
-        self.mnist_train = MNIST(self.hparams.data_root, train=True, download=False, transform=transform)
-        self.mnist_test = MNIST(self.hparams.data_root, train=False, download=False, transform=transform)
-
-    def train_dataloader(self):
-        return DataLoader(self.mnist_train, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers)
-
-    def val_dataloader(self):
-        return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers)
-
-    def test_dataloader(self):
-        return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers)
-
-    @staticmethod
-    def add_model_specific_args(parent_parser, root_dir):  # pragma: no-cover
-        """
-        Define parameters that only apply to this model
-        """
-        parser = ArgumentParser(parents=[parent_parser])
-
-        # param overwrites
-        # parser.set_defaults(gradient_clip_val=5.0)
-
-        # network params
-        parser.add_argument('--in_features', default=28 * 28, type=int)
-        parser.add_argument('--hidden_dim', default=50000, type=int)
-        # use 500 for CPU, 50000 for GPU to see speed difference
-        parser.add_argument('--out_features', default=10, type=int)
-        parser.add_argument('--drop_prob', default=0.2, type=float)
-
-        # data
-        parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
-        parser.add_argument('--num_workers', default=4, type=int)
-
-        # training params (opt)
-        parser.add_argument('--epochs', default=20, type=int)
-        parser.add_argument('--batch_size', default=64, type=int)
-        parser.add_argument('--learning_rate', default=0.001, type=float)
-        return parser
diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py
deleted file mode 100644
index 67f69663a3..0000000000
--- a/pl_examples/test_examples.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import os
-from unittest import mock
-
-import numpy as np
-import pytest
-import torch
-from PIL import Image
-
-
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3'])
-def test_cpu_template(cli_args):
-    """Test running CLI for an example with default params."""
-    from pl_examples.basic_examples.cpu_template import run_cli
-
-    cli_args = cli_args.split(' ') if cli_args else []
-    with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-        run_cli()
-
-
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --gpus 1'])
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_gpu_template(cli_args):
-    """Test running CLI for an example with default params."""
-    from pl_examples.basic_examples.gpu_template import run_cli
-
-    cli_args = cli_args.split(' ') if cli_args else []
-    with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-        run_cli()
-
-
-@pytest.mark.parametrize('cli_args', [
-    '--max_epochs 1 --gpus 1',
-    '--max_epochs 1 --gpus 1 --evaluate',
-])
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_imagenet(tmpdir, cli_args):
-    """Test running CLI for the ImageNet example with default params."""
-
-    from pl_examples.domain_templates.imagenet import run_cli
-
-    # https://github.com/pytorch/vision/blob/master/test/fakedata_generation.py#L105
-    def _make_image(file_path):
-        Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8)).save(file_path)
-
-    for split in ['train', 'val']:
-        for class_id in ['a', 'b']:
-            os.makedirs(os.path.join(tmpdir, split, class_id))
-            # Generate 5 black images
-            for image_id in range(5):
-                _make_image(os.path.join(tmpdir, split, class_id, str(image_id) + '.JPEG'))
-
-    cli_args = cli_args.split(' ') if cli_args else []
-    cli_args += ['--data-path', str(tmpdir)]
-    cli_args += ['--default_root_dir', str(tmpdir)]
-
-    with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-        run_cli()
-
-
-# @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2'])
-# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-# def test_multi_node_ddp(cli_args):
-#     """Test running CLI for an example with default params."""
-#     from pl_examples.basic_examples.multi_node_ddp_demo import run_cli
-#
-#     cli_args = cli_args.split(' ') if cli_args else []
-#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-#         run_cli()
-
-
-# @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2'])
-# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-# def test_multi_node_ddp2(cli_args):
-#     """Test running CLI for an example with default params."""
-#     from pl_examples.basic_examples.multi_node_ddp2_demo import run_cli
-#
-#     cli_args = cli_args.split(' ') if cli_args else []
-#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-#         run_cli()
diff --git a/pytorch_lightning/accelerators/dp_backend.py b/pytorch_lightning/accelerators/dp_backend.py
index 4bced57e94..0c1229781e 100644
--- a/pytorch_lightning/accelerators/dp_backend.py
+++ b/pytorch_lightning/accelerators/dp_backend.py
@@ -121,16 +121,22 @@ class DataParallelBackend(Accelerator):
     def training_step_end(self, output):
         if isinstance(output, Result):
             output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def validation_step_end(self, output):
         if isinstance(output, Result):
            output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def test_step_end(self, output):
         if isinstance(output, Result):
             output.dp_reduce()
+        elif isinstance(output, torch.Tensor):
+            output = output.mean()
         return output
 
     def reinit_scheduler_properties(self, optimizers: list, schedulers: list):
diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py
index 921c5ef082..78f16e8bc1 100644
--- a/pytorch_lightning/overrides/data_parallel.py
+++ b/pytorch_lightning/overrides/data_parallel.py
@@ -24,6 +24,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.nn.parallel._functions import Gather
 
 from pytorch_lightning.core.step_result import Result
+from pytorch_lightning.utilities.warning_utils import WarningCache
 
 
 def _find_tensors(obj):  # pragma: no-cover
@@ -54,6 +55,9 @@ def get_a_var(obj):  # pragma: no-cover
     return None
 
 
+warning_cache = WarningCache()
+
+
 class LightningDataParallel(DataParallel):
     """
     Override the forward call in lightning so it goes to training and validation step respectively
@@ -157,6 +161,8 @@ class LightningDistributedDataParallel(DistributedDataParallel):
 
     def forward(self, *inputs, **kwargs):  # pragma: no-cover
         self._sync_params()
+        fx_called: str = ''
+
         if self.device_ids:
             inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
             if len(self.device_ids) == 1:
@@ -168,10 +174,13 @@ class LightningDistributedDataParallel(DistributedDataParallel):
                 # lightning
                 if self.module.training:
                     output = self.module.training_step(*inputs[0], **kwargs[0])
+                    fx_called = 'training_step'
                 elif self.module.testing:
                     output = self.module.test_step(*inputs[0], **kwargs[0])
+                    fx_called = 'test_step'
                 else:
                     output = self.module.validation_step(*inputs[0], **kwargs[0])
+                    fx_called = 'validation_step'
             else:
                 outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs)
                 output = self.gather(outputs, self.output_device)
@@ -195,9 +204,29 @@ class LightningDistributedDataParallel(DistributedDataParallel):
                 self.reducer.prepare_for_backward(list(_find_tensors(output)))
             else:
                 self.reducer.prepare_for_backward([])
+
+        if output is None:
+            warn_missing_output(fx_called)
+
         return output
 
 
+def warn_missing_output(fx_called):
+    if fx_called == 'training_step':
+        m = """
+        Your training_step returned None. You should instead do:
+            return loss
+        or
+            return TrainResult
+        """
+    elif fx_called in ['validation_step', 'test_step']:
+        m = f"""
+            Your {fx_called} returned None. You should instead do:
+                return EvalResult
+            """
+    warning_cache.warn(m)
+
+
 def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):  # pragma: no-cover
     r"""Applies each `module` in :attr:`modules` in parallel on arguments
     contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
@@ -229,6 +259,7 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):  # pragma: n
 
     def _worker(i, module, input, kwargs, device=None):
         torch.set_grad_enabled(grad_enabled)
+        fx_called: str = ''
         if device is None:
             device = get_a_var(input).get_device()
         try:
@@ -243,14 +274,18 @@ def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):  # pragma: n
             # CHANGE
             if module.training:
                 output = module.training_step(*input, **kwargs)
-
+                fx_called = 'training_step'
             elif module.testing:
                 output = module.test_step(*input, **kwargs)
-
+                fx_called = 'test_step'
             else:
                 output = module.validation_step(*input, **kwargs)
+                fx_called = 'validation_step'
 
-            if module.use_dp or module.use_ddp2:
+            if output is None:
+                warn_missing_output(fx_called)
+
+            if output is not None and (module.use_dp or module.use_ddp2):
                 auto_squeeze_dim_zeros(output)
 
             # ---------------
@@ -296,6 +331,10 @@ def auto_squeeze_dim_zeros(output):
     :param output:
     :return:
     """
+    if isinstance(output, torch.Tensor):
+        output = output.unsqueeze(0)
+        return output
+
     for k, v in output.items():
         if not isinstance(v, torch.Tensor):
             continue
diff --git a/pytorch_lightning/utilities/warning_utils.py b/pytorch_lightning/utilities/warning_utils.py
new file mode 100644
index 0000000000..242d5c0695
--- /dev/null
+++ b/pytorch_lightning/utilities/warning_utils.py
@@ -0,0 +1,12 @@
+from pytorch_lightning.utilities.distributed import rank_zero_warn
+
+
+class WarningCache:
+
+    def __init__(self):
+        self.warnings = set()
+
+    def warn(self, m):
+        if m not in self.warnings:
+            self.warnings.add(m)
+            rank_zero_warn(m)
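The `WarningCache` added above only forwards a message to `rank_zero_warn` the first time it is seen, which is what keeps the missing-output warning from flooding the logs on every step. A minimal sketch of that behaviour:

```python
from pytorch_lightning.utilities.warning_utils import WarningCache

cache = WarningCache()
for _ in range(3):
    # identical messages are warned only once
    cache.warn("your training_step returned None")
```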
diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py
index 9b3e84fe94..22b581d4eb 100644
--- a/tests/base/develop_utils.py
+++ b/tests/base/develop_utils.py
@@ -3,7 +3,6 @@ import os
 
 import numpy as np
 
-# from pl_examples import LightningTemplateModel
 from pytorch_lightning import seed_everything
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger
diff --git a/pl_examples/models/__init__.py b/tests/examples/__init__.py
similarity index 100%
rename from pl_examples/models/__init__.py
rename to tests/examples/__init__.py
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
new file mode 100644
index 0000000000..7fe5d4ed60
--- /dev/null
+++ b/tests/examples/test_examples.py
@@ -0,0 +1,94 @@
+from unittest import mock
+import torch
+import pytest
+
+dp_16_args = """
+--max_epochs 1 \
+--batch_size 32 \
+--limit_train_batches 2 \
+--limit_val_batches 2 \
+--gpus 2 \
+--distributed_backend dp \
+--precision 16 \
+"""
+
+cpu_args = """
+--max_epochs 1 \
+--batch_size 32 \
+--limit_train_batches 2 \
+--limit_val_batches 2 \
+"""
+
+ddp_args = """
+--max_epochs 1 \
+--batch_size 32 \
+--limit_train_batches 2 \
+--limit_val_batches 2 \
+--gpus 2 \
+--precision 16 \
+"""
+
+
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [dp_16_args])
+# def test_examples_dp_mnist(cli_args):
+#     from pl_examples.basic_examples.mnist import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+
+
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [dp_16_args])
+# def test_examples_dp_image_classifier(cli_args):
+#     from pl_examples.basic_examples.image_classifier import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+#
+#
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [dp_16_args])
+# def test_examples_dp_autoencoder(cli_args):
+#     from pl_examples.basic_examples.autoencoder import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+
+
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [ddp_args])
+# def test_examples_ddp_mnist(cli_args):
+#     from pl_examples.basic_examples.mnist import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+#
+#
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [ddp_args])
+# def test_examples_ddp_image_classifier(cli_args):
+#     from pl_examples.basic_examples.image_classifier import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+#
+#
+# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+# @pytest.mark.parametrize('cli_args', [ddp_args])
+# def test_examples_ddp_autoencoder(cli_args):
+#     from pl_examples.basic_examples.autoencoder import cli_main
+#
+#     with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+#         cli_main()
+#
+
+@pytest.mark.parametrize('cli_args', [cpu_args])
+def test_examples_cpu(cli_args):
+    from pl_examples.basic_examples.mnist import cli_main as mnist_cli
+    from pl_examples.basic_examples.image_classifier import cli_main as ic_cli
+    from pl_examples.basic_examples.autoencoder import cli_main as ae_cli
+
+    for cli_cmd in [mnist_cli, ic_cli, ae_cli]:
+        with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()):
+            cli_cmd()