From 7d97e3e6e47d63a8e219e06ecea8c61c53a23afc Mon Sep 17 00:00:00 2001 From: Phuc Le Date: Wed, 24 Jul 2019 12:12:45 +0700 Subject: [PATCH 01/48] Support any lr_scheduler --- README.md | 1 - .../RequiredTrainerInterface.md | 21 ++++++++++--------- docs/Trainer/Training Loop.md | 11 ---------- docs/Trainer/index.md | 1 - docs/index.md | 1 - .../lightning_module_template.py | 4 ++-- .../sample_model_template/model_template.py | 5 +++-- pytorch_lightning/models/trainer.py | 15 +++---------- pytorch_lightning/root_module/root_module.py | 2 +- 9 files changed, 20 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 85b27c204d..fc975ab54a 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,6 @@ tensorboard --logdir /some/path ###### Training loop - [Accumulate gradients](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#accumulated-gradients) -- [Anneal Learning rate](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 96522c7eec..f5e36e8f37 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -222,26 +222,27 @@ def validation_end(self, outputs): def configure_optimizers(self) ``` -Set up as many optimizers as you need. Normally you'd need one. But in the case of GANs or something more esoteric you might have multiple. -Lightning will call .backward() and .step() on each one. If you use 16 bit precision it will also handle that. +Set up as many optimizers and (optionally) learning rate schedulers as you need. Normally you'd need one. But in the case of GANs or something more esoteric you might have multiple. +Lightning will call .backward() and .step() on each one in every epoch. If you use 16 bit precision it will also handle that. 
##### Return -List - List of optimizers +Tuple - List of optimizers and list of schedulers **Example** ``` {.python} # most cases def configure_optimizers(self): - opt = Adam(lr=0.01) - return [opt] + opt = Adam(self.model.parameters(), lr=0.01) + return [opt], [] -# gan example +# gan example, with scheduler for discriminator def configure_optimizers(self): - generator_opt = Adam(lr=0.01) - disriminator_opt = Adam(lr=0.02) - return [generator_opt, disriminator_opt] + generator_opt = Adam(self.model_gen.parameters(), lr=0.01) + disriminator_opt = Adam(self.model_disc.parameters(), lr=0.02) + discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) + return [generator_opt, disriminator_opt], [discriminator_sched] ``` --- @@ -427,4 +428,4 @@ def add_model_specific_args(parent_parser, root_dir): parser.opt_list('--batch_size', default=256, type=int, options=[32, 64, 128, 256], tunable=False) parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) return parser -``` \ No newline at end of file +``` diff --git a/docs/Trainer/Training Loop.md b/docs/Trainer/Training Loop.md index 2be8da9edd..7a8a7c6058 100644 --- a/docs/Trainer/Training Loop.md +++ b/docs/Trainer/Training Loop.md @@ -11,17 +11,6 @@ Accumulated gradients runs K small batches of size N before doing a backwards pa trainer = Trainer(accumulate_grad_batches=1) ``` ---- -#### Anneal Learning rate -Cut the learning rate by 10 at every epoch listed in this list. -``` {.python} -# DEFAULT (don't anneal) -trainer = Trainer(lr_scheduler_milestones=None) - -# cut LR by 10 at 100, 200, and 300 epochs -trainer = Trainer(lr_scheduler_milestones='100, 200, 300') -``` - --- #### Force training for min or max epochs It can be useful to force training for a minimum number of epochs or limit to a max number diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 1b30da1966..d670ec8b32 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -59,7 +59,6 @@ But of course the fun is in all the advanced things it can do: **Training loop** - [Accumulate gradients](Training%20Loop/#accumulated-gradients) -- [Anneal Learning rate](Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](Training%20Loop/#force-disable-early-stop) - [Use multiple optimizers (like GANs)](../Pytorch-lightning/LightningModule/#configure_optimizers) diff --git a/docs/index.md b/docs/index.md index 0e25fa79d5..91d344685b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,7 +60,6 @@ To start a new project define these two files. 
###### Training loop - [Accumulate gradients](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#accumulated-gradients) -- [Anneal Learning rate](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) diff --git a/pytorch_lightning/examples/new_project_templates/lightning_module_template.py b/pytorch_lightning/examples/new_project_templates/lightning_module_template.py index 0a4dab2692..6e48bb3650 100644 --- a/pytorch_lightning/examples/new_project_templates/lightning_module_template.py +++ b/pytorch_lightning/examples/new_project_templates/lightning_module_template.py @@ -174,7 +174,8 @@ class LightningTemplateModel(LightningModule): :return: list of optimizers """ optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - return [optimizer] + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) + return [optimizer], [scheduler] def __dataloader(self, train): # init data generators @@ -231,7 +232,6 @@ class LightningTemplateModel(LightningModule): # parser.set_defaults(gradient_clip=5.0) # network params - parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) parser.add_argument('--in_features', default=28*28, type=int) parser.add_argument('--out_features', default=10, type=int) parser.add_argument('--hidden_dim', default=50000, type=int) # use 500 for CPU, 50000 for GPU to see speed difference diff --git a/pytorch_lightning/models/sample_model_template/model_template.py b/pytorch_lightning/models/sample_model_template/model_template.py index 10f12c59a1..0c446a11ca 100644 --- a/pytorch_lightning/models/sample_model_template/model_template.py +++ b/pytorch_lightning/models/sample_model_template/model_template.py @@ -128,12 +128,13 @@ class ExampleModel1(LightningModule): # --------------------- def configure_optimizers(self): """ - return whatever optimizers we want here + return whatever optimizers and (optionally) schedulers we want here :return: list of optimizers """ optimizer = self.choose_optimizer(self.hparams.optimizer_name, self.parameters(), {'lr': self.hparams.learning_rate}, 'optimizer') self.optimizers = [optimizer] - return self.optimizers + self.schedulers = [] + return self.optimizers, self.schedulers def __dataloader(self, train): # init data generators diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index bbfafd8933..5da1ead938 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -71,7 +71,6 @@ class Trainer(TrainerIO): train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, val_check_interval=0.95, log_save_interval=100, add_log_row_interval=10, - lr_scheduler_milestones=None, distributed_backend='dp', use_amp=False, print_nan_grads=False, @@ -104,7 +103,6 @@ class Trainer(TrainerIO): :param val_check_interval: :param log_save_interval: :param add_log_row_interval: - :param lr_scheduler_milestones: :param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel :param use_amp: :param print_nan_grads: @@ -141,7 +139,6 
@@ class Trainer(TrainerIO): self.early_stop_callback = early_stop_callback self.min_nb_epochs = min_nb_epochs self.nb_sanity_val_steps = nb_sanity_val_steps - self.lr_scheduler_milestones = [] if lr_scheduler_milestones is None else [int(x.strip()) for x in lr_scheduler_milestones.split(',')] self.lr_schedulers = [] self.amp_level = amp_level self.print_nan_grads = print_nan_grads @@ -444,7 +441,7 @@ class Trainer(TrainerIO): # CHOOSE OPTIMIZER # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers = model.configure_optimizers() + self.optimizers, self.lr_schedulers = model.configure_optimizers() self.__run_pretrain_routine(model) @@ -456,7 +453,7 @@ class Trainer(TrainerIO): # CHOOSE OPTIMIZER # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers = model.configure_optimizers() + self.optimizers, self.lr_schedulers = model.configure_optimizers() model.cuda(self.data_parallel_device_ids[0]) @@ -507,7 +504,7 @@ class Trainer(TrainerIO): # CHOOSE OPTIMIZER # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers = model.configure_optimizers() + self.optimizers, self.lr_schedulers = model.configure_optimizers() # MODEL # copy model to each gpu @@ -587,12 +584,6 @@ class Trainer(TrainerIO): # init training constants self.__layout_bookeeping() - # add lr schedulers - if self.lr_scheduler_milestones is not None: - for optimizer in self.optimizers: - scheduler = MultiStepLR(optimizer, self.lr_scheduler_milestones) - self.lr_schedulers.append(scheduler) - # print model summary if self.proc_rank == 0 and self.print_weights_summary: ref_model.summarize() diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index d6d740f039..415934b7da 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -58,7 +58,7 @@ class LightningModule(GradInformation, ModelIO, ModelHooks): def configure_optimizers(self): """ - Return array of optimizers + Return a list of optimizers and a list of schedulers (could be empty) :return: """ raise NotImplementedError From ba25161dcc1be898ba3443a3c3c8d7d5f77b233c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 08:23:37 -0400 Subject: [PATCH 02/48] Update trainer.py --- pytorch_lightning/models/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 5da1ead938..c21cf3be01 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -10,7 +10,6 @@ import re import torch from torch.utils.data.distributed import DistributedSampler -from torch.optim.lr_scheduler import MultiStepLR import torch.multiprocessing as mp import torch.distributed as dist import numpy as np From 5ba0a8fe4cb5fa08913bdab90db496cc11ea62e4 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 08:24:56 -0400 Subject: [PATCH 03/48] Update RequiredTrainerInterface.md --- docs/LightningModule/RequiredTrainerInterface.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index f5e36e8f37..ff9b84916e 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -234,7 +234,7 @@ Tuple - List of optimizers and list of schedulers ``` {.python} # most cases def 
configure_optimizers(self): - opt = Adam(self.model.parameters(), lr=0.01) + opt = Adam(self.parameters(), lr=0.01) return [opt], [] # gan example, with scheduler for discriminator From 98fcc1713542c64ff3ac6a7a1dc2b673f35bb1f9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 08:27:14 -0400 Subject: [PATCH 04/48] Update trainer.py --- pytorch_lightning/models/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index c21cf3be01..9c7545103d 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -612,8 +612,9 @@ class Trainer(TrainerIO): # run all epochs for epoch_nb in range(self.current_epoch, self.max_nb_epochs): # update the lr scheduler - for lr_scheduler in self.lr_schedulers: - lr_scheduler.step() + if self.lr_schedulers is not None: + for lr_scheduler in self.lr_schedulers: + lr_scheduler.step() model = self.__get_model() model.current_epoch = epoch_nb From c7dab0d7856f7035d5c4bdd0b37e550d49428599 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 26 Jul 2019 14:39:04 -0400 Subject: [PATCH 05/48] Update lm_test_module.py --- pytorch_lightning/testing_models/lm_test_module.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/testing_models/lm_test_module.py b/pytorch_lightning/testing_models/lm_test_module.py index e33ee53e33..158f8fbf0c 100644 --- a/pytorch_lightning/testing_models/lm_test_module.py +++ b/pytorch_lightning/testing_models/lm_test_module.py @@ -190,8 +190,9 @@ class LightningTestModel(LightningModule): return whatever optimizers we want here :return: list of optimizers """ + # try no scheduler for this model (testing purposes) optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - return [optimizer] + return [optimizer], [] def __dataloader(self, train): # init data generators From 532604f0561019f15b89c0d82043bc297218f9ce Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 14:26:08 -0400 Subject: [PATCH 06/48] Update README.md --- README.md | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 85b27c204d..315de45f83 100644 --- a/README.md +++ b/README.md @@ -40,21 +40,24 @@ With lightning, you guarantee those parts of your code work so you can focus on To use lightning do 2 things: 1. [Define a LightningModel](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/) ```python -import pytorch_lightning as ptl +import os import torch from torch.nn import functional as F from torch.utils.data import DataLoader from torchvision.datasets import MNIST +import torchvision.transforms as transforms + +import pytorch_lightning as ptl class CoolModel(ptl.LightningModule): - def __init(self): + def __init__(self): super(CoolModel, self).__init__() # not the best model... 
self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): - return torch.relu(self.l1(x)) + return torch.relu(self.l1(x.view(x.size(0), -1))) def my_loss(self, y_hat, y): return F.cross_entropy(y_hat, y) @@ -62,7 +65,7 @@ class CoolModel(ptl.LightningModule): def training_step(self, batch, batch_nb): x, y = batch y_hat = self.forward(x) - return {'tng_loss': self.my_loss(y_hat, y)} + return {'loss': self.my_loss(y_hat, y)} def validation_step(self, batch, batch_nb): x, y = batch @@ -70,23 +73,25 @@ class CoolModel(ptl.LightningModule): return {'val_loss': self.my_loss(y_hat, y)} def validation_end(self, outputs): - avg_loss = torch.stack([x for x in outputs['val_loss']]).mean() - return avg_loss + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + return {'avg_val_loss': avg_loss} def configure_optimizers(self): - return [torch.optim.Adam(self.parameters(), lr=0.02)] + optim = torch.optim.Adam(self.parameters(), lr=0.02) + self.optimizers = [optim] + return self.optimizers @ptl.data_loader def tng_dataloader(self): - return DataLoader(MNIST('path/to/save', train=True), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) @ptl.data_loader def val_dataloader(self): - return DataLoader(MNIST('path/to/save', train=False), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) @ptl.data_loader def test_dataloader(self): - return DataLoader(MNIST('path/to/save', train=False), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) ``` 2. Fit with a [trainer](https://williamfalcon.github.io/pytorch-lightning/Trainer/) @@ -97,13 +102,15 @@ from test_tube import Experiment model = CoolModel() # fit on 32 gpus across 4 nodes -exp = Experiment(save_dir='some/dir') -trainer = Trainer(experiment=exp, nb_gpu_nodes=4, gpus=[0,1,2,3,4,5,6,7]) +model = CoolModel() +exp = Experiment(save_dir=os.getcwd()) +trainer = Trainer(experiment=exp, max_nb_epochs=1) +# train (1 epoch only here for demo) trainer.fit(model) -# see all experiment metrics here -# tensorboard --log_dir some/dir +# view tensorflow logs +print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}') ``` @@ -305,4 +312,4 @@ python multi_node_cluster_template.py --nb_gpu_nodes 4 --gpus '0,1,2,3,4,5,6,7' If you can't wait for the next release, install the most up to date code with: ```bash pip install git+https://github.com/williamFalcon/pytorch-lightning.git@master --upgrade -``` \ No newline at end of file +``` From d6b5f37a7b553ab4b1254dad7ecae1eefb4a6cd6 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 14:28:44 -0400 Subject: [PATCH 07/48] Update README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 315de45f83..d2488324c9 100644 --- a/README.md +++ b/README.md @@ -99,13 +99,18 @@ class CoolModel(ptl.LightningModule): from pytorch_lightning import Trainer from test_tube import Experiment -model = CoolModel() - -# fit on 32 gpus across 4 nodes model = CoolModel() exp = Experiment(save_dir=os.getcwd()) + +# train on cpu trainer = Trainer(experiment=exp, max_nb_epochs=1) +# train on 4 gpus +# trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=[0, 1, 2, 3]) + +# train on 32 gpus across 4 nodes (make sure to submit appropriate SLURM job) +# trainer = Trainer(experiment=exp, max_nb_epochs=1, 
gpus=[0, 1, 2, 3, 4, 5, 6, 7], nb_gpu_nodes=4) + # train (1 epoch only here for demo) trainer.fit(model) From 90b14977a494e8fe971027d2c8381a68a2ed6d7b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 14:31:22 -0400 Subject: [PATCH 08/48] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d2488324c9..9b3bb98502 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,8 @@ trainer = Trainer(experiment=exp, max_nb_epochs=1) trainer.fit(model) # view tensorflow logs -print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}') +print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}') +print('and going to http://localhost:6006 on your browser') ``` From 66188209b545a8a98dc2d49a916eb4b11bf6a29f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 14:33:48 -0400 Subject: [PATCH 09/48] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9b3bb98502..40c2f0da84 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,8 @@ from test_tube import Experiment model = CoolModel() exp = Experiment(save_dir=os.getcwd()) -# train on cpu -trainer = Trainer(experiment=exp, max_nb_epochs=1) +# train on cpu using only 10% of the data (for demo purposes) +trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1) # train on 4 gpus # trainer = Trainer(experiment=exp, max_nb_epochs=1, gpus=[0, 1, 2, 3]) From a48cccdc68cae5a4ba3fe465c936b812770c3ad8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 14:38:33 -0400 Subject: [PATCH 10/48] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 40c2f0da84..fadb2ec1ba 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,7 @@ class CoolModel(ptl.LightningModule): return {'avg_val_loss': avg_loss} def configure_optimizers(self): - optim = torch.optim.Adam(self.parameters(), lr=0.02) - self.optimizers = [optim] - return self.optimizers + return [torch.optim.Adam(self.parameters(), lr=0.02)] @ptl.data_loader def tng_dataloader(self): From 921a3cbabeae4eb868725f1762d66ce084038c44 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 18:27:38 -0400 Subject: [PATCH 11/48] updated docs --- docs/Trainer/Distributed training.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/Trainer/Distributed training.md b/docs/Trainer/Distributed training.md index a7d487b525..aedbd20ed1 100644 --- a/docs/Trainer/Distributed training.md +++ b/docs/Trainer/Distributed training.md @@ -23,6 +23,16 @@ have configuration issues depending on your cluster. For a deeper understanding of what lightning is doing, feel free to read [this guide](https://medium.com/@_willfalcon/9-tips-for-training-lightning-fast-neural-networks-in-pytorch-8e63a502f565). +--- +#### CUDA flags +CUDA flags make certain GPUs visible to your script. +Lightning sets these for you automatically, there's NO NEED to do this yourself. +```python +# lightning will set according to what you give the trainer +# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" +``` + --- #### 16-bit mixed precision 16 bit precision can cut your memory footprint by half. If using volta architecture GPUs it can give a dramatic training speed-up as well. 
@@ -43,10 +53,6 @@ trainer = Trainer(amp_level='O2', use_amp=False) #### Single-gpu Make sure you're on a GPU machine. ```python -# set these flags -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - # DEFAULT trainer = Trainer(gpus=[0]) ``` @@ -56,13 +62,6 @@ trainer = Trainer(gpus=[0]) Make sure you're on a GPU machine. You can set as many GPUs as you want. In this setting, the model will run on all 8 GPUs at once using DataParallel under the hood. ```python -# set these flags -# lightning sets these flags for you automatically -# no need to set yourself -# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" - - # to use DataParallel (default) trainer = Trainer(gpus=[0,1,2,3,4,5,6,7], distributed_backend='dp') From 9aa41ec98d0d4fc9eb01b7e316b5cd800288c4a9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 27 Jul 2019 18:40:29 -0400 Subject: [PATCH 12/48] added tb docs --- docs/Trainer/Logging.md | 27 ++++ pytorch_lightning/trainer_main.py | 210 ------------------------------ 2 files changed, 27 insertions(+), 210 deletions(-) delete mode 100644 pytorch_lightning/trainer_main.py diff --git a/docs/Trainer/Logging.md b/docs/Trainer/Logging.md index 1596251f8b..d3004cb2e3 100644 --- a/docs/Trainer/Logging.md +++ b/docs/Trainer/Logging.md @@ -50,6 +50,33 @@ exp = Experiment(create_git_tag=True) Trainer(experiment=exp) ``` +--- +### Tensorboard support +The experiment object is a strict subclass of Pytorch SummaryWriter. However, this class +also snapshots every detail about the experiment (data folder paths, code, hyperparams), +and allows you to visualize it using tensorboard. +``` {.python} +from test_tube import Experiment, HyperOptArgumentParser + +# exp hyperparams +args = HyperOptArgumentParser() +hparams = args.parse_args() + +# this is a summaryWriter with nicer logging structure +exp = Experiment(save_dir='/some/path', create_git_tag=True) + +# track experiment details (must be ArgumentParser or HyperOptArgumentParser). 
+# each option in the parser is tracked +exp.argparse(hparams) +exp.tag({'description': 'running demo'}) + +# trainer uses the exp object to log exp data +trainer = Trainer(experiment=exp) +trainer.fit(model) + +# view logs at: +# tensorboard --logdir /some/path +``` --- #### Write logs file to csv every k batches diff --git a/pytorch_lightning/trainer_main.py b/pytorch_lightning/trainer_main.py deleted file mode 100644 index 0c30a41968..0000000000 --- a/pytorch_lightning/trainer_main.py +++ /dev/null @@ -1,210 +0,0 @@ -import os -import sys - -import torch -import numpy as np -from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster -from pytorch_lightning.models.trainer import Trainer -from pytorch_lightning.utils.arg_parse import add_default_args -from time import sleep - -from pytorch_lightning.callbacks.pt_callbacks import EarlyStopping, ModelCheckpoint -SEED = 2334 -torch.manual_seed(SEED) -np.random.seed(SEED) - -# --------------------- -# DEFINE MODEL HERE -# --------------------- -from pytorch_lightning.models.sample_model_template.model_template import ExampleModel1 -# --------------------- - -AVAILABLE_MODELS = { - 'model_1': ExampleModel1 -} - - -""" -Allows training by using command line arguments - -Run by: -# TYPE YOUR RUN COMMAND HERE -""" - - -def main_local(hparams): - main(hparams, None, None) - - -def main(hparams, cluster, results_dict): - """ - Main training routine specific for this project - :param hparams: - :return: - """ - on_gpu = torch.cuda.is_available() - if hparams.disable_cuda: - on_gpu = False - - device = 'cuda' if on_gpu else 'cpu' - hparams.__setattr__('device', device) - hparams.__setattr__('on_gpu', on_gpu) - hparams.__setattr__('nb_gpus', torch.cuda.device_count()) - hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) - - # init experiment - exp = Experiment( - name=hparams.tt_name, - debug=hparams.debug, - save_dir=hparams.tt_save_path, - version=hparams.hpc_exp_number, - autosave=False, - description=hparams.tt_description - ) - - exp.argparse(hparams) - exp.save() - - # build model - print('loading model...') - model = TRAINING_MODEL(hparams) - print('model built') - - # callbacks - early_stop = EarlyStopping( - monitor=hparams.early_stop_metric, - patience=hparams.early_stop_patience, - verbose=True, - mode=hparams.early_stop_mode - ) - - model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) - checkpoint = ModelCheckpoint( - filepath=model_save_path, - save_function=None, - save_best_only=True, - verbose=True, - monitor=hparams.model_save_monitor_value, - mode=hparams.model_save_monitor_mode - ) - - # configure trainer - trainer = Trainer( - experiment=exp, - on_gpu=on_gpu, - cluster=cluster, - progress_bar=hparams.enable_tqdm, - overfit_pct=hparams.overfit, - track_grad_norm=hparams.track_grad_norm, - fast_dev_run=hparams.fast_dev_run, - check_val_every_n_epoch=hparams.check_val_every_n_epoch, - accumulate_grad_batches=hparams.accumulate_grad_batches, - process_position=process_position, - current_gpu_name=current_gpu, - checkpoint_callback=checkpoint, - early_stop_callback=early_stop, - enable_early_stop=hparams.enable_early_stop, - max_nb_epochs=hparams.max_nb_epochs, - min_nb_epochs=hparams.min_nb_epochs, - train_percent_check=hparams.train_percent_check, - val_percent_check=hparams.val_percent_check, - test_percent_check=hparams.test_percent_check, - val_check_interval=hparams.val_check_interval, - log_save_interval=hparams.log_save_interval, - 
add_log_row_interval=hparams.add_log_row_interval, - lr_scheduler_milestones=hparams.lr_scheduler_milestones - ) - - # train model - trainer.fit(model) - - -def get_default_parser(strategy, root_dir): - - possible_model_names = list(AVAILABLE_MODELS.keys()) - parser = HyperOptArgumentParser(strategy=strategy, add_help=False) - add_default_args(parser, root_dir, possible_model_names, SEED) - return parser - - -def get_model_name(args): - for i, arg in enumerate(args): - if 'model_name' in arg: - return args[i+1] - - -def optimize_on_cluster(hyperparams): - # enable cluster training - cluster = SlurmCluster( - hyperparam_optimizer=hyperparams, - log_path=hyperparams.tt_save_path, - test_tube_exp_name=hyperparams.tt_name - ) - - # email for cluster coms - cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True) - - # configure cluster - cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus - cluster.job_time = '48:00:00' - cluster.gpu_type = '1080ti' - cluster.memory_mb_per_node = 48000 - - # any modules for code to run in env - cluster.add_command('source activate pytorch_lightning') - - # name of exp - job_display_name = hyperparams.tt_name.split('_')[0] - job_display_name = job_display_name[0:3] - - # run hopt - print('submitting jobs...') - cluster.optimize_parallel_cluster_gpu( - main, - nb_trials=hyperparams.nb_hopt_trials, - job_name=job_display_name - ) - - -if __name__ == '__main__': - - model_name = get_model_name(sys.argv) - - # use default args - root_dir = os.path.split(os.path.dirname(sys.modules['__main__'].__file__))[0] - parent_parser = get_default_parser(strategy='random_search', root_dir=root_dir) - - # allow model to overwrite or extend args - TRAINING_MODEL = AVAILABLE_MODELS[model_name] - parser = TRAINING_MODEL.add_model_specific_args(parent_parser) - parser.json_config('-c', '--config', default=root_dir + '/run_configs/local.json') - hyperparams = parser.parse_args() - - # format GPU layout - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - gpu_ids = hyperparams.gpus.split(';') - - # RUN TRAINING - if hyperparams.on_cluster: - print('RUNNING ON SLURM CLUSTER') - os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu_ids) - optimize_on_cluster(hyperparams) - - elif hyperparams.single_run_gpu: - print(f'RUNNING 1 TRIAL ON GPU. gpu: {gpu_ids[0]}') - os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids[0] - main(hyperparams, None, None) - - elif hyperparams.local or hyperparams.single_run: - os.environ["CUDA_VISIBLE_DEVICES"] = '0' - print('RUNNING LOCALLY') - main(hyperparams, None, None) - - else: - print(f'RUNNING MULTI GPU. GPU ids: {gpu_ids}') - hyperparams.optimize_parallel_gpu( - main_local, - gpu_ids=gpu_ids, - nb_trials=hyperparams.nb_hopt_trials, - nb_workers=len(gpu_ids) - ) From 62201de70d75cacef59555fd2f2c81466719896d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 07:50:18 -0400 Subject: [PATCH 13/48] added hooks docs --- docs/Trainer/hooks.md | 87 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index e69de29bb2..061ffa1ad0 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -0,0 +1,87 @@ +# Hooks +[[Github Code](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/root_module/hooks.py)] + +There are cases when you might want to do something different at different parts of the training/validation loop. 
+To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time. + +**Contributing** If there's a hook you'd like to add, simply: +1. Fork PytorchLightning. +2. Add the hook [here](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/root_module/hooks.py). +3. Add the correct place in the [Trainer](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/models/trainer.py) +where it should be called. + +--- +#### on_epoch_start +Called in the training loop at the very beginning of the epoch. +```python +def on_epoch_start(self): + # do something when the epoch starts +``` + +--- +#### on_batch_end +Called in the training loop at the very end of the epoch. +```python +def on_epoch_end(self): + # do something when the epoch ends +``` + +--- +#### on_batch_start +Called in the training loop before anything happens for that batch. +```python +def on_batch_start(self): + # do something when the batch starts +``` + +--- +#### on_pre_performance_check +Called at the very beginning of the validation loop. +```python +def on_pre_performance_check(self): + # do something before validation starts +``` + +--- +#### on_post_performance_check +Called at the very end of the validation loop. +```python +def on_post_performance_check(self): + # do something before validation end +``` + +--- +#### on_tng_metrics +Called in the training loop, right before metrics are logged. +Although you can log at any time by using self.experiment, you can use +this callback to modify what will be logged. +```python +def on_tng_metrics(self, metrics): + # do something before validation end +``` + +--- +#### on_before_zero_grad +Called in the training loop after taking an optimizer step and before zeroing grads. +Good place to inspect weight information with weights updated. + +Called once per optimizer +```python +def on_before_zero_grad(self, optimizer): + # do something with the optimizer or inspect it. 
+``` + +--- +#### on_after_backward +Called in the training loop after model.backward() +This is the ideal place to inspect or log gradient information +```python +def on_after_backward(self): + # example to inspect gradient information in tensorboard + if self.trainer.global_step % 25 == 0: # don't make the tf file huge + params = self.state_dict() + for k, v in params.items(): + grads = v + name = k + self.experiment.add_histogram(tag=name, values=grads, global_step=self.trainer.global_step) +``` From 42888bceb7578b1ca9e6bd123c89906e42cebad3 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 07:51:53 -0400 Subject: [PATCH 14/48] added hooks docs --- docs/Trainer/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 1b30da1966..a16f9bd070 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -62,12 +62,14 @@ But of course the fun is in all the advanced things it can do: - [Anneal Learning rate](Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](Training%20Loop/#force-disable-early-stop) +- [Hooks](hooks/) - [Use multiple optimizers (like GANs)](../Pytorch-lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](Training%20Loop/#set-how-much-of-the-training-set-to-check) **Validation loop** - [Check validation every n epochs](Validation%20Loop/#check-validation-every-n-epochs) +- [Hooks](hooks/) - [Set how much of the validation set to check](Validation%20Loop/#set-how-much-of-the-validation-set-to-check) - [Set how much of the test set to check](Validation%20Loop/#set-how-much-of-the-test-set-to-check) - [Set validation check frequency within 1 training epoch](Validation%20Loop/#set-validation-check-frequency-within-1-training-epoch) From 4f0f1a9b0bd06d4b8aab013a00ff6ede87fc6548 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 07:55:26 -0400 Subject: [PATCH 15/48] added hooks docs --- docs/Trainer/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index a16f9bd070..a9be9a5ebf 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -49,9 +49,9 @@ But of course the fun is in all the advanced things it can do: **Experiment Logging** - [Display metrics in progress bar](Logging/#display-metrics-in-progress-bar) -- Log arbitrary metrics - [Log metric row every k batches](Logging/#log-metric-row-every-k-batches) - [Process position](Logging/#process-position) +- [Tensorboard support](Logging/#tensorboard-support) - [Save a snapshot of all hyperparameters](Logging/#save-a-snapshot-of-all-hyperparameters) - [Snapshot code for a training run](Logging/#snapshot-code-for-a-training-run) - [Write logs file to csv every k batches](Logging/#write-logs-file-to-csv-every-k-batches) @@ -62,14 +62,14 @@ But of course the fun is in all the advanced things it can do: - [Anneal Learning rate](Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](Training%20Loop/#force-disable-early-stop) -- [Hooks](hooks/) +- [Hooks](hooks) - [Use multiple optimizers (like GANs)](../Pytorch-lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](Training%20Loop/#set-how-much-of-the-training-set-to-check) **Validation loop** 
- [Check validation every n epochs](Validation%20Loop/#check-validation-every-n-epochs) -- [Hooks](hooks/) +- [Hooks](hooks) - [Set how much of the validation set to check](Validation%20Loop/#set-how-much-of-the-validation-set-to-check) - [Set how much of the test set to check](Validation%20Loop/#set-how-much-of-the-test-set-to-check) - [Set validation check frequency within 1 training epoch](Validation%20Loop/#set-validation-check-frequency-within-1-training-epoch) From 5a6ee935f0d574c3f7337e38099dfe6e4424c73b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 07:59:16 -0400 Subject: [PATCH 16/48] updated doc indexes --- README.md | 8 ++++++-- docs/Trainer/index.md | 15 ++++++++------- docs/index.md | 4 +++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index fadb2ec1ba..8ecdce4634 100644 --- a/README.md +++ b/README.md @@ -233,6 +233,7 @@ tensorboard --logdir /some/path ## Lightning automates all of the following ([each is also configurable](https://williamfalcon.github.io/pytorch-lightning/Trainer/)): + ###### Checkpointing - [Model saving](https://williamfalcon.github.io/pytorch-lightning/Trainer/Checkpointing/#model-saving) @@ -265,9 +266,9 @@ tensorboard --logdir /some/path ###### Experiment Logging - [Display metrics in progress bar](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#display-metrics-in-progress-bar) -- Log arbitrary metrics - [Log metric row every k batches](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#log-metric-row-every-k-batches) - [Process position](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#process-position) +- [Tensorboard support](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#tensorboard-support) - [Save a snapshot of all hyperparameters](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#save-a-snapshot-of-all-hyperparameters) - [Snapshot code for a training run](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#snapshot-code-for-a-training-run) - [Write logs file to csv every k batches](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#write-logs-file-to-csv-every-k-batches) @@ -279,18 +280,21 @@ tensorboard --logdir /some/path - [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) +- [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) -###### Validation loop +######Validation loop - [Check validation every n epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#check-validation-every-n-epochs) +- [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Set how much of the validation set to check](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-how-much-of-the-validation-set-to-check) - [Set how much of the test 
set to check](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-how-much-of-the-test-set-to-check) - [Set validation check frequency within 1 training epoch](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-validation-check-frequency-within-1-training-epoch) - [Set the number of validation sanity steps](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-the-number-of-validation-sanity-steps) + ## Demo ```bash # install lightning diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index a9be9a5ebf..48b8f6d260 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -51,20 +51,21 @@ But of course the fun is in all the advanced things it can do: - [Display metrics in progress bar](Logging/#display-metrics-in-progress-bar) - [Log metric row every k batches](Logging/#log-metric-row-every-k-batches) - [Process position](Logging/#process-position) -- [Tensorboard support](Logging/#tensorboard-support) +- [Tensorboard support](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#tensorboard-support) - [Save a snapshot of all hyperparameters](Logging/#save-a-snapshot-of-all-hyperparameters) - [Snapshot code for a training run](Logging/#snapshot-code-for-a-training-run) - [Write logs file to csv every k batches](Logging/#write-logs-file-to-csv-every-k-batches) **Training loop** -- [Accumulate gradients](Training%20Loop/#accumulated-gradients) -- [Anneal Learning rate](Training%20Loop/#anneal-learning-rate) -- [Force training for min or max epochs](Training%20Loop/#force-training-for-min-or-max-epochs) -- [Force disable early stop](Training%20Loop/#force-disable-early-stop) +- [Accumulate gradients](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#accumulated-gradients) +- [Anneal Learning rate](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#anneal-learning-rate) +- [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) +- [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) +- [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](hooks) -- [Use multiple optimizers (like GANs)](../Pytorch-lightning/LightningModule/#configure_optimizers) -- [Set how much of the training set to check (1-100%)](Training%20Loop/#set-how-much-of-the-training-set-to-check) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) **Validation loop** diff --git a/docs/index.md b/docs/index.md index 0e25fa79d5..795a2ba9b0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -50,9 +50,9 @@ To start a new project define these two files. 
###### Experiment Logging - [Display metrics in progress bar](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#display-metrics-in-progress-bar) -- Log arbitrary metrics - [Log metric row every k batches](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#log-metric-row-every-k-batches) - [Process position](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#process-position) +- [Tensorboard support](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#tensorboard-support) - [Save a snapshot of all hyperparameters](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#save-a-snapshot-of-all-hyperparameters) - [Snapshot code for a training run](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#snapshot-code-for-a-training-run) - [Write logs file to csv every k batches](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#write-logs-file-to-csv-every-k-batches) @@ -64,12 +64,14 @@ To start a new project define these two files. - [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) +- [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ######Validation loop - [Check validation every n epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#check-validation-every-n-epochs) +- [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Set how much of the validation set to check](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-how-much-of-the-validation-set-to-check) - [Set how much of the test set to check](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-how-much-of-the-test-set-to-check) - [Set validation check frequency within 1 training epoch](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#set-validation-check-frequency-within-1-training-epoch) From 579f111637498056154747a0e9b131ab1b69cde9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:02:29 -0400 Subject: [PATCH 17/48] Update hooks.md --- docs/Trainer/hooks.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index 061ffa1ad0..c7a4bfbea0 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -4,11 +4,10 @@ There are cases when you might want to do something different at different parts of the training/validation loop. To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time. -**Contributing** If there's a hook you'd like to add, simply: -1. Fork PytorchLightning. -2. Add the hook [here](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/root_module/hooks.py). -3. 
Add the correct place in the [Trainer](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/models/trainer.py) -where it should be called. +**Contributing** If there's a hook you'd like to add, simply: +1. Fork PytorchLightning. +2. Add the hook [here](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/root_module/hooks.py). +3. Add the correct place in the [Trainer](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/models/trainer.py) where it should be called. --- #### on_epoch_start From 3f8c2191311a75dc4c900a4a4eb5382853589c81 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:04:28 -0400 Subject: [PATCH 18/48] updated doc indexes --- README.md | 2 +- docs/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ecdce4634..afbefb16e7 100644 --- a/README.md +++ b/README.md @@ -284,7 +284,7 @@ tensorboard --logdir /some/path - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) -######Validation loop +###### Validation loop - [Check validation every n epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#check-validation-every-n-epochs) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) diff --git a/docs/index.md b/docs/index.md index 795a2ba9b0..a0dc44bead 100644 --- a/docs/index.md +++ b/docs/index.md @@ -68,7 +68,7 @@ To start a new project define these two files. - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) -######Validation loop +###### Validation loop - [Check validation every n epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Validation%20loop/#check-validation-every-n-epochs) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) From cdb4de36060a4bf9bbf7e87704317536bd3cdd15 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:13:40 -0400 Subject: [PATCH 19/48] updated doc indexes --- docs/Trainer/Training Loop.md | 2 +- docs/Trainer/Validation loop.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Trainer/Training Loop.md b/docs/Trainer/Training Loop.md index 2be8da9edd..e1ff90a484 100644 --- a/docs/Trainer/Training Loop.md +++ b/docs/Trainer/Training Loop.md @@ -1,4 +1,4 @@ -The lightning training loop handles everything except the actual computations of your model. To decide what will happen in your training loop, define the [training_step function](../../Pytorch-lightning/LightningModule/#training_step). +The lightning training loop handles everything except the actual computations of your model. To decide what will happen in your training loop, define the [training_step function](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#training_step). Below are all the things lightning automates for you in the training loop. 
diff --git a/docs/Trainer/Validation loop.md b/docs/Trainer/Validation loop.md index 693df88904..8d6c5bac46 100644 --- a/docs/Trainer/Validation loop.md +++ b/docs/Trainer/Validation loop.md @@ -1,4 +1,4 @@ -The lightning validation loop handles everything except the actual computations of your model. To decide what will happen in your validation loop, define the [validation_step function](../../Pytorch-lightning/LightningModule/#validation_step). +The lightning validation loop handles everything except the actual computations of your model. To decide what will happen in your validation loop, define the [validation_step function](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#validation_step). Below are all the things lightning automates for you in the validation loop. **Note** From e89975d19eba4d418260205dede7a352176c18dc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:14:50 -0400 Subject: [PATCH 20/48] updated doc indexes --- .../RequiredTrainerInterface.md | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index b9bb1d36fc..ff4adf57a6 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -29,21 +29,24 @@ Otherwise, to Define a Lightning Module, implement the following methods: --- **Minimal example** ```python -import pytorch_lightning as ptl +import os import torch from torch.nn import functional as F from torch.utils.data import DataLoader from torchvision.datasets import MNIST +import torchvision.transforms as transforms + +import pytorch_lightning as ptl class CoolModel(ptl.LightningModule): - def __init(self): + def __init__(self): super(CoolModel, self).__init__() # not the best model... 
self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): - return torch.relu(self.l1(x)) + return torch.relu(self.l1(x.view(x.size(0), -1))) def my_loss(self, y_hat, y): return F.cross_entropy(y_hat, y) @@ -51,7 +54,7 @@ class CoolModel(ptl.LightningModule): def training_step(self, batch, batch_nb): x, y = batch y_hat = self.forward(x) - return {'tng_loss': self.my_loss(y_hat, y)} + return {'loss': self.my_loss(y_hat, y)} def validation_step(self, batch, batch_nb): x, y = batch @@ -59,23 +62,23 @@ class CoolModel(ptl.LightningModule): return {'val_loss': self.my_loss(y_hat, y)} def validation_end(self, outputs): - avg_loss = torch.stack([x for x in outputs['val_loss']]).mean() - return avg_loss + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + return {'avg_val_loss': avg_loss} def configure_optimizers(self): return [torch.optim.Adam(self.parameters(), lr=0.02)] @ptl.data_loader def tng_dataloader(self): - return DataLoader(MNIST('path/to/save', train=True), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) @ptl.data_loader def val_dataloader(self): - return DataLoader(MNIST('path/to/save', train=False), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) @ptl.data_loader def test_dataloader(self): - return DataLoader(MNIST('path/to/save', train=False), batch_size=32) + return DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()), batch_size=32) ``` --- From a6ddf8a671522a1d341ad01aebed670abeb6de0a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:16:55 -0400 Subject: [PATCH 21/48] updated doc indexes --- docs/LightningModule/RequiredTrainerInterface.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index ff4adf57a6..5b0c42b6df 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -3,7 +3,7 @@ A lightning module is a strict superclass of nn.Module, it provides a standard interface for the trainer to interact with the model. -The easiest thing to do is copy [this template](../../pytorch_lightning/examples/new_project_templates/lightning_module_template.py) and modify accordingly. +The easiest thing to do is copy [this minimal example](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) and modify accordingly. Otherwise, to Define a Lightning Module, implement the following methods: From 1205dc8a201af0fa9efb7ba563285e1e8cbb376e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:19:51 -0400 Subject: [PATCH 22/48] updated doc indexes --- docs/LightningModule/RequiredTrainerInterface.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 5b0c42b6df..059c54260f 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -3,7 +3,7 @@ A lightning module is a strict superclass of nn.Module, it provides a standard interface for the trainer to interact with the model. 
-The easiest thing to do is copy [this minimal example](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) and modify accordingly. +The easiest thing to do is copy the [minimal example](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) below and modify accordingly. Otherwise, to Define a Lightning Module, implement the following methods: @@ -27,7 +27,7 @@ Otherwise, to Define a Lightning Module, implement the following methods: - [add_model_specific_args](RequiredTrainerInterface.md#add_model_specific_args) --- -**Minimal example** +### Minimal example ```python import os import torch From 14dff830a1025fcecce1aeaf3bc190bdf351d5ce Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:20:26 -0400 Subject: [PATCH 23/48] updated doc indexes --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index a0dc44bead..ba5ec0e71b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ ###### New project Quick Start To start a new project define these two files. -1. [Define a LightningModule](/pytorch-lightning/LightningModule/RequiredTrainerInterface/) +1. [Define a LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) 2. [Define a trainer](https://williamfalcon.github.io/pytorch-lightning/Trainer/) - [Basic CPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_cpu_template.py) - [Multi-GPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_gpu_node_template.py) From d95f1a2a655363ae167ac4027e7a93521e3b4818 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:26:58 -0400 Subject: [PATCH 24/48] updated doc indexes --- docs/index.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/index.md b/docs/index.md index ba5ec0e71b..2a9c156f2b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,8 +1,11 @@ ###### New project Quick Start -To start a new project define these two files. +To start a new project you define two files, a LightningModule and a Trainer file. +A separate trainer file allows to run many LightningModules. Each LightningModule has the core +logic to a particular research project. For example, one lightningModule could be an image classifier, the other +one could be a seq-2-seq model, both (optionally) ran by the same trainer file. -1. [Define a LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) -2. [Define a trainer](https://williamfalcon.github.io/pytorch-lightning/Trainer/) +1. [LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) +2. 
[Trainer](https://williamfalcon.github.io/pytorch-lightning/Trainer/) - [Basic CPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_cpu_template.py) - [Multi-GPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_gpu_node_template.py) - [GPU cluster Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/multi_node_cluster_template.py) From aba8405d1a6d6b82b41ef8eecbd1e829ae517843 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:27:09 -0400 Subject: [PATCH 25/48] updated doc indexes --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 2a9c156f2b..4526863139 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,7 @@ A separate trainer file allows to run many LightningModules. Each LightningModul logic to a particular research project. For example, one lightningModule could be an image classifier, the other one could be a seq-2-seq model, both (optionally) ran by the same trainer file. -1. [LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) +1. [MNIST LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) 2. [Trainer](https://williamfalcon.github.io/pytorch-lightning/Trainer/) - [Basic CPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_cpu_template.py) - [Multi-GPU Trainer Template](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/examples/new_project_templates/single_gpu_node_template.py) From 88b383115c93d8492b36f8ba318090c6de1d9259 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 08:28:15 -0400 Subject: [PATCH 26/48] updated doc indexes --- docs/index.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 4526863139..bcd7ca2159 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,10 @@ ###### New project Quick Start -To start a new project you define two files, a LightningModule and a Trainer file. +To start a new project you define two files, a LightningModule and a Trainer file. + A separate trainer file allows to run many LightningModules. Each LightningModule has the core -logic to a particular research project. For example, one lightningModule could be an image classifier, the other +logic to a particular research project. + +For example, one lightningModule could be an image classifier, the other one could be a seq-2-seq model, both (optionally) ran by the same trainer file. 1. 
[MNIST LightningModule](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#minimal-example) From e42046446dc01ea1b9866271925c3cacf1fef041 Mon Sep 17 00:00:00 2001 From: williamFalcon Date: Sun, 28 Jul 2019 05:42:43 -0700 Subject: [PATCH 27/48] removed file --- .../sample_model_template/model_template.py | 204 ------------------ 1 file changed, 204 deletions(-) delete mode 100644 pytorch_lightning/models/sample_model_template/model_template.py diff --git a/pytorch_lightning/models/sample_model_template/model_template.py b/pytorch_lightning/models/sample_model_template/model_template.py deleted file mode 100644 index 0c446a11ca..0000000000 --- a/pytorch_lightning/models/sample_model_template/model_template.py +++ /dev/null @@ -1,204 +0,0 @@ -import torch.nn as nn -import numpy as np -from pytorch_lightning import LightningModule -from test_tube import HyperOptArgumentParser -from torchvision.datasets import MNIST -import torchvision.transforms as transforms -import torch -import torch.nn.functional as F - - -class ExampleModel1(LightningModule): - """ - Sample model to show how to define a template - """ - - def __init__(self, hparams): - # init superclass - super(ExampleModel1, self).__init__(hparams) - - self.batch_size = hparams.batch_size - - # build model - self.__build_model() - - # --------------------- - # MODEL SETUP - # --------------------- - def __build_model(self): - """ - Layout model - :return: - """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) - self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) - self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) - - # --------------------- - # TRAINING - # --------------------- - def forward(self, x): - x = self.c_d1(x) - x = F.tanh(x) - x = self.c_d1_bn(x) - x = self.c_d1_drop(x) - - x = self.c_d2(x) - logits = F.log_softmax(x, dim=1) - - return logits - - def loss(self, labels, logits): - nll = F.nll_loss(logits, labels) - return nll - - def training_step(self, data_batch): - """ - Called inside the training loop - :param data_batch: - :return: - """ - # forward pass - x, y = data_batch - x = x.view(x.size(0), -1) - y_hat = self.forward(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - - tqdm_dic = {'jefe': 1} - return loss_val, tqdm_dic - - def validation_step(self, data_batch): - """ - Called inside the validation loop - :param data_batch: - :return: - """ - x, y = data_batch - x = x.view(x.size(0), -1) - y_hat = self.forward(x) - - loss_val = self.loss(y, y_hat) - - # acc - labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - - output = {'y_hat': y_hat, 'val_loss': loss_val.item(), 'val_acc': val_acc} - return output - - def validation_end(self, outputs): - """ - Called at the end of validation to aggregate outputs - :param outputs: list of individual outputs of each validation step - :return: - """ - val_loss_mean = 0 - accs = [] - for output in outputs: - val_loss_mean += output['val_loss'] - accs.append(output['val_acc']) - - val_loss_mean /= len(outputs) - tqdm_dic = {'val_loss': val_loss_mean, 'val_acc': np.mean(accs)} - return tqdm_dic - - def update_tng_log_metrics(self, logs): - return logs - - # --------------------- - # MODEL SAVING - # --------------------- - def get_save_dict(self): - checkpoint = { - 'state_dict': self.state_dict(), - } - - return checkpoint - - 
def load_model_specific(self, checkpoint): - self.load_state_dict(checkpoint['state_dict']) - pass - - # --------------------- - # TRAINING SETUP - # --------------------- - def configure_optimizers(self): - """ - return whatever optimizers and (optionally) schedulers we want here - :return: list of optimizers - """ - optimizer = self.choose_optimizer(self.hparams.optimizer_name, self.parameters(), {'lr': self.hparams.learning_rate}, 'optimizer') - self.optimizers = [optimizer] - self.schedulers = [] - return self.optimizers, self.schedulers - - def __dataloader(self, train): - # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) - - loader = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=self.hparams.batch_size, - shuffle=True - ) - - return loader - - @data_loader - def tng_dataloader(self): - if self._tng_dataloader is None: - try: - self._tng_dataloader = self.__dataloader(train=True) - except Exception as e: - print(e) - raise e - return self._tng_dataloader - - @property - def val_dataloader(self): - if self._val_dataloader is None: - try: - self._val_dataloader = self.__dataloader(train=False) - except Exception as e: - print(e) - raise e - return self._val_dataloader - - @property - def test_dataloader(self): - if self._test_dataloader is None: - try: - self._test_dataloader = self.__dataloader(train=False) - except Exception as e: - print(e) - raise e - return self._test_dataloader - - @staticmethod - def add_model_specific_args(parent_parser): - parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser]) - - # param overwrites - # parser.set_defaults(gradient_clip=5.0) - - # network params - parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) - parser.add_argument('--in_features', default=28*28) - parser.add_argument('--hidden_dim', default=500) - parser.add_argument('--out_features', default=10) - - # data - parser.add_argument('--data_root', default='/Users/williamfalcon/Developer/personal/research_lib/research_proj/datasets/mnist', type=str) - - # training params (opt) - parser.opt_list('--learning_rate', default=0.001, type=float, options=[0.0001, 0.0005, 0.001, 0.005], - tunable=False) - parser.opt_list('--batch_size', default=256, type=int, options=[32, 64, 128, 256], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - return parser From 27660b8a96b6fe784305f70429973639a2ed17fb Mon Sep 17 00:00:00 2001 From: williamFalcon Date: Sun, 28 Jul 2019 05:57:37 -0700 Subject: [PATCH 28/48] running tests --- pytorch_lightning/root_module/model_saving.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index c5831317e5..83fc1aa3af 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -71,11 +71,19 @@ class TrainerIO(object): checkpoint['early_stop_callback_wait'] = self.early_stop_callback.wait checkpoint['early_stop_callback_patience'] = self.early_stop_callback.patience + # save optimizers optimizer_states = [] for i, optimizer in enumerate(self.optimizers): optimizer_states.append(optimizer.state_dict()) checkpoint['optimizer_states'] = optimizer_states + + # save lr schedulers + lr_schedulers = [] + for i, 
scheduler in enumerate(self.lr_schedulers): + lr_schedulers.append(scheduler.state_dict()) + + checkpoint['lr_schedulers'] = lr_schedulers # add the state_dict from the model model = self.__get_model() @@ -130,6 +138,11 @@ class TrainerIO(object): optimizer_states = checkpoint['optimizer_states'] for optimizer, opt_state in zip(self.optimizers, optimizer_states): optimizer.load_state_dict(opt_state) + + # restore the lr schedulers + lr_schedulers = checkpoint['lr_schedulers'] + for scheduler, lrs_state in zip(self.lr_schedulers, lr_schedulers): + scheduler.load_state_dict(lrs_state) # ---------------------------------- # PRIVATE OPS From b9e0d841dcf4fd5c01f205758b894f4f9c8f77f1 Mon Sep 17 00:00:00 2001 From: williamFalcon Date: Sun, 28 Jul 2019 06:21:41 -0700 Subject: [PATCH 29/48] fixed lr scheduler tests --- tests/test_models.py | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index e8fa339b54..9d40c0da93 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -24,6 +24,33 @@ np.random.seed(SEED) # ------------------------------------------------------------------------ # TESTS # ------------------------------------------------------------------------ +def test_amp_gpu_ddp(): + """ + Make sure DDP + AMP work + :return: + """ + if not torch.cuda.is_available(): + warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + return + if not torch.cuda.device_count() > 1: + warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + return + + os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) + + hparams = get_hparams() + model = LightningTestModel(hparams) + + trainer_options = dict( + progress_bar=True, + max_nb_epochs=1, + gpus=[0, 1], + distributed_backend='ddp', + use_amp=True + ) + + run_gpu_model_test(trainer_options, model, hparams) + def test_cpu_slurm_save_load(): """ @@ -280,7 +307,7 @@ def test_amp_gpu_ddp_slurm_managed(): if trainer.use_ddp: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model - trainer.optimizers = pretrained_model.configure_optimizers() + trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() # test HPC loading / saving trainer.hpc_save(save_dir, exp) @@ -477,33 +504,6 @@ def test_multi_gpu_model_ddp(): run_gpu_model_test(trainer_options, model, hparams) -def test_amp_gpu_ddp(): - """ - Make sure DDP + AMP work - :return: - """ - if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') - return - if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') - return - - os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) - - hparams = get_hparams() - model = LightningTestModel(hparams) - - trainer_options = dict( - progress_bar=True, - max_nb_epochs=1, - gpus=[0, 1], - distributed_backend='ddp', - use_amp=True - ) - - run_gpu_model_test(trainer_options, model, hparams) - def test_ddp_sampler_error(): """ @@ -574,7 +574,7 @@ def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True): if trainer.use_ddp: # on hpc this would work fine... 
but need to hack it for the purpose of the test trainer.model = pretrained_model - trainer.optimizers = pretrained_model.configure_optimizers() + trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() # test HPC loading / saving trainer.hpc_save(save_dir, exp) From 638d79a5a63a1ef5bf66294a0d02e340836b3555 Mon Sep 17 00:00:00 2001 From: williamFalcon Date: Sun, 28 Jul 2019 06:33:58 -0700 Subject: [PATCH 30/48] allow optimizer fx to return 1 or 2 lists --- pytorch_lightning/models/trainer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index ac67d83dad..5ce253c29c 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -438,8 +438,10 @@ class Trainer(TrainerIO): raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option') # CHOOSE OPTIMIZER - # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers, self.lr_schedulers = model.configure_optimizers() + # allow for lr schedulers as well + self.optimizers = model.configure_optimizers() + if len(self.optimizers) == 2: + self.optimizers, self.lr_schedulers = self.optimizers self.__run_pretrain_routine(model) @@ -450,8 +452,10 @@ class Trainer(TrainerIO): def __dp_train(self, model): # CHOOSE OPTIMIZER - # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers, self.lr_schedulers = model.configure_optimizers() + # allow for lr schedulers as well + self.optimizers = model.configure_optimizers() + if len(self.optimizers) == 2: + self.optimizers, self.lr_schedulers = self.optimizers model.cuda(self.data_parallel_device_ids[0]) @@ -504,8 +508,10 @@ class Trainer(TrainerIO): self.__init_tcp_connection() # CHOOSE OPTIMIZER - # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers, self.lr_schedulers = model.configure_optimizers() + # allow for lr schedulers as well + self.optimizers = model.configure_optimizers() + if len(self.optimizers) == 2: + self.optimizers, self.lr_schedulers = self.optimizers # MODEL # copy model to each gpu From a3df994d5f57bc21e0335fcb66955ff66ac5b5ba Mon Sep 17 00:00:00 2001 From: williamFalcon Date: Sun, 28 Jul 2019 06:34:55 -0700 Subject: [PATCH 31/48] allow optimizer fx to return 1 or 2 lists --- pytorch_lightning/testing_models/lm_test_module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/testing_models/lm_test_module.py b/pytorch_lightning/testing_models/lm_test_module.py index a143e51e7e..87de2c7777 100644 --- a/pytorch_lightning/testing_models/lm_test_module.py +++ b/pytorch_lightning/testing_models/lm_test_module.py @@ -181,7 +181,9 @@ class LightningTestModel(LightningModule): """ # try no scheduler for this model (testing purposes) optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - return [optimizer], [] + + # test returning only 1 list instead of 2 + return [optimizer] def __dataloader(self, train): # init data generators From 36c0fae7da960c4516a4ff48bc66bac2de422d7b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 09:51:20 -0400 Subject: [PATCH 32/48] updated doc indexes --- docs/LightningModule/RequiredTrainerInterface.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index 
ef7c2e3008..da230ea611 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -230,7 +230,7 @@ Lightning will call .backward() and .step() on each one in every epoch. If you ##### Return -Tuple - List of optimizers and list of schedulers +List or Tuple - List of optimizers with an optional second list of learning-rate schedulers **Example** @@ -238,7 +238,7 @@ Tuple - List of optimizers and list of schedulers # most cases def configure_optimizers(self): opt = Adam(self.parameters(), lr=0.01) - return [opt], [] + return [opt] # gan example, with scheduler for discriminator def configure_optimizers(self): From db0d347941e02852ace36653ebe3db118170c0ef Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 09:54:33 -0400 Subject: [PATCH 33/48] updated doc indexes --- README.md | 1 + docs/Trainer/index.md | 2 +- docs/index.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 549a48504d..2beb05a7cb 100644 --- a/README.md +++ b/README.md @@ -281,6 +281,7 @@ tensorboard --logdir /some/path - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ###### Validation loop diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 48b8f6d260..4fc2040a44 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -59,11 +59,11 @@ But of course the fun is in all the advanced things it can do: **Training loop** - [Accumulate gradients](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#accumulated-gradients) -- [Anneal Learning rate](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#anneal-learning-rate) - [Force training for min or max epochs](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-training-for-min-or-max-epochs) - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](hooks) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) diff --git a/docs/index.md b/docs/index.md index 631d940d59..047c220240 100644 --- a/docs/index.md +++ b/docs/index.md @@ -70,6 +70,7 @@ one could be a seq-2-seq model, both (optionally) ran by the same trainer file. 
- [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) From de93470c2e04d5cabade4fd2bb9a5542b98214c5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 09:56:02 -0400 Subject: [PATCH 34/48] updated doc indexes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2beb05a7cb..327b781872 100644 --- a/README.md +++ b/README.md @@ -280,8 +280,8 @@ tensorboard --logdir /some/path - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ###### Validation loop From 29cf7a239a1dc5869498dd6a0d345965d5478ee0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 09:57:09 -0400 Subject: [PATCH 35/48] release v0.3.6.5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 50a050570a..7156c586d1 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import setup, find_packages # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.4', + version='0.3.6.5', description="The Keras for ML researchers using PyTorch", author="William Falcon", author_email="waf2107@columbia.edu", From f1f7698ce13347be350ae7f2d1a2b92d6830c44e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 10:00:53 -0400 Subject: [PATCH 36/48] updated doc indexes --- README.md | 4 ++-- docs/Trainer/index.md | 2 +- docs/index.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 327b781872..0196245aa2 100644 --- a/README.md +++ b/README.md @@ -280,8 +280,8 @@ tensorboard --logdir /some/path - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) -- [Learning 
rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ###### Validation loop diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 4fc2040a44..128f12452d 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -63,7 +63,7 @@ But of course the fun is in all the advanced things it can do: - [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](hooks) -- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) diff --git a/docs/index.md b/docs/index.md index 047c220240..9c62b00354 100644 --- a/docs/index.md +++ b/docs/index.md @@ -70,7 +70,7 @@ one could be a seq-2-seq model, both (optionally) ran by the same trainer file. 
- [Force disable early stop](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#force-disable-early-stop) - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) -- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) - [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) From d372f9a2e207f390d2154a6bd20c84b90e4a4715 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 11:46:26 -0400 Subject: [PATCH 37/48] updated dict keys --- pytorch_lightning/models/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 5ce253c29c..e60a0d95cc 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -772,7 +772,7 @@ class Trainer(TrainerIO): output = self.model.training_step(data_batch, batch_nb) try: - model_specific_tqdm_metrics_dic = output['tqdm_metrics'] + model_specific_tqdm_metrics_dic = output['prog'] except Exception as e: model_specific_tqdm_metrics_dic = {} From 6bb3c0306a1413da18b89707642028efa1966861 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 11:51:32 -0400 Subject: [PATCH 38/48] updated output of test models --- .../testing_models/lm_test_module.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/testing_models/lm_test_module.py b/pytorch_lightning/testing_models/lm_test_module.py index 87de2c7777..8e0718b972 100644 --- a/pytorch_lightning/testing_models/lm_test_module.py +++ b/pytorch_lightning/testing_models/lm_test_module.py @@ -96,12 +96,15 @@ class LightningTestModel(LightningModule): if self.trainer.use_dp: loss_val = loss_val.unsqueeze(0) - output = OrderedDict({ - 'loss': loss_val - }) - - # can also return just a scalar instead of a dict (return loss_val) - return output + # alternate possible outputs to test + if self.trainer.batch_nb % 1 == 0: + output = OrderedDict({ + 'loss': loss_val, + 'prog': {'some_val': loss_val * loss_val} + }) + return output + if self.trainer.batch_nb % 2 == 0: + return loss_val def validation_step(self, data_batch, batch_i): """ From 7b774beb0c14740097a7d298dba18dde12c2c574 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 28 Jul 2019 12:12:45 -0400 Subject: [PATCH 39/48] release v0.3.6.6 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7156c586d1..efc55d2159 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import setup, find_packages # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.5', + version='0.3.6.6', description="The Keras for ML researchers using PyTorch", author="William Falcon", author_email="waf2107@columbia.edu", From a4a8bae35907a9bf3be340c958a49491be47b05f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 10:02:12 -0400 
Subject: [PATCH 40/48] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0196245aa2..fbc271bd20 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Pytorch Lightning

- The Keras for ML researchers using PyTorch. More control. Less boilerplate.
+ The PyTorch Keras for ML researchers. More control. Less boilerplate.
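
A short sketch of the interface the patches above converge on: `configure_optimizers` may return either a single list of optimizers or a tuple of two lists (optimizers, learning-rate schedulers). The trainer unpacks the tuple form (patch 30) and saves and restores scheduler state dicts with every checkpoint (patch 28). This is an illustrative sketch only, not code from the series: the class name is hypothetical, and the other required LightningModule methods are assumed to be defined as in the minimal example earlier in the series.

```python
import torch
import pytorch_lightning as ptl


class SchedulerDemo(ptl.LightningModule):
    # training_step, validation_step, and the dataloaders are omitted here;
    # assume they are defined as in the minimal example earlier in the series.

    def __init__(self):
        super(SchedulerDemo, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.02)

        # Option A: optimizers only. The trainer then leaves its list of
        # lr schedulers empty (see the trainer.py change in patch 30).
        # return [optimizer]

        # Option B: optimizers plus schedulers. The trainer unpacks the two
        # lists, and the scheduler state dicts are written to and restored
        # from checkpoints (see the model_saving.py change in patch 28).
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
        return [optimizer], [scheduler]
```
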

From 598e1accb59714264ec75672a407e59151e62cba Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 10:11:26 -0400 Subject: [PATCH 41/48] updated docs --- README.md | 2 +- docs/LightningModule/RequiredTrainerInterface.md | 6 +++--- docs/LightningModule/methods.md | 2 +- docs/Trainer/Logging.md | 2 +- docs/Trainer/hooks.md | 2 +- docs/Trainer/index.md | 2 +- docs/index.md | 2 +- mkdocs.yml | 4 ++-- tests/README.md | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0196245aa2..b9ecec319b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@

- Pytorch Lightning
+ PyTorch Lightning

The Keras for ML researchers using PyTorch. More control. Less boilerplate. diff --git a/docs/LightningModule/RequiredTrainerInterface.md b/docs/LightningModule/RequiredTrainerInterface.md index da230ea611..abc015caf0 100644 --- a/docs/LightningModule/RequiredTrainerInterface.md +++ b/docs/LightningModule/RequiredTrainerInterface.md @@ -300,7 +300,7 @@ def tng_dataloader(self) Called by lightning during training loop. Make sure to use the @ptl.data_loader decorator, this ensures not calling this function until the data are needed. ##### Return -Pytorch DataLoader +PyTorch DataLoader **Example** @@ -327,7 +327,7 @@ def tng_dataloader(self) Called by lightning during validation loop. Make sure to use the @ptl.data_loader decorator, this ensures not calling this function until the data are needed. ##### Return -Pytorch DataLoader +PyTorch DataLoader **Example** @@ -355,7 +355,7 @@ def test_dataloader(self) Called by lightning during test loop. Make sure to use the @ptl.data_loader decorator, this ensures not calling this function until the data are needed. ##### Return -Pytorch DataLoader +PyTorch DataLoader **Example** diff --git a/docs/LightningModule/methods.md b/docs/LightningModule/methods.md index d57c695034..cb96ea7a1d 100644 --- a/docs/LightningModule/methods.md +++ b/docs/LightningModule/methods.md @@ -31,7 +31,7 @@ y_hat = pretrained_model(x) | Param | description | |---|---| -| weights_path | Path to a pytorch checkpoint | +| weights_path | Path to a PyTorch checkpoint | | tags_csv | Path to meta_tags.csv file generated by the test-tube Experiment | | on_gpu | if True, puts model on GPU. Make sure to use transforms option if model devices have changed | | map_location | A dictionary mapping saved weight GPU devices to new GPU devices | diff --git a/docs/Trainer/Logging.md b/docs/Trainer/Logging.md index d3004cb2e3..2edbab16f1 100644 --- a/docs/Trainer/Logging.md +++ b/docs/Trainer/Logging.md @@ -52,7 +52,7 @@ Trainer(experiment=exp) --- ### Tensorboard support -The experiment object is a strict subclass of Pytorch SummaryWriter. However, this class +The experiment object is a strict subclass of PyTorch SummaryWriter. However, this class also snapshots every detail about the experiment (data folder paths, code, hyperparams), and allows you to visualize it using tensorboard. ``` {.python} diff --git a/docs/Trainer/hooks.md b/docs/Trainer/hooks.md index c7a4bfbea0..dd08b30b45 100644 --- a/docs/Trainer/hooks.md +++ b/docs/Trainer/hooks.md @@ -5,7 +5,7 @@ There are cases when you might want to do something different at different parts To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time. **Contributing** If there's a hook you'd like to add, simply: -1. Fork PytorchLightning. +1. Fork PyTorchLightning. 2. Add the hook [here](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/root_module/hooks.py). 3. Add the correct place in the [Trainer](https://github.com/williamFalcon/pytorch-lightning/blob/master/pytorch_lightning/models/trainer.py) where it should be called. 
diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 128f12452d..19c10d4940 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -64,7 +64,7 @@ But of course the fun is in all the advanced things it can do: - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](hooks) - [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/PyTorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) **Validation loop** diff --git a/docs/index.md b/docs/index.md index 9c62b00354..973e744cc7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,7 +71,7 @@ one could be a seq-2-seq model, both (optionally) ran by the same trainer file. - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/Pytorch-Lightning/LightningModule/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/PyTorch-Lightning/LightningModule/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ###### Validation loop diff --git a/mkdocs.yml b/mkdocs.yml index 5675a64e01..539714d4b5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,10 +1,10 @@ -site_name: Pytorch lightning Documentation +site_name: PyTorch lightning Documentation theme: name: 'material' docs_dir: docs repo_url: https://github.com/williamFalcon/pytorch-lightning site_dir: 'site' -site_description: 'Documentation for Pytorch LightningModule, the researcher version of keras.' +site_description: 'Documentation for PyTorch LightningModule, the researcher version of keras.' dev_addr: '0.0.0.0:8000' #google_analytics: ['UA-aasd', 'sitename'] diff --git a/tests/README.md b/tests/README.md index 20f783bf7e..f7a85b51a0 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,4 +1,4 @@ -# Pytorch-Lightning Tests +# PyTorch-Lightning Tests ## Running tests The automatic travis tests ONLY run CPU-based tests. 
Although these cover most of the use cases, From 5e4728b2a75a59d29fcc624c88708d72d407ad60 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 10:15:28 -0400 Subject: [PATCH 42/48] only proc 0 can submit a continuation slurm job --- pytorch_lightning/models/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index e60a0d95cc..1b9a233184 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -612,7 +612,7 @@ class Trainer(TrainerIO): # enable cluster checkpointing # also restores training state - if self.cluster is not None: # pragma: no cover + if self.cluster is not None and self.proc_rank == 0: # pragma: no cover self.enable_auto_hpc_walltime_manager() # --------------------------- From a0483eec9669a35f8bb021d8c9d4459d0532b339 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 10:21:20 -0400 Subject: [PATCH 43/48] fix broken opt link --- docs/Trainer/index.md | 2 +- docs/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Trainer/index.md b/docs/Trainer/index.md index 19c10d4940..3983daae88 100644 --- a/docs/Trainer/index.md +++ b/docs/Trainer/index.md @@ -64,7 +64,7 @@ But of course the fun is in all the advanced things it can do: - [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](hooks) - [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/PyTorch-Lightning/LightningModule/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) **Validation loop** diff --git a/docs/index.md b/docs/index.md index 973e744cc7..ae08df4bf8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,7 +71,7 @@ one could be a seq-2-seq model, both (optionally) ran by the same trainer file. 
- [Gradient Clipping](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#gradient-clipping) - [Hooks](https://williamfalcon.github.io/pytorch-lightning/Trainer/hooks/) - [Learning rate scheduling](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) -- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/PyTorch-Lightning/LightningModule/#configure_optimizers) +- [Use multiple optimizers (like GANs)](https://williamfalcon.github.io/pytorch-lightning/LightningModule/RequiredTrainerInterface/#configure_optimizers) - [Set how much of the training set to check (1-100%)](https://williamfalcon.github.io/pytorch-lightning/Trainer/Training%20Loop/#set-how-much-of-the-training-set-to-check) ###### Validation loop From 00e851958c9ac0678830bd51fac05799ecc2f8fb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 10:26:53 -0400 Subject: [PATCH 44/48] release v0.3.6.7 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index efc55d2159..725d008c72 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import setup, find_packages # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.6', + version='0.3.6.7', description="The Keras for ML researchers using PyTorch", author="William Falcon", author_email="waf2107@columbia.edu", From ef6d5a412ce5295caa66501032977a31944bbbfd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 16:19:04 -0400 Subject: [PATCH 45/48] proc 0 only for save hpc. all procs for hpc load --- pytorch_lightning/models/trainer.py | 2 +- pytorch_lightning/root_module/model_saving.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 1b9a233184..e60a0d95cc 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -612,7 +612,7 @@ class Trainer(TrainerIO): # enable cluster checkpointing # also restores training state - if self.cluster is not None and self.proc_rank == 0: # pragma: no cover + if self.cluster is not None: # pragma: no cover self.enable_auto_hpc_walltime_manager() # --------------------------- diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index 83fc1aa3af..39c5ae7b70 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -102,13 +102,16 @@ class TrainerIO(object): return # allow test tube to handle model check pointing automatically - self.cluster.set_checkpoint_save_function( - self.hpc_save, - kwargs={ - 'folderpath': self.checkpoint_callback.filepath, - 'experiment': self.experiment - } - ) + # only if proc 0 so we don't trigger world_size resubmits + if self.proc_rank == 0: + self.cluster.set_checkpoint_save_function( + self.hpc_save, + kwargs={ + 'folderpath': self.checkpoint_callback.filepath, + 'experiment': self.experiment + } + ) + self.cluster.set_checkpoint_load_function( self.hpc_load, kwargs={ From d8e801594b808c8e011786c847a2cb0a64027de6 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 1 Aug 2019 16:19:19 -0400 Subject: [PATCH 46/48] release v0.3.6.8 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 725d008c72..769d607a60 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from 
setuptools import setup, find_packages # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.7', + version='0.3.6.8', description="The Keras for ML researchers using PyTorch", author="William Falcon", author_email="waf2107@columbia.edu", From ee1029ed5e909e28c013ba454fbccd50de67328e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 3 Aug 2019 07:16:00 -0500 Subject: [PATCH 47/48] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9163387f17..1a3e287989 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,11 @@ Lightning defers training and validation loop logic to you. It guarantees correc ## Why do I want to use lightning? -When starting a new project the last thing you want to do is recode a training loop, model loading/saving, distributed training, when to validate, etc... You're likely to spend a long time ironing out all the bugs without even getting to the core of your research. +When starting a new project the last thing you want to do is recode a training loop, multi-cluster training, 16-bit precision, early-stopping, model loading/saving, when to validate, etc... You're likely to spend a long time ironing out all the bugs without even getting to the core of your research. -With lightning, you guarantee those parts of your code work so you can focus on what the meat of the research: Data and training, validation loop logic. Don't worry about multiple gpus or speeding up your code, lightning will do that for you! +With lightning, you guarantee those parts of your code work so you can focus on what the meat of the research: The data and the training/validation loop logic. + +Don't worry about training on multiple gpus or speeding up your code, lightning will do that for you! ## How do I do use it? From c8eb06da2b8d2f1b75b6602038bc774c5d2e9fd0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 3 Aug 2019 07:21:25 -0500 Subject: [PATCH 48/48] Update README.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 1a3e287989..864ed27c43 100644 --- a/README.md +++ b/README.md @@ -318,6 +318,19 @@ python single_gpu_node_template.py --gpus "0,1" python multi_node_cluster_template.py --nb_gpu_nodes 4 --gpus '0,1,2,3,4,5,6,7' ``` +## Contributing +Welcome to the PTL community! We're building the most advanced research platform on the planet to implement the latest, best practices that the amazing PyTorch team rolls out! + +#### Bug fixes: +1. Submit a github issue. +2. Fix it. +3. Submit a PR! + +#### New Features: +1. Submit a github issue. +2. We'll agree on the feature scope. +3. Submit a PR! (with updated docs and tests 🙃). + ## Bleeding edge If you can't wait for the next release, install the most up to date code with: ```bash