From d9bfe964f9ab10e6f4cf94c918426ee79cbfe2a9 Mon Sep 17 00:00:00 2001 From: Jiri BOROVEC Date: Tue, 6 Aug 2019 12:08:31 +0200 Subject: [PATCH] update by flake8 --- .circleci/config.yml | 2 +- README.md | 1 + .../lightning_module_template.py | 26 +++-- .../multi_node_cluster_template.py | 34 ++++--- .../single_cpu_template.py | 13 ++- .../single_gpu_node_16bit_template.py | 17 ++-- .../single_gpu_node_ddp_template.py | 17 ++-- .../single_gpu_node_dp_template.py | 17 ++-- pytorch_lightning/models/trainer.py | 95 +++++++++++-------- pytorch_lightning/root_module/grads.py | 6 +- pytorch_lightning/root_module/model_saving.py | 6 +- pytorch_lightning/testing/lm_test_module.py | 26 +++-- pytorch_lightning/utilities/arg_parse.py | 54 +++++++---- tests/debug.py | 3 +- tests/test_models.py | 39 +++++--- 15 files changed, 226 insertions(+), 130 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 875552539c..71d3c14c6a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,7 +18,7 @@ references: check-manifest --ignore tox.ini python setup.py check -m -s coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules - flake8 . --max-line-length=100 + flake8 . 
--max-line-length=120 codecov jobs: diff --git a/README.md b/README.md index 2d1914ff2c..bbe110ba0e 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ [![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning) [![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) [![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning) +[![CircleCI](https://circleci.com/gh/Borda/pytorch-lightning.svg?style=svg)](https://circleci.com/gh/Borda/pytorch-lightning) [![Build status](https://ci.appveyor.com/api/projects/status/rum89d7hq8l1kfye?svg=true)](https://ci.appveyor.com/project/Borda/pytorch-lightning) [![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning) [![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning) diff --git a/examples/new_project_templates/lightning_module_template.py b/examples/new_project_templates/lightning_module_template.py index a65500358a..d6bdd13068 100644 --- a/examples/new_project_templates/lightning_module_template.py +++ b/examples/new_project_templates/lightning_module_template.py @@ -47,11 +47,13 @@ class LightningTemplateModel(LightningModule): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # 
--------------------- # TRAINING @@ -171,8 +173,10 @@ class LightningTemplateModel(LightningModule): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -234,11 +238,15 @@ class LightningTemplateModel(LightningModule): parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/examples/new_project_templates/multi_node_cluster_template.py b/examples/new_project_templates/multi_node_cluster_template.py index 5f6914d107..cdbda0031d 100644 --- 
a/examples/new_project_templates/multi_node_cluster_template.py +++ b/examples/new_project_templates/multi_node_cluster_template.py @@ -10,12 +10,12 @@ from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from .lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main_local(hparams): main(hparams, None, None) @@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams): cluster.add_command('source activate lightning') # run only on 32GB voltas - cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus') - cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', + comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, + comment='use 32gb gpus') # run hopt # creates and submits jobs to slurm @@ -140,15 +142,23 @@ if __name__ == '__main__': parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual') # TODO: make 1 param - parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node') - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node') + parent_parser.add_argument('--per_experiment_nb_gpus', type=int, + help='how many gpus to use in a node') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node') - parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--slurm_log_path', 
type=str, default=slurm_out_dir, help='where to save slurm meta') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') - parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run') + parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, + help='how many nodes to use in a cluster') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, + help='where to save slurm meta') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') + parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, + help='how many grid search trials to run') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_cpu_template.py b/examples/new_project_templates/single_cpu_template.py index c9ad443536..9822f21601 100644 --- a/examples/new_project_templates/single_cpu_template.py +++ b/examples/new_project_templates/single_cpu_template.py @@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from .lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -90,9 +90,12 @@ if __name__ == '__main__': parent_parser = 
HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--test_tube_save_path', type=str, + default=test_tube_dir, help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, + default=checkpoint_dir, help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, + default='pt_lightning_exp_a', help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_gpu_node_16bit_template.py b/examples/new_project_templates/single_gpu_node_16bit_template.py index c2bf667491..137f0e4848 100644 --- a/examples/new_project_templates/single_gpu_node_16bit_template.py +++ b/examples/new_project_templates/single_gpu_node_16bit_template.py @@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from .lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ if __name__ == '__main__': parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. 
-1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_gpu_node_ddp_template.py b/examples/new_project_templates/single_gpu_node_ddp_template.py index 358b8c948b..e8f4601257 100644 --- a/examples/new_project_templates/single_gpu_node_ddp_template.py +++ b/examples/new_project_templates/single_gpu_node_ddp_template.py @@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from .lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ if __name__ == '__main__': parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. 
-1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_gpu_node_dp_template.py b/examples/new_project_templates/single_gpu_node_dp_template.py index 656924414c..f48df5ca87 100644 --- a/examples/new_project_templates/single_gpu_node_dp_template.py +++ b/examples/new_project_templates/single_gpu_node_dp_template.py @@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from .lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -91,10 +91,15 @@ if __name__ == '__main__': parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. 
-1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 7d2a925cbd..e23dfa0b6f 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -15,7 +15,8 @@ import torch.distributed as dist from ..root_module.memory import get_gpu_memory_map from ..root_module.model_saving import TrainerIO -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) from ..utilities.debugging import MisconfigurationException try: @@ -64,17 +65,20 @@ class Trainer(TrainerIO): check_val_every_n_epoch=1, fast_dev_run=False, accumulate_grad_batches=1, - max_nb_epochs=1000, min_nb_epochs=1, - train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, + max_nb_epochs=1000, + min_nb_epochs=1, + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0, 
val_check_interval=0.95, - log_save_interval=100, add_log_row_interval=10, + log_save_interval=100, + add_log_row_interval=10, distributed_backend='dp', use_amp=False, print_nan_grads=False, print_weights_summary=True, amp_level='O2', nb_sanity_val_steps=5): - """ :param experiment: Test-tube experiment @@ -100,16 +104,15 @@ class Trainer(TrainerIO): :param val_check_interval: :param log_save_interval: :param add_log_row_interval: - :param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel + :param distributed_backend: + 'dp' to use DataParallel, 'ddp' to use DistributedDataParallel :param use_amp: :param print_nan_grads: :param print_weights_summary: :param amp_level: :param nb_sanity_val_steps: """ - # Transfer params - self.nb_gpu_nodes = nb_gpu_nodes self.gradient_clip = gradient_clip self.check_val_every_n_epoch = check_val_every_n_epoch @@ -171,13 +174,13 @@ class Trainer(TrainerIO): # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids]) + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in + self.data_parallel_device_ids]) print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"]) # make DP and DDP mutually exclusive # single GPU will also use DP with devices=[0] - have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0 - if have_gpus: + if self.data_parallel_device_ids: self.use_dp = distributed_backend == 'dp' self.use_ddp = distributed_backend == 'ddp' @@ -224,7 +227,8 @@ class Trainer(TrainerIO): self.val_dataloader = None # how much of the data to use - self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) + self.__determine_data_use_amount(train_percent_check, val_percent_check, + test_percent_check, overfit_pct) print('gpu available: 
{}'.format(torch.cuda.is_available(), self.on_gpu)) # 16 bit mixed precision training using apex @@ -246,7 +250,8 @@ class Trainer(TrainerIO): def data_parallel(self): return self.use_dp or self.use_ddp - def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct): + def __determine_data_use_amount(self, train_percent_check, val_percent_check, + test_percent_check, overfit_pct): """ Use less data for debugging purposes """ @@ -388,17 +393,18 @@ class Trainer(TrainerIO): if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler): msg = """ - when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler). +when using multiple gpus and multiple nodes you must pass + a DistributedSampler to DataLoader(sampler). - ie: this: - dataset = myDataset() - dataloader = Dataloader(dataset) +ie: this: +dataset = myDataset() +dataloader = Dataloader(dataset) - becomes: - dataset = myDataset() - dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - dataloader = Dataloader(dataset, sampler=dist_sampler) - """ +becomes: +dataset = myDataset() +dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) +dataloader = Dataloader(dataset, sampler=dist_sampler) +""" raise MisconfigurationException(msg) # ----------------------------- @@ -408,7 +414,8 @@ class Trainer(TrainerIO): # when using multi-node or DDP within a node start each module in a separate process if self.use_ddp: - # must copy only the meta of the exp so it survives pickle/unpickle when going to new process + # must copy only the meta of the exp so it survives pickle/unpickle + # when going to new process self.experiment = self.experiment.get_meta_copy() if self.is_slurm_managing_tasks: @@ -416,11 +423,11 @@ class Trainer(TrainerIO): self.ddp_train(task, model) else: msg = """ - You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks. 
- We will launch %(nb_gpus)s processes for you. - We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s - If you're not using SLURM, ignore this message! - """ % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks} +You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks. +We will launch %(nb_gpus)s processes for you. +We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s +If you're not using SLURM, ignore this message! +""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks} warnings.warn(msg) mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, )) @@ -433,7 +440,8 @@ class Trainer(TrainerIO): else: # run through amp wrapper if self.use_amp: - raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option') + raise MisconfigurationException('amp + cpu is not supported.' + ' Please use a GPU option') # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -461,10 +469,10 @@ class Trainer(TrainerIO): # https://github.com/NVIDIA/apex/issues/227 if self.use_dp and self.use_amp: m = """ - Amp level %r with DataParallel is not supported. - See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. - We recommend you switch to ddp if you want to use amp - """ % self.amp_level +Amp level %r with DataParallel is not supported. +See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. 
+We recommend you switch to ddp if you want to use amp +""" % self.amp_level raise MisconfigurationException(m) model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids) @@ -527,7 +535,8 @@ class Trainer(TrainerIO): ) self.optimizers = optimizers - model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True) + model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], + find_unused_parameters=True) # continue training routine self.__run_pretrain_routine(model) @@ -642,7 +651,8 @@ class Trainer(TrainerIO): # init progbar when requested if self.progress_bar: - self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position) + self.prog_bar = tqdm.tqdm(range(self.total_batches), + position=self.process_position) for batch_nb, data_batch in enumerate(self.tng_dataloader): self.batch_nb = batch_nb @@ -651,7 +661,8 @@ class Trainer(TrainerIO): model = self.__get_model() model.global_step = self.global_step - # stop when the flag is changed or we've gone past the amount requested in the batches + # stop when the flag is changed or we've gone past the amount + # requested in the batches self.total_batch_nb += 1 met_batch_limit = batch_nb > self.nb_tng_batches if met_batch_limit: @@ -698,7 +709,8 @@ class Trainer(TrainerIO): model.on_tng_metrics(metrics) # log metrics - scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist()) + scalar_metrics = self.__metrics_to_scalars( + metrics, blacklist=self.__log_vals_blacklist()) if self.proc_rank == 0: self.experiment.log(scalar_metrics, global_step=self.global_step) self.experiment.save() @@ -720,7 +732,8 @@ class Trainer(TrainerIO): # early stopping met_min_epochs = epoch_nb > self.min_nb_epochs if self.enable_early_stop and met_min_epochs: - should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic) + should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, + 
logs=self.__tng_tqdm_dic) # stop training stop = should_stop and met_min_epochs @@ -828,7 +841,8 @@ class Trainer(TrainerIO): # clear gradients optimizer.zero_grad() - # queuing loss across batches blows it up proportionally... divide out the number accumulated + # queuing loss across batches blows it up proportionally... + # divide out the number accumulated self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches # track loss @@ -885,4 +899,5 @@ class Trainer(TrainerIO): # model checkpointing if self.proc_rank == 0 and self.checkpoint_callback is not None: print('save callback...') - self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic) + self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, + logs=self.__tng_tqdm_dic) diff --git a/pytorch_lightning/root_module/grads.py b/pytorch_lightning/root_module/grads.py index b28cf21cd4..7bdc8572b9 100644 --- a/pytorch_lightning/root_module/grads.py +++ b/pytorch_lightning/root_module/grads.py @@ -17,11 +17,13 @@ class GradInformation(nn.Module): total_norm += param_norm ** norm_type norm = param_norm ** (1 / norm_type) - results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3) + grad = round(norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_{}'.format(norm_type, i)] = grad except Exception: # this param had no grad pass total_norm = total_norm ** (1. 
/ norm_type) - results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3) + grad = round(total_norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_total'.format(norm_type)] = grad return results diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index 0bde0943f4..0765142cd5 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -3,7 +3,8 @@ import re import torch -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) class ModelIO(object): @@ -45,7 +46,8 @@ class ModelIO(object): class TrainerIO(object): def __get_model(self): - is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel + is_dp_module = isinstance(self.model, (LightningDistributedDataParallel, + LightningDataParallel)) model = self.model.module if is_dp_module else self.model return model diff --git a/pytorch_lightning/testing/lm_test_module.py b/pytorch_lightning/testing/lm_test_module.py index 24995c7f44..61ecf874f3 100644 --- a/pytorch_lightning/testing/lm_test_module.py +++ b/pytorch_lightning/testing/lm_test_module.py @@ -48,11 +48,13 @@ class LightningTestModel(LightningModule): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # 
--------------------- # TRAINING @@ -191,8 +193,10 @@ class LightningTestModel(LightningModule): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -251,11 +255,15 @@ class LightningTestModel(LightningModule): parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/pytorch_lightning/utilities/arg_parse.py b/pytorch_lightning/utilities/arg_parse.py index 44399c39ac..39d4ec81f9 100644 --- a/pytorch_lightning/utilities/arg_parse.py +++ 
b/pytorch_lightning/utilities/arg_parse.py @@ -9,29 +9,40 @@ import os def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None): # tng, test, val check intervals - parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also') - parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs') + parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', + help='true = run test set also') + parser.add_argument('--check_val_every_n_epoch', default=1, type=int, + help='check val every n epochs') parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False, - help='accumulates gradients k times before applying update. Simulates huge batch size') + help='accumulates gradients k times before applying update.' + ' Simulates huge batch size') parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs') parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs') - parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check') - parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check') - parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check') + parser.add_argument('--train_percent_check', default=1.0, type=float, + help='how much of tng set to check') + parser.add_argument('--val_percent_check', default=1.0, type=float, + help='how much of val set to check') + parser.add_argument('--test_percent_check', default=1.0, type=float, + help='how much of test set to check') - parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val') - parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves') - parser.add_argument('--add_log_row_interval', 
default=100, type=int, help='add log every k batches') + parser.add_argument('--val_check_interval', default=0.95, type=float, + help='how much within 1 epoch to check val') + parser.add_argument('--log_save_interval', default=100, type=int, + help='how many batches between log saves') + parser.add_argument('--add_log_row_interval', default=100, type=int, + help='add log every k batches') # early stopping parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false') parser.add_argument('--early_stop_metric', default='val_acc', type=str) parser.add_argument('--early_stop_mode', default='min', type=str) - parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop') + parser.add_argument('--early_stop_patience', default=3, type=int, + help='number of epochs until stop') # gradient handling parser.add_argument('--gradient_clip', default=-1, type=int) - parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm') + parser.add_argument('--track_grad_norm', default=-1, type=int, + help='if > 0, will track this grad norm') # model saving parser.add_argument('--model_save_path', default=root_dir + '/model_weights') @@ -47,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None # test_tube settings parser.add_argument('-en', '--tt_name', default='pt_test') parser.add_argument('-td', '--tt_description', default='pytorch lightning test') - parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), help='logging dir') + parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), + help='logging dir') parser.add_argument('--enable_single_run', dest='single_run', action='store_true') parser.add_argument('--nb_hopt_trials', default=1, type=int) parser.add_argument('--log_stdout', dest='log_stdout', action='store_true') @@ -65,17 +77,23 @@ def add_default_args(parser, root_dir, 
rand_seed=None, possible_model_names=None # FAST training # use these settings to make sure network has no bugs without running a full dataset - parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step') - parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar') - parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none') + parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', + help='runs validation after 1 tng step') + parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', + help='false removes the prog bar') + parser.add_argument('--overfit', default=-1, type=float, + help='% of dataset to use with this option. float, or -1 for none') # debug args if rand_seed is not None: parser.add_argument('--random_seed', default=rand_seed, type=int) - parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster') - parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube') - parser.add_argument('--local', dest='local', action='store_true', help='enables local tng') + parser.add_argument('--interactive', dest='interactive', action='store_true', + help='runs on gpu without cluster') + parser.add_argument('--debug', dest='debug', action='store_true', + help='enables/disables test tube') + parser.add_argument('--local', dest='local', action='store_true', + help='enables local tng') # optimizer parser.add_argument('--lr_scheduler_milestones', default=None, type=str) diff --git a/tests/debug.py b/tests/debug.py index d068e63e5b..6a5efbecfa 100644 --- a/tests/debug.py +++ b/tests/debug.py @@ -107,7 +107,8 @@ def load_model(exp, save_dir): checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x] 
weights_dir = os.path.join(save_dir, checkpoints[0]) - trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True) + trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, + tags_csv=tags_path, on_gpu=True) assert trained_model is not None, 'loading model failed' diff --git a/tests/test_models.py b/tests/test_models.py index 59c4577abe..044f2391bb 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -30,10 +30,12 @@ def test_amp_gpu_ddp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) @@ -105,7 +107,8 @@ def test_cpu_slurm_save_load(): # wipe-out trainer and model # retrain with not much data...
this simulates picking training back up after slurm # we want to see if the weights come back correctly - continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number) + continue_tng_hparams = get_hparams(continue_training=True, + hpc_exp_number=cluster_a.hpc_exp_number) trainer_options = dict( max_nb_epochs=1, cluster=SlurmCluster(continue_tng_hparams), @@ -219,7 +222,8 @@ def test_model_saving_loading(): # load new model tags_path = exp.get_data_path(exp.name, exp.version) tags_path = os.path.join(tags_path, 'meta_tags.csv') - model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False) + model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, + tags_csv=tags_path, on_gpu=False) model_2.eval() # make prediction @@ -244,10 +248,12 @@ def test_amp_gpu_ddp_slurm_managed(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return # simulate setting slurm flags @@ -411,7 +417,8 @@ def test_single_gpu_model(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_single_gpu_model cannot run.' + ' Rerun on a GPU node to run this test') return model, hparams = get_model() @@ -432,10 +439,12 @@ def test_multi_gpu_model_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' 
+ ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -458,10 +467,12 @@ def test_amp_gpu_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_dp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -480,10 +491,12 @@ def test_multi_gpu_model_ddp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])