update by flake8
parent 4e0b9c50e7
commit d9bfe964f9
@@ -18,7 +18,7 @@ references:
 check-manifest --ignore tox.ini
 python setup.py check -m -s
 coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
-flake8 . --max-line-length=100
+flake8 . --max-line-length=120
 codecov

 jobs:
@@ -8,6 +8,7 @@
 [![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning)
+[![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning)
 [![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning)
 [![CircleCI](https://circleci.com/gh/Borda/pytorch-lightning.svg?style=svg)](https://circleci.com/gh/Borda/pytorch-lightning)
 [![Build status](https://ci.appveyor.com/api/projects/status/rum89d7hq8l1kfye?svg=true)](https://ci.appveyor.com/project/Borda/pytorch-lightning)
 [![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning)
 [![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning)
@@ -47,11 +47,13 @@ class LightningTemplateModel(LightningModule):
 Layout model
 :return:
 """
-self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
+self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
+out_features=self.hparams.hidden_dim)
 self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
 self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)

-self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
+self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
+out_features=self.hparams.out_features)

 # ---------------------
 # TRAINING
@@ -171,8 +173,10 @@ class LightningTemplateModel(LightningModule):

 def __dataloader(self, train):
 # init data generators
-transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
-dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
+transform = transforms.Compose([transforms.ToTensor(),
+transforms.Normalize((0.5,), (1.0,))])
+dataset = MNIST(root=self.hparams.data_root, train=train,
+transform=transform, download=True)

 # when using multi-node we need to add the datasampler
 train_sampler = None
@@ -234,11 +238,15 @@ class LightningTemplateModel(LightningModule):
 parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

 # training params (opt)
-parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
+parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+options=[0.0001, 0.0005, 0.001, 0.005],
 tunable=False)
-parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
+parser.opt_list('--optimizer_name', default='adam', type=str,
+options=['adam'], tunable=False)

-# if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
-parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
-help='batch size will be divided over all the gpus being used across all nodes')
+# if using 2 nodes with 4 gpus each the batch size here
+# (256) will be 256 / (2*8) = 16 per gpu
+parser.opt_list('--batch_size', default=256 * 8, type=int,
+options=[32, 64, 128, 256], tunable=False,
+help='batch size will be divided over all gpus being used across all nodes')
 return parser
@@ -10,12 +10,12 @@ from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main_local(hparams):
 main(hparams, None, None)
@@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams):
 cluster.add_command('source activate lightning')

 # run only on 32GB voltas
-cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus')
-cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus')
+cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
+comment='use 32gb gpus')
+cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
+comment='use 32gb gpus')

 # run hopt
 # creates and submits jobs to slurm
@@ -140,15 +142,23 @@ if __name__ == '__main__':
 parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')

 # TODO: make 1 param
-parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node')
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node')
+parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
+help='how many gpus to use in a node')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node')

-parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
-parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run')
+parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
+help='how many nodes to use in a cluster')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir,
+help='where to save slurm meta')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')
+parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
+help='how many grid search trials to run')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -90,9 +90,12 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--test_tube_save_path', type=str,
+default=test_tube_dir, help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str,
+default=checkpoint_dir, help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str,
+default='pt_lightning_exp_a', help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -92,10 +92,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+'value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -92,10 +92,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+' value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -91,10 +91,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+' value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -15,7 +15,8 @@ import torch.distributed as dist

 from ..root_module.memory import get_gpu_memory_map
 from ..root_module.model_saving import TrainerIO
-from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
+from ..pt_overrides.override_data_parallel import (
+LightningDistributedDataParallel, LightningDataParallel)
 from ..utilities.debugging import MisconfigurationException

 try:
@@ -64,17 +65,20 @@ class Trainer(TrainerIO):
 check_val_every_n_epoch=1,
 fast_dev_run=False,
 accumulate_grad_batches=1,
-max_nb_epochs=1000, min_nb_epochs=1,
-train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0,
+max_nb_epochs=1000,
+min_nb_epochs=1,
+train_percent_check=1.0,
+val_percent_check=1.0,
+test_percent_check=1.0,
 val_check_interval=0.95,
-log_save_interval=100, add_log_row_interval=10,
+log_save_interval=100,
+add_log_row_interval=10,
 distributed_backend='dp',
 use_amp=False,
 print_nan_grads=False,
 print_weights_summary=True,
 amp_level='O2',
 nb_sanity_val_steps=5):
-
 """

 :param experiment: Test-tube experiment
@@ -100,16 +104,15 @@ class Trainer(TrainerIO):
 :param val_check_interval:
 :param log_save_interval:
 :param add_log_row_interval:
-:param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel
+:param distributed_backend:
+'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel
 :param use_amp:
 :param print_nan_grads:
 :param print_weights_summary:
 :param amp_level:
 :param nb_sanity_val_steps:
 """
-
 # Transfer params
-
 self.nb_gpu_nodes = nb_gpu_nodes
 self.gradient_clip = gradient_clip
 self.check_val_every_n_epoch = check_val_every_n_epoch
@@ -171,13 +174,13 @@ class Trainer(TrainerIO):

 # set the correct cuda visible devices (using pci order)
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids])
+os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in
+self.data_parallel_device_ids])
 print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"])

 # make DP and DDP mutually exclusive
 # single GPU will also use DP with devices=[0]
-have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0
-if have_gpus:
+if self.data_parallel_device_ids:
 self.use_dp = distributed_backend == 'dp'
 self.use_ddp = distributed_backend == 'ddp'
@@ -224,7 +227,8 @@ class Trainer(TrainerIO):
 self.val_dataloader = None

 # how much of the data to use
-self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct)
+self.__determine_data_use_amount(train_percent_check, val_percent_check,
+test_percent_check, overfit_pct)
 print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))

 # 16 bit mixed precision training using apex
@@ -246,7 +250,8 @@ class Trainer(TrainerIO):
 def data_parallel(self):
 return self.use_dp or self.use_ddp

-def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct):
+def __determine_data_use_amount(self, train_percent_check, val_percent_check,
+test_percent_check, overfit_pct):
 """
 Use less data for debugging purposes
 """
@@ -388,17 +393,18 @@ class Trainer(TrainerIO):

 if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler):
 msg = """
-when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler).
+when using multiple gpus and multiple nodes you must pass
+a DistributedSampler to DataLoader(sampler).

-ie: this:
-dataset = myDataset()
-dataloader = Dataloader(dataset)
+ie: this:
+dataset = myDataset()
+dataloader = Dataloader(dataset)

-becomes:
-dataset = myDataset()
-dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-dataloader = Dataloader(dataset, sampler=dist_sampler)
-"""
+becomes:
+dataset = myDataset()
+dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+dataloader = Dataloader(dataset, sampler=dist_sampler)
+"""
 raise MisconfigurationException(msg)

 # -----------------------------
@@ -408,7 +414,8 @@ class Trainer(TrainerIO):

 # when using multi-node or DDP within a node start each module in a separate process
 if self.use_ddp:
-# must copy only the meta of the exp so it survives pickle/unpickle when going to new process
+# must copy only the meta of the exp so it survives pickle/unpickle
+# when going to new process
 self.experiment = self.experiment.get_meta_copy()

 if self.is_slurm_managing_tasks:
@@ -416,11 +423,11 @@ class Trainer(TrainerIO):
 self.ddp_train(task, model)
 else:
 msg = """
-You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
-We will launch %(nb_gpus)s processes for you.
-We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
-If you're not using SLURM, ignore this message!
-""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
+You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
+We will launch %(nb_gpus)s processes for you.
+We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
+If you're not using SLURM, ignore this message!
+""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
 warnings.warn(msg)
 mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
@@ -433,7 +440,8 @@ class Trainer(TrainerIO):
 else:
 # run through amp wrapper
 if self.use_amp:
-raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')
+raise MisconfigurationException('amp + cpu is not supported.'
+' Please use a GPU option')

 # CHOOSE OPTIMIZER
 # allow for lr schedulers as well
@@ -461,10 +469,10 @@ class Trainer(TrainerIO):
 # https://github.com/NVIDIA/apex/issues/227
 if self.use_dp and self.use_amp:
 m = """
-Amp level %r with DataParallel is not supported.
-See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
-We recommend you switch to ddp if you want to use amp
-""" % self.amp_level
+Amp level %r with DataParallel is not supported.
+See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
+We recommend you switch to ddp if you want to use amp
+""" % self.amp_level
 raise MisconfigurationException(m)

 model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)
@@ -527,7 +535,8 @@ class Trainer(TrainerIO):
 )
 self.optimizers = optimizers

-model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True)
+model = LightningDistributedDataParallel(model, device_ids=[gpu_nb],
+find_unused_parameters=True)

 # continue training routine
 self.__run_pretrain_routine(model)
@@ -642,7 +651,8 @@ class Trainer(TrainerIO):

 # init progbar when requested
 if self.progress_bar:
-self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position)
+self.prog_bar = tqdm.tqdm(range(self.total_batches),
+position=self.process_position)

 for batch_nb, data_batch in enumerate(self.tng_dataloader):
 self.batch_nb = batch_nb
@@ -651,7 +661,8 @@ class Trainer(TrainerIO):
 model = self.__get_model()
 model.global_step = self.global_step

-# stop when the flag is changed or we've gone past the amount requested in the batches
+# stop when the flag is changed or we've gone past the amount
+# requested in the batches
 self.total_batch_nb += 1
 met_batch_limit = batch_nb > self.nb_tng_batches
 if met_batch_limit:
@@ -698,7 +709,8 @@ class Trainer(TrainerIO):
 model.on_tng_metrics(metrics)

 # log metrics
-scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist())
+scalar_metrics = self.__metrics_to_scalars(
+metrics, blacklist=self.__log_vals_blacklist())
 if self.proc_rank == 0:
 self.experiment.log(scalar_metrics, global_step=self.global_step)
 self.experiment.save()
@@ -720,7 +732,8 @@ class Trainer(TrainerIO):
 # early stopping
 met_min_epochs = epoch_nb > self.min_nb_epochs
 if self.enable_early_stop and met_min_epochs:
-should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic)
+should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb,
+logs=self.__tng_tqdm_dic)

 # stop training
 stop = should_stop and met_min_epochs
@@ -828,7 +841,8 @@ class Trainer(TrainerIO):
 # clear gradients
 optimizer.zero_grad()

-# queuing loss across batches blows it up proportionally... divide out the number accumulated
+# queuing loss across batches blows it up proportionally...
+# divide out the number accumulated
 self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches

 # track loss
@@ -885,4 +899,5 @@ class Trainer(TrainerIO):
 # model checkpointing
 if self.proc_rank == 0 and self.checkpoint_callback is not None:
 print('save callback...')
-self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic)
+self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch,
+logs=self.__tng_tqdm_dic)
@@ -17,11 +17,13 @@ class GradInformation(nn.Module):
 total_norm += param_norm ** norm_type
 norm = param_norm ** (1 / norm_type)

-results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3)
+grad = round(norm.data.cpu().numpy().flatten()[0], 3)
+results['grad_{}_norm_{}'.format(norm_type, i)] = grad
 except Exception:
 # this param had no grad
 pass

 total_norm = total_norm ** (1. / norm_type)
-results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3)
+grad = round(total_norm.data.cpu().numpy().flatten()[0], 3)
+results['grad_{}_norm_total'.format(norm_type)] = grad
 return results
@@ -3,7 +3,8 @@ import re

 import torch

-from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
+from ..pt_overrides.override_data_parallel import (
+LightningDistributedDataParallel, LightningDataParallel)


 class ModelIO(object):
@@ -45,7 +46,8 @@ class ModelIO(object):
 class TrainerIO(object):

 def __get_model(self):
-is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel
+is_dp_module = isinstance(self.model, (LightningDistributedDataParallel,
+LightningDataParallel))
 model = self.model.module if is_dp_module else self.model
 return model
@@ -48,11 +48,13 @@ class LightningTestModel(LightningModule):
 Layout model
 :return:
 """
-self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
+self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
+out_features=self.hparams.hidden_dim)
 self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
 self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)

-self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
+self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
+out_features=self.hparams.out_features)

 # ---------------------
 # TRAINING
@@ -191,8 +193,10 @@ class LightningTestModel(LightningModule):

 def __dataloader(self, train):
 # init data generators
-transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
-dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
+transform = transforms.Compose([transforms.ToTensor(),
+transforms.Normalize((0.5,), (1.0,))])
+dataset = MNIST(root=self.hparams.data_root, train=train,
+transform=transform, download=True)

 # when using multi-node we need to add the datasampler
 train_sampler = None
@@ -251,11 +255,15 @@ class LightningTestModel(LightningModule):
 parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

 # training params (opt)
-parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
+parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+options=[0.0001, 0.0005, 0.001, 0.005],
 tunable=False)
-parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
+parser.opt_list('--optimizer_name', default='adam', type=str,
+options=['adam'], tunable=False)

-# if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
-parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
-help='batch size will be divided over all the gpus being used across all nodes')
+# if using 2 nodes with 4 gpus each the batch size here
+# (256) will be 256 / (2*8) = 16 per gpu
+parser.opt_list('--batch_size', default=256 * 8, type=int,
+options=[32, 64, 128, 256], tunable=False,
+help='batch size will be divided over all gpus being used across all nodes')
 return parser
@@ -9,29 +9,40 @@ import os
 def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None):

 # tng, test, val check intervals
-parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also')
-parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs')
+parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true',
+help='true = run test set also')
+parser.add_argument('--check_val_every_n_epoch', default=1, type=int,
+help='check val every n epochs')
 parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False,
-help='accumulates gradients k times before applying update. Simulates huge batch size')
+help='accumulates gradients k times before applying update.'
+' Simulates huge batch size')
 parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs')
 parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs')
-parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check')
-parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check')
-parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check')
+parser.add_argument('--train_percent_check', default=1.0, type=float,
+help='how much of tng set to check')
+parser.add_argument('--val_percent_check', default=1.0, type=float,
+help='how much of val set to check')
+parser.add_argument('--test_percent_check', default=1.0, type=float,
+help='how much of test set to check')

-parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val')
-parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves')
-parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches')
+parser.add_argument('--val_check_interval', default=0.95, type=float,
+help='how much within 1 epoch to check val')
+parser.add_argument('--log_save_interval', default=100, type=int,
+help='how many batches between log saves')
+parser.add_argument('--add_log_row_interval', default=100, type=int,
+help='add log every k batches')

 # early stopping
 parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false')
 parser.add_argument('--early_stop_metric', default='val_acc', type=str)
 parser.add_argument('--early_stop_mode', default='min', type=str)
-parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop')
+parser.add_argument('--early_stop_patience', default=3, type=int,
+help='number of epochs until stop')

 # gradient handling
 parser.add_argument('--gradient_clip', default=-1, type=int)
-parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm')
+parser.add_argument('--track_grad_norm', default=-1, type=int,
+help='if > 0, will track this grad norm')

 # model saving
 parser.add_argument('--model_save_path', default=root_dir + '/model_weights')
@@ -47,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None
 # test_tube settings
 parser.add_argument('-en', '--tt_name', default='pt_test')
 parser.add_argument('-td', '--tt_description', default='pytorch lightning test')
-parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), help='logging dir')
+parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'),
+help='logging dir')
 parser.add_argument('--enable_single_run', dest='single_run', action='store_true')
 parser.add_argument('--nb_hopt_trials', default=1, type=int)
 parser.add_argument('--log_stdout', dest='log_stdout', action='store_true')
@@ -65,17 +77,23 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None

 # FAST training
 # use these settings to make sure network has no bugs without running a full dataset
-parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step')
-parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar')
-parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none')
+parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true',
+help='runs validation after 1 tng step')
+parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true',
+help='false removes the prog bar')
+parser.add_argument('--overfit', default=-1, type=float,
+help='% of dataset to use with this option. float, or -1 for none')

 # debug args
 if rand_seed is not None:
 parser.add_argument('--random_seed', default=rand_seed, type=int)

-parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster')
-parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube')
-parser.add_argument('--local', dest='local', action='store_true', help='enables local tng')
+parser.add_argument('--interactive', dest='interactive', action='store_true',
+help='runs on gpu without cluster')
+parser.add_argument('--debug', dest='debug', action='store_true',
+help='enables/disables test tube')
+parser.add_argument('--local', dest='local', action='store_true',
+help='enables local tng')

 # optimizer
 parser.add_argument('--lr_scheduler_milestones', default=None, type=str)
@@ -107,7 +107,8 @@ def load_model(exp, save_dir):
 checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x]
 weights_dir = os.path.join(save_dir, checkpoints[0])

-trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True)
+trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir,
+tags_csv=tags_path, on_gpu=True)

 assert trained_model is not None, 'loading model failed'
@@ -30,10 +30,12 @@ def test_amp_gpu_ddp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+'Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+'Rerun on a node with 2+ GPUs to run this test')
 return

 os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
@@ -105,7 +107,8 @@ def test_cpu_slurm_save_load():
 # wipe-out trainer and model
 # retrain with not much data... this simulates picking training back up after slurm
 # we want to see if the weights come back correctly
-continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number)
+continue_tng_hparams = get_hparams(continue_training=True,
+hpc_exp_number=cluster_a.hpc_exp_number)
 trainer_options = dict(
 max_nb_epochs=1,
 cluster=SlurmCluster(continue_tng_hparams),
@@ -219,7 +222,8 @@ def test_model_saving_loading():
 # load new model
 tags_path = exp.get_data_path(exp.name, exp.version)
 tags_path = os.path.join(tags_path, 'meta_tags.csv')
-model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False)
+model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
+tags_csv=tags_path, on_gpu=False)
 model_2.eval()

 # make prediction
@@ -244,10 +248,12 @@ def test_amp_gpu_ddp_slurm_managed():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return

 # simulate setting slurm flags
@@ -411,7 +417,8 @@ def test_single_gpu_model():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_single_gpu_model cannot run.'
+' Rerun on a GPU node to run this test')
 return
 model, hparams = get_model()
@@ -432,10 +439,12 @@ def test_multi_gpu_model_dp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_multi_gpu_model_dp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_multi_gpu_model_dp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return
 model, hparams = get_model()
 trainer_options = dict(
@@ -458,10 +467,12 @@ def test_amp_gpu_dp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_dp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_dp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return
 model, hparams = get_model()
 trainer_options = dict(
@@ -480,10 +491,12 @@ def test_multi_gpu_model_ddp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_multi_gpu_model_ddp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_multi_gpu_model_ddp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return

 os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])