update by flake8

Jiri BOROVEC 2019-08-06 12:08:31 +02:00
parent 4e0b9c50e7
commit d9bfe964f9
15 changed files with 226 additions and 130 deletions

View File

@@ -18,7 +18,7 @@ references:
check-manifest --ignore tox.ini
python setup.py check -m -s
coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
flake8 . --max-line-length=100
flake8 . --max-line-length=120
codecov
jobs:

View File

@@ -8,6 +8,7 @@
[![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning)
[![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning)
[![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning)
[![CircleCI](https://circleci.com/gh/Borda/pytorch-lightning.svg?style=svg)](https://circleci.com/gh/Borda/pytorch-lightning)
[![Build status](https://ci.appveyor.com/api/projects/status/rum89d7hq8l1kfye?svg=true)](https://ci.appveyor.com/project/Borda/pytorch-lightning)
[![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning)
[![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning)

View File

@@ -47,11 +47,13 @@ class LightningTemplateModel(LightningModule):
Layout model
:return:
"""
self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
out_features=self.hparams.hidden_dim)
self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)
self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
out_features=self.hparams.out_features)
# ---------------------
# TRAINING
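For orientation, the layers defined in this hunk are usually chained in the template's forward pass roughly as sketched below; this is an illustrative reconstruction assuming the standard MNIST flattening, not the file's actual forward body:

```python
import torch
import torch.nn.functional as F

def forward(self, x):
    # flatten 28x28 MNIST images, then apply the layer stack defined above
    x = self.c_d1(x.view(x.size(0), -1))
    x = torch.tanh(x)
    x = self.c_d1_bn(x)
    x = self.c_d1_drop(x)
    x = self.c_d2(x)
    return F.log_softmax(x, dim=1)
```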
@@ -171,8 +173,10 @@ class LightningTemplateModel(LightningModule):
def __dataloader(self, train):
# init data generators
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.5,), (1.0,))])
dataset = MNIST(root=self.hparams.data_root, train=train,
transform=transform, download=True)
# when using multi-node we need to add the datasampler
train_sampler = None
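The trailing comment marks where the multi-node path attaches a sampler. A minimal sketch of that pattern, assuming the module exposes use_ddp and hparams as the full file does:

```python
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

if self.use_ddp:
    # each DDP process reads a disjoint shard of the dataset
    train_sampler = DistributedSampler(dataset)

loader = DataLoader(dataset=dataset,
                    batch_size=self.hparams.batch_size,
                    shuffle=(train_sampler is None),
                    sampler=train_sampler)
```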
@@ -234,11 +238,15 @@ class LightningTemplateModel(LightningModule):
parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
# training params (opt)
parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
options=[0.0001, 0.0005, 0.001, 0.005],
tunable=False)
parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
parser.opt_list('--optimizer_name', default='adam', type=str,
options=['adam'], tunable=False)
# if using 2 nodes with 8 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
help='batch size will be divided over all the gpus being used across all nodes')
# if using 2 nodes with 8 gpus each the batch size here
# (256) will be 256 / (2*8) = 16 per gpu
parser.opt_list('--batch_size', default=256 * 8, type=int,
options=[32, 64, 128, 256], tunable=False,
help='batch size will be divided over all gpus being used across all nodes')
return parser
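As a worked version of the batch-size arithmetic in the comment above (the divisor is the total GPU count across all nodes; the figures follow the comment):

```python
batch_size = 256                 # the batch referenced in the comment
nb_nodes, gpus_per_node = 2, 8
per_gpu_batch = batch_size // (nb_nodes * gpus_per_node)
print(per_gpu_batch)             # 16
```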

View File

@@ -10,12 +10,12 @@ from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from .lightning_module_template import LightningTemplateModel
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
from .lightning_module_template import LightningTemplateModel
def main_local(hparams):
main(hparams, None, None)
@@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams):
cluster.add_command('source activate lightning')
# run only on 32GB voltas
cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus')
cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus')
cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
comment='use 32gb gpus')
cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
comment='use 32gb gpus')
# run hopt
# creates and submits jobs to slurm
@@ -140,15 +142,23 @@ if __name__ == '__main__':
parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
# TODO: make 1 param
parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node')
parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node')
parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
help='how many gpus to use in a node')
parent_parser.add_argument('--gpus', type=str, default='-1',
help='how many gpus to use in the node')
parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run')
parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
help='how many nodes to use in a cluster')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
help='where to save logs')
parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir,
help='where to save slurm meta')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
help='test tube exp name')
parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
help='how many grid search trials to run')
# allow model to overwrite or extend args
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
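For context, the SLURM flags reformatted in optimize_on_cluster above typically feed a submission call like the following; a sketch assuming test_tube's SlurmCluster API as imported at the top of this file:

```python
from test_tube import SlurmCluster

cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path=hyperparams.slurm_log_path)
cluster.add_command('source activate lightning')
cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
                      comment='use 32gb gpus')
# submits one job per hyperparameter trial
cluster.optimize_parallel_cluster_gpu(main, nb_trials=hyperparams.nb_hopt_trials,
                                      job_name=hyperparams.experiment_name)
```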

View File

@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from .lightning_module_template import LightningTemplateModel
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
from .lightning_module_template import LightningTemplateModel
def main(hparams):
"""
@@ -90,9 +90,12 @@ if __name__ == '__main__':
parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
# gpu args
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--test_tube_save_path', type=str,
default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str,
default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str,
default='pt_lightning_exp_a', help='test tube exp name')
# allow model to overwrite or extend args
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)

View File

@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from .lightning_module_template import LightningTemplateModel
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
from .lightning_module_template import LightningTemplateModel
def main(hparams):
"""
@@ -92,10 +92,15 @@ if __name__ == '__main__':
parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
# gpu args
parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--gpus', type=str, default='-1',
help='how many gpus to use in the node.'
' value -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
help='test tube exp name')
# allow model to overwrite or extend args
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)

View File

@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from .lightning_module_template import LightningTemplateModel
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
from .lightning_module_template import LightningTemplateModel
def main(hparams):
"""
@@ -92,10 +92,15 @@ if __name__ == '__main__':
parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
# gpu args
parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--gpus', type=str, default='-1',
help='how many gpus to use in the node.'
' value -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
help='test tube exp name')
# allow model to overwrite or extend args
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)

View File

@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from .lightning_module_template import LightningTemplateModel
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
from .lightning_module_template import LightningTemplateModel
def main(hparams):
"""
@@ -91,10 +91,15 @@ if __name__ == '__main__':
parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
# gpu args
parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--gpus', type=str, default='-1',
help='how many gpus to use in the node.'
' value -1 uses all the gpus on the node')
parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
help='where to save logs')
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
help='where to save model')
parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
help='test tube exp name')
# allow model to overwrite or extend args
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)

View File

@@ -15,7 +15,8 @@ import torch.distributed as dist
from ..root_module.memory import get_gpu_memory_map
from ..root_module.model_saving import TrainerIO
from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
from ..pt_overrides.override_data_parallel import (
LightningDistributedDataParallel, LightningDataParallel)
from ..utilities.debugging import MisconfigurationException
try:
@@ -64,17 +65,20 @@ class Trainer(TrainerIO):
check_val_every_n_epoch=1,
fast_dev_run=False,
accumulate_grad_batches=1,
max_nb_epochs=1000, min_nb_epochs=1,
train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0,
max_nb_epochs=1000,
min_nb_epochs=1,
train_percent_check=1.0,
val_percent_check=1.0,
test_percent_check=1.0,
val_check_interval=0.95,
log_save_interval=100, add_log_row_interval=10,
log_save_interval=100,
add_log_row_interval=10,
distributed_backend='dp',
use_amp=False,
print_nan_grads=False,
print_weights_summary=True,
amp_level='O2',
nb_sanity_val_steps=5):
"""
:param experiment: Test-tube experiment
@@ -100,16 +104,15 @@ class Trainer(TrainerIO):
:param val_check_interval:
:param log_save_interval:
:param add_log_row_interval:
:param distributed_backend: 'dp' to use DataParallel, 'ddp' to use DistributedDataParallel
:param distributed_backend:
'dp' to use DataParallel, 'ddp' to use DistributedDataParallel
:param use_amp:
:param print_nan_grads:
:param print_weights_summary:
:param amp_level:
:param nb_sanity_val_steps:
"""
# Transfer params
self.nb_gpu_nodes = nb_gpu_nodes
self.gradient_clip = gradient_clip
self.check_val_every_n_epoch = check_val_every_n_epoch
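Taken together, the constructor reformatted above is typically driven like this; a sketch assuming a test_tube Experiment and any LightningModule instance, with the defaults shown above left in place:

```python
from test_tube import Experiment
from pytorch_lightning.models.trainer import Trainer

exp = Experiment(save_dir='/tmp/pt_lightning')  # hypothetical save dir
trainer = Trainer(experiment=exp,
                  max_nb_epochs=1000,
                  distributed_backend='dp',
                  use_amp=False)
trainer.fit(model)  # model: any LightningModule instance
```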
@@ -171,13 +174,13 @@ class Trainer(TrainerIO):
# set the correct cuda visible devices (using pci order)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids])
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in
self.data_parallel_device_ids])
print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"])
# make DP and DDP mutually exclusive
# single GPU will also use DP with devices=[0]
have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0
if have_gpus:
if self.data_parallel_device_ids:
self.use_dp = distributed_backend == 'dp'
self.use_ddp = distributed_backend == 'ddp'
@@ -224,7 +227,8 @@ class Trainer(TrainerIO):
self.val_dataloader = None
# how much of the data to use
self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct)
self.__determine_data_use_amount(train_percent_check, val_percent_check,
test_percent_check, overfit_pct)
print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
# 16 bit mixed precision training using apex
@@ -246,7 +250,8 @@ class Trainer(TrainerIO):
def data_parallel(self):
return self.use_dp or self.use_ddp
def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct):
def __determine_data_use_amount(self, train_percent_check, val_percent_check,
test_percent_check, overfit_pct):
"""
Use less data for debugging purposes
"""
@@ -388,17 +393,18 @@ class Trainer(TrainerIO):
if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler):
msg = """
when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler).
when using multiple gpus and multiple nodes you must pass
a DistributedSampler to DataLoader(sampler).
ie: this:
dataset = myDataset()
dataloader = DataLoader(dataset)
ie: this:
dataset = myDataset()
dataloader = DataLoader(dataset)
becomes:
dataset = myDataset()
dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=dist_sampler)
"""
becomes:
dataset = myDataset()
dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=dist_sampler)
"""
raise MisconfigurationException(msg)
# -----------------------------
@@ -408,7 +414,8 @@ class Trainer(TrainerIO):
# when using multi-node or DDP within a node start each module in a separate process
if self.use_ddp:
# must copy only the meta of the exp so it survives pickle/unpickle when going to new process
# must copy only the meta of the exp so it survives pickle/unpickle
# when going to new process
self.experiment = self.experiment.get_meta_copy()
if self.is_slurm_managing_tasks:
@@ -416,11 +423,11 @@ class Trainer(TrainerIO):
self.ddp_train(task, model)
else:
msg = """
You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
We will launch %(nb_gpus)s processes for you.
We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
If you're not using SLURM, ignore this message!
""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
We will launch %(nb_gpus)s processes for you.
We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
If you're not using SLURM, ignore this message!
""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
warnings.warn(msg)
mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
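For reference, torch.multiprocessing.spawn as used above launches nprocs copies of the target function, passing each copy its process index as the first argument; a minimal standalone sketch:

```python
import torch.multiprocessing as mp

def worker(proc_idx, message):
    # proc_idx is injected by mp.spawn; the remaining args come from `args`
    print('process %d: %s' % (proc_idx, message))

if __name__ == '__main__':
    mp.spawn(worker, nprocs=2, args=('hello',))
```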
@@ -433,7 +440,8 @@ class Trainer(TrainerIO):
else:
# run through amp wrapper
if self.use_amp:
raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')
raise MisconfigurationException('amp + cpu is not supported.'
' Please use a GPU option')
# CHOOSE OPTIMIZER
# allow for lr schedulers as well
@@ -461,10 +469,10 @@ class Trainer(TrainerIO):
# https://github.com/NVIDIA/apex/issues/227
if self.use_dp and self.use_amp:
m = """
Amp level %r with DataParallel is not supported.
See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
We recommend you switch to ddp if you want to use amp
""" % self.amp_level
Amp level %r with DataParallel is not supported.
See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
We recommend you switch to ddp if you want to use amp
""" % self.amp_level
raise MisconfigurationException(m)
model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)
@@ -527,7 +535,8 @@ class Trainer(TrainerIO):
)
self.optimizers = optimizers
model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True)
model = LightningDistributedDataParallel(model, device_ids=[gpu_nb],
find_unused_parameters=True)
# continue training routine
self.__run_pretrain_routine(model)
@@ -642,7 +651,8 @@ class Trainer(TrainerIO):
# init progbar when requested
if self.progress_bar:
self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position)
self.prog_bar = tqdm.tqdm(range(self.total_batches),
position=self.process_position)
for batch_nb, data_batch in enumerate(self.tng_dataloader):
self.batch_nb = batch_nb
@@ -651,7 +661,8 @@ class Trainer(TrainerIO):
model = self.__get_model()
model.global_step = self.global_step
# stop when the flag is changed or we've gone past the amount requested in the batches
# stop when the flag is changed or we've gone past the amount
# requested in the batches
self.total_batch_nb += 1
met_batch_limit = batch_nb > self.nb_tng_batches
if met_batch_limit:
@@ -698,7 +709,8 @@ class Trainer(TrainerIO):
model.on_tng_metrics(metrics)
# log metrics
scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist())
scalar_metrics = self.__metrics_to_scalars(
metrics, blacklist=self.__log_vals_blacklist())
if self.proc_rank == 0:
self.experiment.log(scalar_metrics, global_step=self.global_step)
self.experiment.save()
@@ -720,7 +732,8 @@ class Trainer(TrainerIO):
# early stopping
met_min_epochs = epoch_nb > self.min_nb_epochs
if self.enable_early_stop and met_min_epochs:
should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic)
should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb,
logs=self.__tng_tqdm_dic)
# stop training
stop = should_stop and met_min_epochs
@@ -828,7 +841,8 @@ class Trainer(TrainerIO):
# clear gradients
optimizer.zero_grad()
# queuing loss across batches blows it up proportionally... divide out the number accumulated
# queuing loss across batches blows it up proportionally...
# divide out the number accumulated
self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches
# track loss
@@ -885,4 +899,5 @@ class Trainer(TrainerIO):
# model checkpointing
if self.proc_rank == 0 and self.checkpoint_callback is not None:
print('save callback...')
self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic)
self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch,
logs=self.__tng_tqdm_dic)

View File

@@ -17,11 +17,13 @@ class GradInformation(nn.Module):
total_norm += param_norm ** norm_type
norm = param_norm ** (1 / norm_type)
results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3)
grad = round(norm.data.cpu().numpy().flatten()[0], 3)
results['grad_{}_norm_{}'.format(norm_type, i)] = grad
except Exception:
# this param had no grad
pass
total_norm = total_norm ** (1. / norm_type)
results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3)
grad = round(total_norm.data.cpu().numpy().flatten()[0], 3)
results['grad_{}_norm_total'.format(norm_type)] = grad
return results
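The loop above performs the standard p-norm aggregation, total = (sum_i ||g_i||^p)^(1/p), over per-parameter gradients. A compact standalone equivalent for reference (a sketch, not the class method itself):

```python
import torch

def total_grad_norm(model, norm_type=2.0):
    # gather each parameter's gradient norm, then reduce to a single p-norm
    norms = [p.grad.data.norm(norm_type)
             for p in model.parameters() if p.grad is not None]
    total = torch.stack(norms).pow(norm_type).sum().pow(1.0 / norm_type)
    return round(total.item(), 3)
```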

View File

@@ -3,7 +3,8 @@ import re
import torch
from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
from ..pt_overrides.override_data_parallel import (
LightningDistributedDataParallel, LightningDataParallel)
class ModelIO(object):
@@ -45,7 +46,8 @@ class ModelIO(object):
class TrainerIO(object):
def __get_model(self):
is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel
is_dp_module = isinstance(self.model, (LightningDistributedDataParallel,
LightningDataParallel))
model = self.model.module if is_dp_module else self.model
return model
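Note the rewrite above is not purely cosmetic: isinstance also accepts subclasses, while the old type() comparison matched exact types only. A minimal illustration with stand-in classes (not the real Lightning wrappers):

```python
class Wrapper:
    pass

class SubWrapper(Wrapper):
    pass

obj = SubWrapper()
print(type(obj) is Wrapper)         # False: exact-type check misses subclasses
print(isinstance(obj, (Wrapper,)))  # True: subclasses are accepted
```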

View File

@@ -48,11 +48,13 @@ class LightningTestModel(LightningModule):
Layout model
:return:
"""
self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
out_features=self.hparams.hidden_dim)
self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)
self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
out_features=self.hparams.out_features)
# ---------------------
# TRAINING
@@ -191,8 +193,10 @@ class LightningTestModel(LightningModule):
def __dataloader(self, train):
# init data generators
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.5,), (1.0,))])
dataset = MNIST(root=self.hparams.data_root, train=train,
transform=transform, download=True)
# when using multi-node we need to add the datasampler
train_sampler = None
@@ -251,11 +255,15 @@ class LightningTestModel(LightningModule):
parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
# training params (opt)
parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
options=[0.0001, 0.0005, 0.001, 0.005],
tunable=False)
parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
parser.opt_list('--optimizer_name', default='adam', type=str,
options=['adam'], tunable=False)
# if using 2 nodes with 8 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
help='batch size will be divided over all the gpus being used across all nodes')
# if using 2 nodes with 8 gpus each the batch size here
# (256) will be 256 / (2*8) = 16 per gpu
parser.opt_list('--batch_size', default=256 * 8, type=int,
options=[32, 64, 128, 256], tunable=False,
help='batch size will be divided over all gpus being used across all nodes')
return parser

View File

@@ -9,29 +9,40 @@ import os
def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None):
# tng, test, val check intervals
parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also')
parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs')
parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true',
help='true = run test set also')
parser.add_argument('--check_val_every_n_epoch', default=1, type=int,
help='check val every n epochs')
parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False,
help='accumulates gradients k times before applying update. Simulates huge batch size')
help='accumulates gradients k times before applying update.'
' Simulates huge batch size')
parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs')
parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs')
parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check')
parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check')
parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check')
parser.add_argument('--train_percent_check', default=1.0, type=float,
help='how much of tng set to check')
parser.add_argument('--val_percent_check', default=1.0, type=float,
help='how much of val set to check')
parser.add_argument('--test_percent_check', default=1.0, type=float,
help='how much of test set to check')
parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val')
parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves')
parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches')
parser.add_argument('--val_check_interval', default=0.95, type=float,
help='how much within 1 epoch to check val')
parser.add_argument('--log_save_interval', default=100, type=int,
help='how many batches between log saves')
parser.add_argument('--add_log_row_interval', default=100, type=int,
help='add log every k batches')
# early stopping
parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false')
parser.add_argument('--early_stop_metric', default='val_acc', type=str)
parser.add_argument('--early_stop_mode', default='min', type=str)
parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop')
parser.add_argument('--early_stop_patience', default=3, type=int,
help='number of epochs until stop')
# gradient handling
parser.add_argument('--gradient_clip', default=-1, type=int)
parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm')
parser.add_argument('--track_grad_norm', default=-1, type=int,
help='if > 0, will track this grad norm')
# model saving
parser.add_argument('--model_save_path', default=root_dir + '/model_weights')
@@ -47,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None
# test_tube settings
parser.add_argument('-en', '--tt_name', default='pt_test')
parser.add_argument('-td', '--tt_description', default='pytorch lightning test')
parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), help='logging dir')
parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'),
help='logging dir')
parser.add_argument('--enable_single_run', dest='single_run', action='store_true')
parser.add_argument('--nb_hopt_trials', default=1, type=int)
parser.add_argument('--log_stdout', dest='log_stdout', action='store_true')
@@ -65,17 +77,23 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None
# FAST training
# use these settings to make sure network has no bugs without running a full dataset
parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step')
parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar')
parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none')
parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true',
help='runs validation after 1 tng step')
parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true',
help='false removes the prog bar')
parser.add_argument('--overfit', default=-1, type=float,
help='% of dataset to use with this option. float, or -1 for none')
# debug args
if rand_seed is not None:
parser.add_argument('--random_seed', default=rand_seed, type=int)
parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster')
parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube')
parser.add_argument('--local', dest='local', action='store_true', help='enables local tng')
parser.add_argument('--interactive', dest='interactive', action='store_true',
help='runs on gpu without cluster')
parser.add_argument('--debug', dest='debug', action='store_true',
help='enables/disables test tube')
parser.add_argument('--local', dest='local', action='store_true',
help='enables local tng')
# optimizer
parser.add_argument('--lr_scheduler_milestones', default=None, type=str)
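A sketch of how this helper is typically wired up, assuming test_tube's HyperOptArgumentParser as used elsewhere in the repo (the root_dir value is hypothetical):

```python
from test_tube import HyperOptArgumentParser

parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
add_default_args(parent_parser, root_dir='/tmp/lightning', rand_seed=2334)
hparams = parent_parser.parse_args()
print(hparams.max_nb_epochs)  # 200 by default
```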

View File

@@ -107,7 +107,8 @@ def load_model(exp, save_dir):
checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x]
weights_dir = os.path.join(save_dir, checkpoints[0])
trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True)
trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir,
tags_csv=tags_path, on_gpu=True)
assert trained_model is not None, 'loading model failed'

View File

@@ -30,10 +30,12 @@ def test_amp_gpu_ddp():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_amp_gpu_ddp cannot run.'
' Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_amp_gpu_ddp cannot run.'
' Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
@@ -105,7 +107,8 @@ def test_cpu_slurm_save_load():
# wipe-out trainer and model
# retrain with not much data... this simulates picking training back up after slurm
# we want to see if the weights come back correctly
continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number)
continue_tng_hparams = get_hparams(continue_training=True,
hpc_exp_number=cluster_a.hpc_exp_number)
trainer_options = dict(
max_nb_epochs=1,
cluster=SlurmCluster(continue_tng_hparams),
@@ -219,7 +222,8 @@ def test_model_saving_loading():
# load new model
tags_path = exp.get_data_path(exp.name, exp.version)
tags_path = os.path.join(tags_path, 'meta_tags.csv')
model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False)
model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
tags_csv=tags_path, on_gpu=False)
model_2.eval()
# make prediction
@@ -244,10 +248,12 @@ def test_amp_gpu_ddp_slurm_managed():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_amp_gpu_ddp cannot run.'
' Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_amp_gpu_ddp cannot run.'
' Rerun on a node with 2+ GPUs to run this test')
return
# simulate setting slurm flags
@@ -411,7 +417,8 @@ def test_single_gpu_model():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_single_gpu_model cannot run.'
' Rerun on a GPU node to run this test')
return
model, hparams = get_model()
@@ -432,10 +439,12 @@ def test_multi_gpu_model_dp():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_multi_gpu_model_dp cannot run.'
' Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_multi_gpu_model_dp cannot run.'
' Rerun on a node with 2+ GPUs to run this test')
return
model, hparams = get_model()
trainer_options = dict(
@@ -458,10 +467,12 @@ def test_amp_gpu_dp():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_amp_gpu_dp cannot run.'
' Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_amp_gpu_dp cannot run.'
' Rerun on a node with 2+ GPUs to run this test')
return
model, hparams = get_model()
trainer_options = dict(
@@ -480,10 +491,12 @@ def test_multi_gpu_model_ddp():
:return:
"""
if not torch.cuda.is_available():
warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test')
warnings.warn('test_multi_gpu_model_ddp cannot run.'
' Rerun on a GPU node to run this test')
return
if not torch.cuda.device_count() > 1:
warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
warnings.warn('test_multi_gpu_model_ddp cannot run.'
' Rerun on a node with 2+ GPUs to run this test')
return
os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
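The hardware guard repeated across these tests could be factored into a single helper; a sketch for reference (the tests themselves keep it inline):

```python
import warnings
import torch

def can_run_gpu_test(test_name, min_gpus=1):
    # warn and skip when the node lacks the required GPUs
    if torch.cuda.device_count() < min_gpus:
        warnings.warn('%s cannot run. Rerun on a node with %d+ GPUs.'
                      % (test_name, min_gpus))
        return False
    return True
```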