update by flake8
parent 4e0b9c50e7
commit d9bfe964f9
@@ -18,7 +18,7 @@ references:
 check-manifest --ignore tox.ini
 python setup.py check -m -s
 coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules
-flake8 . --max-line-length=100
+flake8 . --max-line-length=120
 codecov

 jobs:
@@ -8,6 +8,7 @@
 [![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning)
+[![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning)
 [![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning)
 [![CircleCI](https://circleci.com/gh/Borda/pytorch-lightning.svg?style=svg)](https://circleci.com/gh/Borda/pytorch-lightning)
 [![Build status](https://ci.appveyor.com/api/projects/status/rum89d7hq8l1kfye?svg=true)](https://ci.appveyor.com/project/Borda/pytorch-lightning)
 [![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning)
 [![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning)
@@ -47,11 +47,13 @@ class LightningTemplateModel(LightningModule):
 Layout model
 :return:
 """
-self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
+self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
+out_features=self.hparams.hidden_dim)
 self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
 self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)

-self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
+self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
+out_features=self.hparams.out_features)

 # ---------------------
 # TRAINING
@@ -171,8 +173,10 @@ class LightningTemplateModel(LightningModule):

 def __dataloader(self, train):
 # init data generators
-transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
-dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
+transform = transforms.Compose([transforms.ToTensor(),
+transforms.Normalize((0.5,), (1.0,))])
+dataset = MNIST(root=self.hparams.data_root, train=train,
+transform=transform, download=True)

 # when using multi-node we need to add the datasampler
 train_sampler = None
@@ -234,11 +238,15 @@ class LightningTemplateModel(LightningModule):
 parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

 # training params (opt)
-parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
+parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+options=[0.0001, 0.0005, 0.001, 0.005],
 tunable=False)
-parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
+parser.opt_list('--optimizer_name', default='adam', type=str,
+options=['adam'], tunable=False)

-# if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
-parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
-help='batch size will be divided over all the gpus being used across all nodes')
+# if using 2 nodes with 4 gpus each the batch size here
+# (256) will be 256 / (2*8) = 16 per gpu
+parser.opt_list('--batch_size', default=256 * 8, type=int,
+options=[32, 64, 128, 256], tunable=False,
+help='batch size will be divided over all gpus being used across all nodes')
 return parser
@@ -10,12 +10,12 @@ from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main_local(hparams):
 main(hparams, None, None)
@@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams):
 cluster.add_command('source activate lightning')

 # run only on 32GB voltas
-cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus')
-cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus')
+cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
+comment='use 32gb gpus')
+cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition,
+comment='use 32gb gpus')

 # run hopt
 # creates and submits jobs to slurm
@@ -140,15 +142,23 @@ if __name__ == '__main__':
 parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')

 # TODO: make 1 param
-parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node')
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node')
+parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
+help='how many gpus to use in a node')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node')

-parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
-parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run')
+parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
+help='how many nodes to use in a cluster')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir,
+help='where to save slurm meta')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')
+parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
+help='how many grid search trials to run')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -90,9 +90,12 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--test_tube_save_path', type=str,
+default=test_tube_dir, help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str,
+default=checkpoint_dir, help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str,
+default='pt_lightning_exp_a', help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -92,10 +92,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+'value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -92,10 +92,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+' value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -9,12 +9,12 @@ from test_tube import HyperOptArgumentParser, Experiment
 from pytorch_lightning.models.trainer import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

+from .lightning_module_template import LightningTemplateModel
+
 SEED = 2334
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-from .lightning_module_template import LightningTemplateModel
-

 def main(hparams):
 """
@@ -91,10 +91,15 @@ if __name__ == '__main__':
 parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

 # gpu args
-parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node')
-parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs')
-parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model')
-parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name')
+parent_parser.add_argument('--gpus', type=str, default='-1',
+help='how many gpus to use in the node.'
+' value -1 uses all the gpus on the node')
+parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
+help='where to save logs')
+parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir,
+help='where to save model')
+parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
+help='test tube exp name')

 # allow model to overwrite or extend args
 parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
@@ -15,7 +15,8 @@ import torch.distributed as dist

 from ..root_module.memory import get_gpu_memory_map
 from ..root_module.model_saving import TrainerIO
-from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
+from ..pt_overrides.override_data_parallel import (
+LightningDistributedDataParallel, LightningDataParallel)
 from ..utilities.debugging import MisconfigurationException

 try:
@@ -64,17 +65,20 @@ class Trainer(TrainerIO):
 check_val_every_n_epoch=1,
 fast_dev_run=False,
 accumulate_grad_batches=1,
-max_nb_epochs=1000, min_nb_epochs=1,
-train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0,
+max_nb_epochs=1000,
+min_nb_epochs=1,
+train_percent_check=1.0,
+val_percent_check=1.0,
+test_percent_check=1.0,
 val_check_interval=0.95,
-log_save_interval=100, add_log_row_interval=10,
+log_save_interval=100,
+add_log_row_interval=10,
 distributed_backend='dp',
 use_amp=False,
 print_nan_grads=False,
 print_weights_summary=True,
 amp_level='O2',
 nb_sanity_val_steps=5):
-
 """

 :param experiment: Test-tube experiment
@@ -100,16 +104,15 @@ class Trainer(TrainerIO):
 :param val_check_interval:
 :param log_save_interval:
 :param add_log_row_interval:
-:param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel
+:param distributed_backend:
+'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel
 :param use_amp:
 :param print_nan_grads:
 :param print_weights_summary:
 :param amp_level:
 :param nb_sanity_val_steps:
 """
-
 # Transfer params
-
 self.nb_gpu_nodes = nb_gpu_nodes
 self.gradient_clip = gradient_clip
 self.check_val_every_n_epoch = check_val_every_n_epoch
@@ -171,13 +174,13 @@ class Trainer(TrainerIO):

 # set the correct cuda visible devices (using pci order)
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids])
+os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in
+self.data_parallel_device_ids])
 print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"])

 # make DP and DDP mutually exclusive
 # single GPU will also use DP with devices=[0]
-have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0
-if have_gpus:
+if self.data_parallel_device_ids:
 self.use_dp = distributed_backend == 'dp'
 self.use_ddp = distributed_backend == 'ddp'
@@ -224,7 +227,8 @@ class Trainer(TrainerIO):
 self.val_dataloader = None

 # how much of the data to use
-self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct)
+self.__determine_data_use_amount(train_percent_check, val_percent_check,
+test_percent_check, overfit_pct)
 print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))

 # 16 bit mixed precision training using apex
@@ -246,7 +250,8 @@ class Trainer(TrainerIO):
 def data_parallel(self):
 return self.use_dp or self.use_ddp

-def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct):
+def __determine_data_use_amount(self, train_percent_check, val_percent_check,
+test_percent_check, overfit_pct):
 """
 Use less data for debugging purposes
 """
@@ -388,17 +393,18 @@ class Trainer(TrainerIO):

 if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler):
 msg = """
-when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler).
+when using multiple gpus and multiple nodes you must pass
+a DistributedSampler to DataLoader(sampler).

-ie: this:
-dataset = myDataset()
-dataloader = Dataloader(dataset)
+ie: this:
+dataset = myDataset()
+dataloader = Dataloader(dataset)

-becomes:
-dataset = myDataset()
-dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-dataloader = Dataloader(dataset, sampler=dist_sampler)
-"""
+becomes:
+dataset = myDataset()
+dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+dataloader = Dataloader(dataset, sampler=dist_sampler)
+"""
 raise MisconfigurationException(msg)

 # -----------------------------
@@ -408,7 +414,8 @@ class Trainer(TrainerIO):

 # when using multi-node or DDP within a node start each module in a separate process
 if self.use_ddp:
-# must copy only the meta of the exp so it survives pickle/unpickle when going to new process
+# must copy only the meta of the exp so it survives pickle/unpickle
+# when going to new process
 self.experiment = self.experiment.get_meta_copy()

 if self.is_slurm_managing_tasks:
@@ -416,11 +423,11 @@ class Trainer(TrainerIO):
 self.ddp_train(task, model)
 else:
 msg = """
-You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
-We will launch %(nb_gpus)s processes for you.
-We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
-If you're not using SLURM, ignore this message!
-""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
+You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
+We will launch %(nb_gpus)s processes for you.
+We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
+If you're not using SLURM, ignore this message!
+""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
 warnings.warn(msg)
 mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, ))
@@ -433,7 +440,8 @@ class Trainer(TrainerIO):
 else:
 # run through amp wrapper
 if self.use_amp:
-raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')
+raise MisconfigurationException('amp + cpu is not supported.'
+' Please use a GPU option')

 # CHOOSE OPTIMIZER
 # allow for lr schedulers as well
@@ -461,10 +469,10 @@ class Trainer(TrainerIO):
 # https://github.com/NVIDIA/apex/issues/227
 if self.use_dp and self.use_amp:
 m = """
-Amp level %r with DataParallel is not supported.
-See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
-We recommend you switch to ddp if you want to use amp
-""" % self.amp_level
+Amp level %r with DataParallel is not supported.
+See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
+We recommend you switch to ddp if you want to use amp
+""" % self.amp_level
 raise MisconfigurationException(m)

 model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)
@@ -527,7 +535,8 @@ class Trainer(TrainerIO):
 )
 self.optimizers = optimizers

-model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True)
+model = LightningDistributedDataParallel(model, device_ids=[gpu_nb],
+find_unused_parameters=True)

 # continue training routine
 self.__run_pretrain_routine(model)
@@ -642,7 +651,8 @@ class Trainer(TrainerIO):

 # init progbar when requested
 if self.progress_bar:
-self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position)
+self.prog_bar = tqdm.tqdm(range(self.total_batches),
+position=self.process_position)

 for batch_nb, data_batch in enumerate(self.tng_dataloader):
 self.batch_nb = batch_nb
@@ -651,7 +661,8 @@ class Trainer(TrainerIO):
 model = self.__get_model()
 model.global_step = self.global_step

-# stop when the flag is changed or we've gone past the amount requested in the batches
+# stop when the flag is changed or we've gone past the amount
+# requested in the batches
 self.total_batch_nb += 1
 met_batch_limit = batch_nb > self.nb_tng_batches
 if met_batch_limit:
@@ -698,7 +709,8 @@ class Trainer(TrainerIO):
 model.on_tng_metrics(metrics)

 # log metrics
-scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist())
+scalar_metrics = self.__metrics_to_scalars(
+metrics, blacklist=self.__log_vals_blacklist())
 if self.proc_rank == 0:
 self.experiment.log(scalar_metrics, global_step=self.global_step)
 self.experiment.save()
@@ -720,7 +732,8 @@ class Trainer(TrainerIO):
 # early stopping
 met_min_epochs = epoch_nb > self.min_nb_epochs
 if self.enable_early_stop and met_min_epochs:
-should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic)
+should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb,
+logs=self.__tng_tqdm_dic)

 # stop training
 stop = should_stop and met_min_epochs
@@ -828,7 +841,8 @@ class Trainer(TrainerIO):
 # clear gradients
 optimizer.zero_grad()

-# queuing loss across batches blows it up proportionally... divide out the number accumulated
+# queuing loss across batches blows it up proportionally...
+# divide out the number accumulated
 self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches

 # track loss
@@ -885,4 +899,5 @@ class Trainer(TrainerIO):
 # model checkpointing
 if self.proc_rank == 0 and self.checkpoint_callback is not None:
 print('save callback...')
-self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic)
+self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch,
+logs=self.__tng_tqdm_dic)
@@ -17,11 +17,13 @@ class GradInformation(nn.Module):
 total_norm += param_norm ** norm_type
 norm = param_norm ** (1 / norm_type)

-results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3)
+grad = round(norm.data.cpu().numpy().flatten()[0], 3)
+results['grad_{}_norm_{}'.format(norm_type, i)] = grad
 except Exception:
 # this param had no grad
 pass

 total_norm = total_norm ** (1. / norm_type)
-results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3)
+grad = round(total_norm.data.cpu().numpy().flatten()[0], 3)
+results['grad_{}_norm_total'.format(norm_type)] = grad
 return results
@@ -3,7 +3,8 @@ import re

 import torch

-from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel
+from ..pt_overrides.override_data_parallel import (
+LightningDistributedDataParallel, LightningDataParallel)


 class ModelIO(object):
@@ -45,7 +46,8 @@ class ModelIO(object):
 class TrainerIO(object):

 def __get_model(self):
-is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel
+is_dp_module = isinstance(self.model, (LightningDistributedDataParallel,
+LightningDataParallel))
 model = self.model.module if is_dp_module else self.model
 return model
@@ -48,11 +48,13 @@ class LightningTestModel(LightningModule):
 Layout model
 :return:
 """
-self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim)
+self.c_d1 = nn.Linear(in_features=self.hparams.in_features,
+out_features=self.hparams.hidden_dim)
 self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim)
 self.c_d1_drop = nn.Dropout(self.hparams.drop_prob)

-self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features)
+self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim,
+out_features=self.hparams.out_features)

 # ---------------------
 # TRAINING
@@ -191,8 +193,10 @@ class LightningTestModel(LightningModule):

 def __dataloader(self, train):
 # init data generators
-transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
-dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True)
+transform = transforms.Compose([transforms.ToTensor(),
+transforms.Normalize((0.5,), (1.0,))])
+dataset = MNIST(root=self.hparams.data_root, train=train,
+transform=transform, download=True)

 # when using multi-node we need to add the datasampler
 train_sampler = None
@@ -251,11 +255,15 @@ class LightningTestModel(LightningModule):
 parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

 # training params (opt)
-parser.opt_list('--learning_rate', default=0.001 * 8, type=float, options=[0.0001, 0.0005, 0.001, 0.005],
+parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+options=[0.0001, 0.0005, 0.001, 0.005],
 tunable=False)
-parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False)
+parser.opt_list('--optimizer_name', default='adam', type=str,
+options=['adam'], tunable=False)

-# if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu
-parser.opt_list('--batch_size', default=256 * 8, type=int, options=[32, 64, 128, 256], tunable=False,
-help='batch size will be divided over all the gpus being used across all nodes')
+# if using 2 nodes with 4 gpus each the batch size here
+# (256) will be 256 / (2*8) = 16 per gpu
+parser.opt_list('--batch_size', default=256 * 8, type=int,
+options=[32, 64, 128, 256], tunable=False,
+help='batch size will be divided over all gpus being used across all nodes')
 return parser
@@ -9,29 +9,40 @@ import os
 def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None):

 # tng, test, val check intervals
-parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also')
-parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs')
+parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true',
+help='true = run test set also')
+parser.add_argument('--check_val_every_n_epoch', default=1, type=int,
+help='check val every n epochs')
 parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False,
-help='accumulates gradients k times before applying update. Simulates huge batch size')
+help='accumulates gradients k times before applying update.'
+' Simulates huge batch size')
 parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs')
 parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs')
-parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check')
-parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check')
-parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check')
+parser.add_argument('--train_percent_check', default=1.0, type=float,
+help='how much of tng set to check')
+parser.add_argument('--val_percent_check', default=1.0, type=float,
+help='how much of val set to check')
+parser.add_argument('--test_percent_check', default=1.0, type=float,
+help='how much of test set to check')

-parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val')
-parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves')
-parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches')
+parser.add_argument('--val_check_interval', default=0.95, type=float,
+help='how much within 1 epoch to check val')
+parser.add_argument('--log_save_interval', default=100, type=int,
+help='how many batches between log saves')
+parser.add_argument('--add_log_row_interval', default=100, type=int,
+help='add log every k batches')

 # early stopping
 parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false')
 parser.add_argument('--early_stop_metric', default='val_acc', type=str)
 parser.add_argument('--early_stop_mode', default='min', type=str)
-parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop')
+parser.add_argument('--early_stop_patience', default=3, type=int,
+help='number of epochs until stop')

 # gradient handling
 parser.add_argument('--gradient_clip', default=-1, type=int)
-parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm')
+parser.add_argument('--track_grad_norm', default=-1, type=int,
+help='if > 0, will track this grad norm')

 # model saving
 parser.add_argument('--model_save_path', default=root_dir + '/model_weights')
@@ -47,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None
 # test_tube settings
 parser.add_argument('-en', '--tt_name', default='pt_test')
 parser.add_argument('-td', '--tt_description', default='pytorch lightning test')
-parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), help='logging dir')
+parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'),
+help='logging dir')
 parser.add_argument('--enable_single_run', dest='single_run', action='store_true')
 parser.add_argument('--nb_hopt_trials', default=1, type=int)
 parser.add_argument('--log_stdout', dest='log_stdout', action='store_true')
@@ -65,17 +77,23 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None

 # FAST training
 # use these settings to make sure network has no bugs without running a full dataset
-parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step')
-parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar')
-parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none')
+parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true',
+help='runs validation after 1 tng step')
+parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true',
+help='false removes the prog bar')
+parser.add_argument('--overfit', default=-1, type=float,
+help='% of dataset to use with this option. float, or -1 for none')

 # debug args
 if rand_seed is not None:
 parser.add_argument('--random_seed', default=rand_seed, type=int)

-parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster')
-parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube')
-parser.add_argument('--local', dest='local', action='store_true', help='enables local tng')
+parser.add_argument('--interactive', dest='interactive', action='store_true',
+help='runs on gpu without cluster')
+parser.add_argument('--debug', dest='debug', action='store_true',
+help='enables/disables test tube')
+parser.add_argument('--local', dest='local', action='store_true',
+help='enables local tng')

 # optimizer
 parser.add_argument('--lr_scheduler_milestones', default=None, type=str)
@@ -107,7 +107,8 @@ def load_model(exp, save_dir):
 checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x]
 weights_dir = os.path.join(save_dir, checkpoints[0])

-trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True)
+trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir,
+tags_csv=tags_path, on_gpu=True)

 assert trained_model is not None, 'loading model failed'
@@ -30,10 +30,12 @@ def test_amp_gpu_ddp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+'Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+'Rerun on a node with 2+ GPUs to run this test')
 return

 os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])
@@ -105,7 +107,8 @@ def test_cpu_slurm_save_load():
 # wipe-out trainer and model
 # retrain with not much data... this simulates picking training back up after slurm
 # we want to see if the weights come back correctly
-continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number)
+continue_tng_hparams = get_hparams(continue_training=True,
+hpc_exp_number=cluster_a.hpc_exp_number)
 trainer_options = dict(
 max_nb_epochs=1,
 cluster=SlurmCluster(continue_tng_hparams),
@@ -219,7 +222,8 @@ def test_model_saving_loading():
 # load new model
 tags_path = exp.get_data_path(exp.name, exp.version)
 tags_path = os.path.join(tags_path, 'meta_tags.csv')
-model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False)
+model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
+tags_csv=tags_path, on_gpu=False)
 model_2.eval()

 # make prediction
@@ -244,10 +248,12 @@ def test_amp_gpu_ddp_slurm_managed():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_ddp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return

 # simulate setting slurm flags
@@ -411,7 +417,8 @@ def test_single_gpu_model():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_single_gpu_model cannot run.'
+' Rerun on a GPU node to run this test')
 return
 model, hparams = get_model()
@@ -432,10 +439,12 @@ def test_multi_gpu_model_dp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_multi_gpu_model_dp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_multi_gpu_model_dp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return
 model, hparams = get_model()
 trainer_options = dict(
@@ -458,10 +467,12 @@ def test_amp_gpu_dp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_amp_gpu_dp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_amp_gpu_dp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return
 model, hparams = get_model()
 trainer_options = dict(
@@ -480,10 +491,12 @@ def test_multi_gpu_model_ddp():
 :return:
 """
 if not torch.cuda.is_available():
-warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test')
+warnings.warn('test_multi_gpu_model_ddp cannot run.'
+' Rerun on a GPU node to run this test')
 return
 if not torch.cuda.device_count() > 1:
-warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test')
+warnings.warn('test_multi_gpu_model_ddp cannot run.'
+' Rerun on a node with 2+ GPUs to run this test')
 return

 os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0])