2019-07-09 00:12:27 +00:00
|
|
|
"""
|
2019-08-07 13:01:19 +00:00
|
|
|
The trainer handles all the logic for running a val loop, training loop, distributing, etc.
|
2019-07-09 00:12:27 +00:00
|
|
|
"""
|
2019-08-05 21:57:39 +00:00
|
|
|
|
2019-07-09 00:11:20 +00:00
|
|
|
import os
|
2019-07-20 12:53:36 +00:00
|
|
|
import re
|
2019-08-05 21:57:39 +00:00
|
|
|
import warnings
|
2019-07-09 00:11:20 +00:00
|
|
|
|
2019-08-05 08:52:09 +00:00
|
|
|
import numpy as np
|
|
|
|
import tqdm
|
2019-03-31 01:45:16 +00:00
|
|
|
import torch
|
2019-07-09 00:11:20 +00:00
|
|
|
from torch.utils.data.distributed import DistributedSampler
|
|
|
|
import torch.multiprocessing as mp
|
|
|
|
import torch.distributed as dist
|
|
|
|
|
2019-08-11 14:01:57 +00:00
|
|
|
from pytorch_lightning.root_module.root_module import LightningModule
|
2019-08-07 14:14:59 +00:00
|
|
|
from pytorch_lightning.root_module.memory import get_gpu_memory_map
|
|
|
|
from pytorch_lightning.root_module.model_saving import TrainerIO
|
|
|
|
from pytorch_lightning.pt_overrides.override_data_parallel import (
|
2019-08-06 10:08:31 +00:00
|
|
|
LightningDistributedDataParallel, LightningDataParallel)
|
2019-08-07 14:14:59 +00:00
|
|
|
from pytorch_lightning.utilities.debugging import MisconfigurationException
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2019-05-14 00:40:07 +00:00
|
|
|
try:
|
|
|
|
from apex import amp
|
|
|
|
APEX_AVAILABLE = True
|
2019-08-05 21:28:04 +00:00
|
|
|
except ImportError:
|
2019-05-14 00:40:07 +00:00
|
|
|
APEX_AVAILABLE = False
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2019-07-09 00:12:27 +00:00
|
|
|
|
2019-07-18 15:29:21 +00:00
|
|
|
def reduce_distributed_output(output, nb_gpus):
    """
    Average the per-GPU outputs gathered by DataParallel into single values.

    :param output: a tensor, or a (possibly nested) dict whose values are tensors
    :param nb_gpus: number of GPUs the output was gathered from
    :return: ``output`` unchanged when ``nb_gpus <= 1``; the mean when ``output`` is a
        tensor; otherwise the same dict, reduced in place
    """
    if nb_gpus <= 1:
        return output

    # when using DP, we get one output per gpu
    # average outputs and return
    if isinstance(output, torch.Tensor):
        return output.mean()

    for k in output:
        value = output[k]

        # recurse on nested dicts
        if isinstance(value, dict):
            output[k] = reduce_distributed_output(value, nb_gpus)

        # non-tensor values and 0-dim tensors have no per-gpu dimension:
        # calling .size(0) on them would raise, so leave them untouched
        elif not isinstance(value, torch.Tensor) or value.dim() == 0:
            continue

        # reduce only metrics that have the same nb of gpus
        elif value.size(0) == nb_gpus:
            output[k] = torch.mean(value)

    return output
|
|
|
|
|
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
class Trainer(TrainerIO):
|
|
|
|
|
|
|
|
def __init__(self,
             experiment=None,
             early_stop_callback=None,
             checkpoint_callback=None,
             gradient_clip=0,
             cluster=None,
             process_position=0,
             current_gpu_name=0,
             nb_gpu_nodes=1,
             gpus=None,
             progress_bar=True,
             overfit_pct=0.0,
             track_grad_norm=-1,
             check_val_every_n_epoch=1,
             fast_dev_run=False,
             accumulate_grad_batches=1,
             max_nb_epochs=1000,
             min_nb_epochs=1,
             train_percent_check=1.0,
             val_percent_check=1.0,
             test_percent_check=1.0,
             val_check_interval=0.95,
             log_save_interval=100,
             add_log_row_interval=10,
             distributed_backend='dp',
             use_amp=False,
             print_nan_grads=False,
             print_weights_summary=True,
             amp_level='O2',
             nb_sanity_val_steps=5):
    """
    Store the training configuration and work out which device strategy to use.

    :param experiment: Test-tube experiment
    :param early_stop_callback: from pytorch_lightning import EarlyStopping
    :param checkpoint_callback: from pytorch_lightning import Checkpoint
        (its ``save_function`` is pointed at ``self.save_checkpoint``)
    :param gradient_clip:
    :param cluster:
    :param process_position:
    :param current_gpu_name:
    :param nb_gpu_nodes: number of nodes; more than one node with 'dp' switches to 'ddp'
    :param gpus: list of device ids, or a string: '-1' for every visible device,
        otherwise comma-separated ids (e.g. '0,1'). None means no GPU.
    :param progress_bar:
    :param overfit_pct: when > 0, overrides train/val/test_percent_check
    :param track_grad_norm:
    :param check_val_every_n_epoch:
    :param fast_dev_run:
    :param accumulate_grad_batches:
    :param max_nb_epochs:
    :param min_nb_epochs:
    :param train_percent_check:
    :param val_percent_check:
    :param test_percent_check:
    :param val_check_interval:
    :param log_save_interval:
    :param add_log_row_interval:
    :param distributed_backend:
        'dp' to use DataParallel, 'ddp' to use DistributedDataParallel.
        Both are disabled when exactly one gpu is requested.
    :param use_amp: use apex 16 bit mixed precision (raises if apex is not installed)
    :param print_nan_grads:
    :param print_weights_summary:
    :param amp_level: apex opt_level
    :param nb_sanity_val_steps:
    :raises Exception: when ``gpus`` is neither a list nor a string
    :raises ModuleNotFoundError: when ``use_amp`` is set but apex is missing
    """
    # Transfer params
    self.nb_gpu_nodes = nb_gpu_nodes
    self.gradient_clip = gradient_clip
    self.check_val_every_n_epoch = check_val_every_n_epoch
    self.enable_early_stop = early_stop_callback is not None
    self.track_grad_norm = track_grad_norm
    self.fast_dev_run = fast_dev_run
    self.on_gpu = gpus is not None and torch.cuda.is_available()
    self.progress_bar = progress_bar
    self.experiment = experiment
    self.exp_save_path = None
    if self.experiment is not None:
        self.exp_save_path = experiment.get_data_path(experiment.name, experiment.version)
    self.cluster = cluster
    self.process_position = process_position
    self.current_gpu_name = current_gpu_name
    self.print_weights_summary = print_weights_summary
    self.checkpoint_callback = checkpoint_callback

    if self.checkpoint_callback is not None:
        self.checkpoint_callback.save_function = self.save_checkpoint

    self.early_stop = early_stop_callback
    self.model = None
    self.max_nb_epochs = max_nb_epochs
    self.accumulate_grad_batches = accumulate_grad_batches
    self.early_stop_callback = early_stop_callback
    self.min_nb_epochs = min_nb_epochs
    self.nb_sanity_val_steps = nb_sanity_val_steps
    self.lr_schedulers = []
    self.amp_level = amp_level
    self.print_nan_grads = print_nan_grads
    self.data_parallel_device_ids = None
    self.world_size = 1
    self.node_rank = 0
    self.use_ddp = False
    self.use_dp = False
    self.single_gpu = False

    # training bookeeping
    self.total_batch_nb = 0
    self.running_loss = []
    self.avg_loss = 0
    self.batch_nb = 0
    self.tqdm_metrics = {}
    self.nb_val_batches = None
    self.nb_tng_batches = None
    self.nb_test_batches = None

    # gpus come in as a string or a list.
    # if gpus = -1 then use all available devices
    # otherwise, split the string using commas
    if gpus is not None:
        if isinstance(gpus, list):
            self.data_parallel_device_ids = gpus
        elif isinstance(gpus, str):
            if gpus == '-1':
                self.data_parallel_device_ids = list(range(0, torch.cuda.device_count()))
            else:
                self.data_parallel_device_ids = [int(x.strip()) for x in gpus.split(',')]
        else:
            raise Exception('gpus has to be a string or list of ids')

        # set the correct cuda visible devices (using pci order)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in
                                                       self.data_parallel_device_ids])
        print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"])

    # make DP and DDP mutually exclusive
    requested_gpus = self.data_parallel_device_ids is not None
    if requested_gpus and len(self.data_parallel_device_ids) > 0:
        self.use_dp = distributed_backend == 'dp'
        self.use_ddp = distributed_backend == 'ddp'

        # use ddp automatically if nb_gpu_nodes > 1
        if nb_gpu_nodes > 1 and self.use_dp:  # pragma: no cover
            self.use_ddp = True
            self.use_dp = False
            w = 'DataParallel does not support nb_gpu_nodes > 1. ' \
                'Switching to DistributedDataParallel for you. ' \
                'To silence this warning set distributed_backend=ddp'
            warnings.warn(w)

    # remove dp and ddp when requesting single gpu
    if self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) == 1:
        self.use_ddp = False
        self.use_dp = False
        self.single_gpu = True

    # extract SLURM flag vars
    # whenever we have the correct number of tasks, we let slurm manage processes
    # otherwise we launch the required number of processes
    if self.use_ddp:
        self.nb_requested_gpus = len(self.data_parallel_device_ids) * self.nb_gpu_nodes
        self.nb_slurm_tasks = 0
        try:
            self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS'])
            self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus
        except (KeyError, ValueError):
            # variable missing or not an int: likely not on slurm,
            # so set the slurm managed flag to false
            self.is_slurm_managing_tasks = False

    # process info
    self.proc_rank = 0

    # training state
    self.optimizers = None
    self.prog_bar = None
    self.global_step = 0
    self.current_epoch = 0
    self.total_batches = 0

    # logging
    self.log_save_interval = log_save_interval
    self.val_check_interval = val_check_interval
    self.add_log_row_interval = add_log_row_interval

    # dataloaders
    self.tng_dataloader = None
    self.test_dataloader = None
    self.val_dataloader = None

    # how much of the data to use
    self.__determine_data_use_amount(train_percent_check, val_percent_check,
                                     test_percent_check, overfit_pct)
    print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))

    # 16 bit mixed precision training using apex
    self.use_amp = use_amp and APEX_AVAILABLE
    if self.use_amp:
        print('using 16bit precision')

    if use_amp and not APEX_AVAILABLE:  # pragma: no cover
        msg = """
        You set use_amp=True but do not have apex installed.
        Install apex first using this guide and rerun with use_amp=True:
        https://github.com/NVIDIA/apex#linux

        this run will NOT use 16 bit precision
        """
        raise ModuleNotFoundError(msg)
|
2019-07-09 00:00:43 +00:00
|
|
|
|
2019-08-07 10:55:05 +00:00
|
|
|
def restore_state_if_existing_checkpoint(self):
    """
    Restore trainer state and model from the newest epoch checkpoint, if any.

    Looks in ``self.checkpoint_callback.filepath`` for ``*.ckpt*`` files whose
    name contains ``epoch_<number>`` and calls ``self.restore`` on the one with
    the highest epoch. Files with ``hpc_`` in the name are ignored, and so are
    checkpoint files that carry no parsable epoch number.

    :return: None
    """
    # do nothing if there's no callback or no checkpoint dir
    if self.checkpoint_callback is None:
        return
    ckpt_dir = self.checkpoint_callback.filepath
    if not os.path.exists(ckpt_dir):
        return

    # find last epoch
    last_epoch = -1
    last_ckpt_name = None
    for name in os.listdir(ckpt_dir):
        # ignore hpc ckpts and anything that is not a checkpoint
        if 'hpc_' in name or '.ckpt' not in name:
            continue

        # a checkpoint without an 'epoch_' marker or without digits after it
        # cannot be ranked, so skip it instead of crashing
        pieces = name.split('epoch_')
        if len(pieces) < 2:
            continue
        digits = re.sub('[^0-9]', '', pieces[1])
        if not digits:
            continue
        epoch = int(digits)

        if epoch > last_epoch:
            last_epoch = epoch
            last_ckpt_name = name

    # restore last checkpoint
    if last_ckpt_name is not None:
        last_ckpt_path = os.path.join(ckpt_dir, last_ckpt_name)
        self.restore(last_ckpt_path, self.on_gpu)
        print(f'model and trainer restored from checkpoint: {last_ckpt_path}')
|
2019-08-07 10:55:05 +00:00
|
|
|
|
2019-07-18 15:08:48 +00:00
|
|
|
@property
def data_parallel(self):
    """
    Whether either multi-gpu backend (dp or ddp) is active.

    :return: ``self.use_dp`` when it is truthy, otherwise ``self.use_ddp``
    """
    if self.use_dp:
        return self.use_dp
    return self.use_ddp
|
|
|
|
|
2019-08-06 10:08:31 +00:00
|
|
|
def __determine_data_use_amount(self, train_percent_check, val_percent_check,
                                test_percent_check, overfit_pct):
    """
    Record which fraction of each data split to use (handy for debugging).

    A positive ``overfit_pct`` takes precedence and is applied to all three splits.
    """
    overfitting = overfit_pct > 0
    self.train_percent_check = overfit_pct if overfitting else train_percent_check
    self.val_percent_check = overfit_pct if overfitting else val_percent_check
    self.test_percent_check = overfit_pct if overfitting else test_percent_check
|
|
|
|
|
2019-07-12 16:42:17 +00:00
|
|
|
def __get_model(self):
    """
    Return the underlying model: ``self.model.module`` when a dp/ddp wrapper is
    in use, otherwise ``self.model`` itself.
    """
    if self.data_parallel:
        return self.model.module
    return self.model
|
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
def __is_function_implemented(self, f_name):
    """
    Tell whether the (unwrapped) model exposes a callable attribute ``f_name``.

    :param f_name: attribute name to look up on the model
    :return: True when the attribute exists and is callable
    """
    candidate = getattr(self.__get_model(), f_name, None)
    return callable(candidate)
|
|
|
|
|
2019-08-11 14:01:57 +00:00
|
|
|
def __is_overriden(self, f_name):
    """
    Tell whether the model provides its own implementation of ``f_name``.

    The model's method is compared with the one defined on ``LightningModule``
    rather than with the model's immediate parent class: with an intermediate
    base class that implements ``f_name``, comparing against the direct parent
    would wrongly report the method as not overridden.

    :param f_name: method name; presumably defined on LightningModule - verify for new hooks
    :return: True when the model's code object differs from LightningModule's
    """
    model = self.__get_model()

    # when code pointers are different, it was overriden
    is_overriden = getattr(model, f_name).__code__ is not getattr(LightningModule, f_name).__code__
    return is_overriden
|
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
@property
def __tng_tqdm_dic(self):
    """
    Build the dict of values shown next to the training progress bar.

    Order of keys: loss, epoch, batch number, experiment version (if any),
    user metrics from ``self.tqdm_metrics``, then the gpu name when on gpu.
    """
    shown = dict(
        tng_loss='{0:.3f}'.format(self.avg_loss),
        epoch=format(self.current_epoch),
        batch_nb=format(self.batch_nb),
    )

    exp = self.experiment
    if exp is not None:
        shown['v_nb'] = exp.version

    # user supplied metrics may overwrite the defaults above
    for key, value in self.tqdm_metrics.items():
        shown[key] = value

    if self.on_gpu:
        shown['gpu'] = format(self.current_gpu_name)

    return shown
|
|
|
|
|
2019-07-24 12:53:00 +00:00
|
|
|
@property
def tng_tqdm_dic(self):
    """
    Public read-only accessor for the tqdm metrics dict.

    :return: whatever the private ``__tng_tqdm_dic`` property produces
    """
    metrics = self.__tng_tqdm_dic
    return metrics
|
|
|
|
|
2019-07-09 00:13:40 +00:00
|
|
|
def __layout_bookeeping(self):
    """
    Compute how many batches each split contributes and when to run validation.

    Sets ``nb_tng_batches``, ``nb_val_batches``, ``nb_test_batches`` (each scaled
    by its ``*_percent_check``) and ``val_check_batch``.
    """
    # training batches
    tng_total = len(self.tng_dataloader)
    self.nb_tng_batches = int(tng_total * self.train_percent_check)

    # validation batches: val datasets could be none, 1 or 2+
    val_total = 0
    if self.val_dataloader is not None:
        val_total = sum(len(loader) for loader in self.val_dataloader)
    # never report fewer than one validation batch
    self.nb_val_batches = max(1, int(val_total * self.val_percent_check))

    # test batches
    test_total = 0 if self.test_dataloader is None else len(self.test_dataloader)
    self.nb_test_batches = int(test_total * self.test_percent_check)

    # batch index (within an epoch) at which validation is checked
    self.val_check_batch = int(self.nb_tng_batches * self.val_check_interval)
|
2019-03-31 01:45:16 +00:00
|
|
|
|
|
|
|
def __add_tqdm_metrics(self, metrics):
    """
    Merge ``metrics`` into ``self.tqdm_metrics`` for display in the progress bar.

    Tensor values (including Tensor subclasses such as ``nn.Parameter``) are
    converted to plain Python numbers with ``.item()``; other values are
    stored as they are.

    :param metrics: dict of metric name -> value
    """
    for k, v in metrics.items():
        # isinstance (not an exact type check) so Tensor subclasses are unwrapped too
        if isinstance(v, torch.Tensor):
            v = v.item()

        self.tqdm_metrics[k] = v
|
|
|
|
|
2019-08-13 15:37:37 +00:00
|
|
|
def __validation_forward(self, model, data_batch, batch_i, dataloader_i):
    """
    Run one validation step through whichever path matches the device setup.

    :param model: the (possibly dp/ddp wrapped) model
    :param data_batch: the batch to validate on
    :param batch_i: index of the batch within its dataloader
    :param dataloader_i: index of the val dataloader; only forwarded to the
        step when more than one val dataloader is configured
    :return: whatever the model returns for this step
    """
    # make dataloader_i arg in validation_step optional
    step_args = [data_batch, batch_i]
    if len(self.val_dataloader) > 1:
        step_args.append(dataloader_i)

    # ddp and dp both go through the wrapped model's call
    if self.use_ddp or self.use_dp:
        return model(*step_args)

    if self.single_gpu:
        # put inputs on gpu manually
        device_id = self.data_parallel_device_ids[0]
        step_args[0] = self.transfer_batch_to_gpu(data_batch, device_id)

    # single gpu and CPU both call validation_step directly
    return model.validation_step(*step_args)
|
|
|
|
|
2019-08-12 19:23:11 +00:00
|
|
|
def validate(self, model, dataloader, max_batches, dataloader_i):
    """
    Run validation code

    The model is switched to eval mode and gradients are disabled for the
    duration of the loop; train mode and gradients are re-enabled afterwards,
    even when a validation step raises.

    :param model: PT model
    :param dataloader: PT dataloader
    :param max_batches: Scalar, stop after this many batches (None for no limit)
    :param dataloader_i: index of ``dataloader`` among the val dataloaders
    :return: the result of the model's ``validation_end`` when it is overridden,
        otherwise an empty dict
    """
    # enable eval mode
    model.zero_grad()
    model.eval()

    # disable gradients to save memory
    torch.set_grad_enabled(False)

    try:
        # bookkeeping
        outputs = []

        # run validation
        for batch_i, data_batch in enumerate(dataloader):

            if data_batch is None:  # pragma: no cover
                continue

            # stop short when on fast dev run
            if max_batches is not None and batch_i >= max_batches:
                break

            # -----------------
            # RUN VALIDATION STEP
            # -----------------
            output = self.__validation_forward(model, data_batch, batch_i, dataloader_i)

            # track outputs for collation
            outputs.append(output)

            # batch done
            if self.progress_bar and self.prog_bar is not None:
                self.prog_bar.update(1)

        # give model a chance to do something with the outputs (and method defined)
        val_results = {}
        if self.__is_overriden('validation_end'):
            if self.data_parallel:
                val_results = model.module.validation_end(outputs)
            else:
                val_results = model.validation_end(outputs)
    finally:
        # enable train mode again
        model.train()

        # re-enable gradients; done in finally so a failing step does not
        # leave autograd globally switched off
        torch.set_grad_enabled(True)

    return val_results
|
|
|
|
|
2019-07-24 21:09:14 +00:00
|
|
|
def get_dataloaders(self, model):
    """
    Dataloaders are provided by the model

    Copies the train/test/val dataloaders from ``model`` onto the trainer,
    wraps a single val dataloader in a list, and under ddp warns when a
    dataloader's sampler is not a DistributedSampler.

    :param model: object exposing tng_dataloader, test_dataloader and val_dataloader
    :return: None
    """
    self.tng_dataloader = model.tng_dataloader

    self.test_dataloader = model.test_dataloader
    self.val_dataloader = model.val_dataloader

    # handle returning an actual dataloader instead of a list of loaders
    have_val_loaders = self.val_dataloader is not None
    if have_val_loaders and not isinstance(self.val_dataloader, list):
        self.val_dataloader = [self.val_dataloader]

    if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler):
        msg = """
        You're using multiple gpus and multiple nodes without using a DistributedSampler
        to assign a subset of your data to each process. To silence this warning, pass a
        DistributedSampler to your DataLoader.

        ie: this:
        dataset = myDataset()
        dataloader = Dataloader(dataset)

        becomes:
        dataset = myDataset()
        dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dataloader = Dataloader(dataset, sampler=dist_sampler)

        If you want each process to load the full dataset, ignore this warning.
        """
        warnings.warn(msg)

    # check each val dataloader's *sampler* (a dataloader itself is never a
    # DistributedSampler), and skip the check when there is no val dataloader
    if self.use_ddp and have_val_loaders and \
            not all(isinstance(dataloader.sampler, DistributedSampler)
                    for dataloader in self.val_dataloader):
        msg = """
        You're val_dataloader(s) are not all DistributedSamplers.
        You're using multiple gpus and multiple nodes without using a DistributedSampler
        to assign a subset of your data to each process. To silence this warning, pass a
        DistributedSampler to your DataLoader.

        ie: this:
        dataset = myDataset()
        dataloader = Dataloader(dataset)

        becomes:
        dataset = myDataset()
        dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        dataloader = Dataloader(dataset, sampler=dist_sampler)

        If you want each process to load the full dataset, ignore this warning.
        """
        warnings.warn(msg)
|
2019-07-08 23:39:59 +00:00
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
# -----------------------------
|
|
|
|
# MODEL TRAINING
|
|
|
|
# -----------------------------
|
|
|
|
def fit(self, model):
    """
    Train ``model`` with the strategy selected at construction time
    (ddp, dp, single gpu, or cpu).

    :param model: the model to train
    :return: 1 when finished (used for testing, or when callers need to know
        that training succeeded)
    :raises MisconfigurationException: when amp is requested on cpu
    """
    if self.use_ddp:
        # when using multi-node or DDP within a node start each module in a separate process

        # must copy only the meta of the exp so it survives pickle/unpickle
        # when going to new process
        if self.experiment is not None:
            self.experiment = self.experiment.get_meta_copy()

        if not self.is_slurm_managing_tasks:
            counts = {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks}
            notice = """
            You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks.
            We will launch %(nb_gpus)s processes for you.
            We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s
            If you're not using SLURM, ignore this message!
            """ % counts
            warnings.warn(notice)
            nb_procs = len(self.data_parallel_device_ids)
            mp.spawn(self.ddp_train, nprocs=nb_procs, args=(model, ))
        else:
            # slurm already started one task per gpu; run ours directly
            local_task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(local_task, model)

    elif self.use_dp:
        # dp option triggers training using DP module
        # easier to avoid NCCL issues
        self.__dp_train(model)

    elif self.single_gpu:
        self.__single_gpu_train(model)

    else:
        # ON CPU
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported.'
                                            ' Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        configured = model.configure_optimizers()
        self.optimizers = configured
        if len(configured) == 2 and type(configured[0]) is list:
            self.optimizers, self.lr_schedulers = configured

        self.__run_pretrain_routine(model)

    return 1
|
|
|
|
|
2019-08-07 17:39:40 +00:00
|
|
|
def __single_gpu_train(self, model):
    """
    Train on exactly one gpu, without any dp/ddp wrapper.

    :param model: the model to train; moved to the first requested device
    """
    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers = model.configure_optimizers()
    # only unpack when the first element is itself a list of optimizers, i.e.
    # the model returned (optimizers, lr_schedulers). A plain list of two
    # optimizers must be kept as is (same rule as the cpu branch of fit).
    if len(self.optimizers) == 2 and isinstance(self.optimizers[0], list):
        self.optimizers, self.lr_schedulers = self.optimizers

    model.cuda(self.data_parallel_device_ids[0])

    if self.use_amp:
        # let apex wrap the model and optimizers for mixed precision
        model, optimizers = amp.initialize(
            model, self.optimizers, opt_level=self.amp_level,
        )
        self.optimizers = optimizers

    self.__run_pretrain_routine(model)
|
|
|
|
|
2019-07-24 14:38:22 +00:00
|
|
|
def __dp_train(self, model):
    """
    Train with DataParallel across ``self.data_parallel_device_ids``.

    :param model: the model to train; moved to the first device and wrapped
        in LightningDataParallel
    :raises MisconfigurationException: when amp is requested together with dp
    """
    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers = model.configure_optimizers()
    # only unpack when the first element is itself a list of optimizers, i.e.
    # the model returned (optimizers, lr_schedulers). A plain list of two
    # optimizers must be kept as is (same rule as the cpu branch of fit).
    if len(self.optimizers) == 2 and isinstance(self.optimizers[0], list):
        self.optimizers, self.lr_schedulers = self.optimizers

    model.cuda(self.data_parallel_device_ids[0])

    # check for this bug (amp + dp + !01 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp:
        m = """
        Amp level %r with DataParallel is not supported.
        See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
        We recommend you switch to ddp if you want to use amp
        """ % self.amp_level
        raise MisconfigurationException(m)

    model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

    self.__run_pretrain_routine(model)
|
|
|
|
|
2019-07-24 14:51:35 +00:00
|
|
|
def ddp_train(self, gpu_nb, model):
    """
    Entry point into a DDP process

    Works out this process' rank, connects the process group, moves the model
    to its gpu, optionally wraps it with apex amp, wraps it in
    LightningDistributedDataParallel and continues with the pretrain routine.

    :param gpu_nb: index of the gpu (local to this node) this process drives
    :param model: the model to train
    :return: None
    """
    # node rank using relative slurm id
    # otherwise default to node rank 0
    try:
        node_id = os.environ['SLURM_NODEID']
        self.node_rank = int(node_id)
    except (KeyError, ValueError):
        self.node_rank = 0

    # determine which process we are and world size.
    # This must happen before the experiment is configured below: the
    # experiment's debug flag depends on the real rank of this process.
    self.proc_rank = self.node_rank * len(self.data_parallel_device_ids) + gpu_nb
    self.world_size = self.nb_gpu_nodes * len(self.data_parallel_device_ids)

    # recover original exp before went into process
    # init in write mode only on proc 0
    if self.experiment is not None:
        self.experiment.debug = self.proc_rank > 0
        self.experiment = self.experiment.get_non_ddp_exp()

    # show progbar only on prog_rank 0
    self.prog_bar = self.prog_bar and self.node_rank == 0 and gpu_nb == 0

    # let the exp know the rank to avoid overwriting logs
    if self.experiment is not None:
        self.experiment.rank = self.proc_rank

    # set up the process group using the root node's address
    self.__init_tcp_connection()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers = model.configure_optimizers()
    # only unpack when the first element is itself a list of optimizers, i.e.
    # the model returned (optimizers, lr_schedulers). A plain list of two
    # optimizers must be kept as is (same rule as the cpu branch of fit).
    if len(self.optimizers) == 2 and isinstance(self.optimizers[0], list):
        self.optimizers, self.lr_schedulers = self.optimizers

    # MODEL
    # copy model to each gpu
    torch.cuda.set_device(gpu_nb)
    model.cuda(gpu_nb)

    # AMP
    # run through amp wrapper before going to distributed DP
    if self.use_amp:
        model, optimizers = amp.initialize(
            model, self.optimizers, opt_level=self.amp_level,
        )
        self.optimizers = optimizers

    model = LightningDistributedDataParallel(model, device_ids=[gpu_nb],
                                             find_unused_parameters=True)

    # continue training routine
    self.__run_pretrain_routine(model)
|
|
|
|
|
2019-07-12 20:07:57 +00:00
|
|
|
def __init_tcp_connection(self):
    """
    Connect all procs in the world using the env:// init
    Use the first node as the root address

    MASTER_PORT is kept when it is already exported and defaults to 12910
    otherwise. MASTER_ADDR is always overwritten with the first entry of
    SLURM_NODELIST (passed through resolve_root_node_address), or with
    127.0.0.2 when SLURM_NODELIST is not set. The "nccl" process group is
    then initialised with self.proc_rank and self.world_size.
    :return:
    """
    # sets the appropriate port without clobbering one that is already exported
    os.environ.setdefault('MASTER_PORT', str(12910))

    # figure out the root node addr
    # only a missing variable is expected here, so only KeyError is handled
    try:
        root_node = os.environ['SLURM_NODELIST'].split(' ')[0]
    except KeyError:
        root_node = '127.0.0.2'

    root_node = self.resolve_root_node_address(root_node)
    os.environ['MASTER_ADDR'] = root_node

    dist.init_process_group("nccl", rank=self.proc_rank, world_size=self.world_size)
|
2019-07-11 18:35:41 +00:00
|
|
|
|
2019-07-24 22:53:12 +00:00
|
|
|
def resolve_root_node_address(self, root_node):
    """
    Turn a bracketed node-list entry into the name of its first node.

    'node[12,15]' -> 'node12', 'node[3-7]' -> 'node3'. A name without '['
    is returned unchanged.
    :param root_node: host name, optionally followed by a bracketed list/range
    :return: the host prefix followed by the first number inside the brackets
    """
    if '[' not in root_node:
        return root_node

    # split once so only the bracket contents are parsed; digits or dashes
    # that belong to the host prefix must not leak into the node number
    name, numbers = root_node.split('[', 1)

    # first element of the comma separated list, then the low end of a range
    number = numbers.split(',', 1)[0]
    if '-' in number:
        number = number.split('-')[0]

    # drop the closing bracket and anything else that is not a digit
    number = re.sub('[^0-9]', '', number)
    return name + number
|
|
|
|
|
2019-07-03 19:09:49 +00:00
|
|
|
def __run_pretrain_routine(self, model):
    """
    Wire the model and the trainer together, restore any saved state,
    run the sanity validation pass and then start the training loop.
    :param model: the model to train; when self.data_parallel is set this is
        the parallel wrapper and the underlying module is read from .module
    :return:
    """
    # hooks and convenience properties live on the underlying module,
    # not on the parallel wrapper
    ref_model = model.module if self.data_parallel else model

    # give model convenience properties
    ref_model.trainer = self

    # set local properties on the model
    ref_model.on_gpu = self.on_gpu

    # transfer data loaders from model
    self.get_dataloaders(ref_model)

    # init training constants
    self.__layout_bookeeping()

    # print model summary
    if self.proc_rank == 0 and self.print_weights_summary:
        ref_model.summarize()

    if self.experiment is not None:
        ref_model.experiment = self.experiment

        # save exp to get started (only the rank 0 process writes)
        if self.proc_rank == 0:
            self.experiment.save()

    # track model now.
    # if cluster resets state, the model will update with the saved weights
    self.model = model

    # restore training and model before hpc call
    self.restore_state_if_existing_checkpoint()

    # enable cluster checkpointing
    # also restores training state
    # hpc checkpoint overrides any other checkpoints loaded before
    if self.cluster is not None:  # pragma: no cover
        self.enable_auto_hpc_walltime_manager()

    # run tiny validation (if validation defined) to make sure program won't crash during val
    ref_model.on_sanity_check_start()
    if self.val_dataloader is not None:
        for ds_i, dataloader in enumerate(self.val_dataloader):
            self.validate(model, dataloader, self.nb_sanity_val_steps, ds_i)

    # ---------------------------
    # CORE TRAINING LOOP
    # ---------------------------
    self.__train()
|
|
|
|
|
|
|
|
def __train(self):
    """
    Run every epoch from self.current_epoch up to self.max_nb_epochs.

    Each epoch resets the per-epoch counters, optionally opens a progress
    bar, runs one training epoch, steps the LR schedulers and finally asks
    the early stop callback whether training should end.
    :return:
    """
    for epoch_nb in range(self.current_epoch, self.max_nb_epochs):
        # keep the model and the trainer in agreement on the running epoch
        current_model = self.__get_model()
        current_model.current_epoch = epoch_nb
        self.current_epoch = epoch_nb

        self.total_batches = self.nb_tng_batches + self.nb_val_batches
        self.batch_loss_value = 0  # accumulated grads

        # init progbar when requested
        if self.progress_bar:
            self.prog_bar = tqdm.tqdm(
                range(self.total_batches),
                position=self.process_position,
            )

        # -----------------
        # RUN TNG EPOCH
        # -----------------
        self.run_tng_epoch()

        # update LR schedulers
        if self.lr_schedulers is not None:
            for scheduler in self.lr_schedulers:
                scheduler.step()

        # early stopping
        # NOTE(review): with `>` the callback is first consulted at epoch
        # index min_nb_epochs + 1 - confirm this is the intended minimum
        met_min_epochs = epoch_nb > self.min_nb_epochs
        if not (self.enable_early_stop and met_min_epochs):
            continue

        should_stop = self.early_stop_callback.on_epoch_end(
            epoch=epoch_nb,
            logs=self.__tng_tqdm_dic,
        )

        # stop training
        if should_stop:
            return
|
|
|
|
|
2019-08-12 20:07:42 +00:00
|
|
|
def run_tng_epoch(self):
    """
    Run one pass over self.tng_dataloader.

    For every batch: run the training step, run validation when due, save
    the experiment when due and log metrics when due. The epoch ends early
    when the training step returns -1 or once self.nb_tng_batches batches
    have been processed. The on_epoch_start / on_epoch_end hooks are called
    when the model implements them.
    :return:
    """
    # before epoch hook
    if self.__is_function_implemented('on_epoch_start'):
        model = self.__get_model()
        model.on_epoch_start()

    # run epoch
    for batch_nb, data_batch in enumerate(self.tng_dataloader):
        # stop once the requested amount of batches has been processed.
        # batch_nb is zero based, so batch_nb == nb_tng_batches is already
        # one batch too many; check before touching any counter so a
        # skipped batch is not counted as a step
        if batch_nb >= self.nb_tng_batches:
            break

        self.batch_nb = batch_nb
        self.global_step += 1
        self.total_batch_nb += 1

        model = self.__get_model()
        model.global_step = self.global_step

        # ---------------
        # RUN TRAIN STEP
        # ---------------
        batch_result = self.__run_tng_batch(data_batch, batch_nb)
        early_stop_epoch = batch_result == -1

        # ---------------
        # RUN VAL STEP
        # ---------------
        is_val_check_batch = (batch_nb + 1) % self.val_check_batch == 0
        if self.fast_dev_run or is_val_check_batch or early_stop_epoch:
            self.__run_validation()

        # when batch should be saved
        if (batch_nb + 1) % self.log_save_interval == 0 or early_stop_epoch:
            if self.proc_rank == 0 and self.experiment is not None:
                self.experiment.save()

        # when metrics should be logged
        if batch_nb % self.add_log_row_interval == 0 or early_stop_epoch:
            model = self.__get_model()
            metrics = self.__tng_tqdm_dic

            # add gpu memory
            if self.on_gpu:
                metrics.update(get_gpu_memory_map())

            # add norms
            if self.track_grad_norm > 0:
                metrics.update(model.grad_norm(self.track_grad_norm))

            if self.__is_function_implemented('on_tng_metrics'):
                model.on_tng_metrics(metrics)

            # log metrics (only the rank 0 process writes)
            scalar_metrics = self.__metrics_to_scalars(
                metrics, blacklist=self.__log_vals_blacklist())
            if self.proc_rank == 0 and self.experiment is not None:
                self.experiment.log(scalar_metrics, global_step=self.global_step)
                self.experiment.save()

        # end epoch early
        if early_stop_epoch:
            break

    # epoch end hook
    if self.__is_function_implemented('on_epoch_end'):
        model = self.__get_model()
        model.on_epoch_end()
|
|
|
|
|
2019-08-08 14:59:16 +00:00
|
|
|
def __metrics_to_scalars(self, metrics, blacklist=frozenset()):
    """
    Convert a metrics dict into plain python floats.

    Tensors are unwrapped with .item(), nested dicts are converted
    recursively and every other value goes through float().
    :param metrics: dict mapping metric name to a tensor, a number or a
        nested dict of those
    :param blacklist: top-level keys to leave out of the result
    :return: new dict, the input is not modified
    """
    def to_scalar(value):
        if isinstance(value, torch.Tensor):
            value = value.item()

        # a nested dict has to stay a dict, float() cannot convert it
        if isinstance(value, dict):
            return {key: to_scalar(val) for key, val in value.items()}

        return float(value)

    # filter first so blacklisted values are never converted at all
    return {k: to_scalar(v) for k, v in metrics.items() if k not in blacklist}
|
|
|
|
|
|
|
|
def __log_vals_blacklist(self):
    """
    Names of the values lightning uses to maintain state; they are kept out of the logs.
    :return: a new set on every call
    """
    return {'batch_nb', 'v_nb', 'gpu'}
|
2019-04-23 12:57:58 +00:00
|
|
|
|
2019-08-15 13:39:09 +00:00
|
|
|
def transfer_batch_to_gpu(self, batch, gpu_id):
    """
    Recursively move every tensor found in batch to the given gpu.

    Lists and dicts are updated in place and returned, tuples are rebuilt
    (they are immutable) and any other value is returned untouched.
    :param batch: a tensor, or a list / tuple / dict nesting of tensors and
        arbitrary other values
    :param gpu_id: device index handed to Tensor.cuda
    :return: the batch with all tensors moved
    """
    # base case
    if isinstance(batch, torch.Tensor):
        return batch.cuda(gpu_id)

    # when list
    if isinstance(batch, list):
        for i, x in enumerate(batch):
            batch[i] = self.transfer_batch_to_gpu(x, gpu_id)
        return batch

    # when tuple
    if isinstance(batch, tuple):
        moved = [self.transfer_batch_to_gpu(x, gpu_id) for x in batch]
        # namedtuples take their fields as positional arguments
        if hasattr(batch, '_fields'):
            return type(batch)(*moved)
        return tuple(moved)

    # when dict (only existing keys are reassigned, so iterating is safe)
    if isinstance(batch, dict):
        for k, v in batch.items():
            batch[k] = self.transfer_batch_to_gpu(v, gpu_id)
        return batch

    # nothing matches, hand the value back instead of dropping it
    return batch
|
|
|
|
|
2019-08-13 13:32:45 +00:00
|
|
|
def __tng_forward(self, data_batch, batch_nb, opt_idx):
    """
    Handle forward for each training case (distributed, single gpu, etc...)
    :param data_batch: the batch to train on
    :param batch_nb: index of the batch within the epoch
    :param opt_idx: index of the optimizer in use; only forwarded when more
        than one optimizer is configured
    :return: tuple of (loss, dict of metrics for the progress bar)
    :raises RuntimeError: when the output is neither a tensor nor something
        holding a 'loss' entry
    """
    # ---------------
    # FORWARD
    # ---------------
    # enable not needing to add opt_idx to training_step
    args = [data_batch, batch_nb]
    if len(self.optimizers) > 1:
        args.append(opt_idx)

    if self.use_ddp or self.use_dp:
        # in both parallel modes the wrapped model is called directly
        output = self.model(*args)
    else:
        if self.single_gpu:
            gpu_id = self.data_parallel_device_ids[0]
            args[0] = self.transfer_batch_to_gpu(data_batch, gpu_id)
        output = self.model.training_step(*args)

    # ---------------
    # TQDM metrics
    # ---------------
    # best effort: an output without usable 'prog' metrics just shows none
    try:
        prog_output = output['prog']

        # reduce prog metrics for tqdm when using dp
        if self.use_dp:
            nb_gpus = len(self.data_parallel_device_ids)
            prog_output = reduce_distributed_output(prog_output, nb_gpus)

        model_specific_tqdm_metrics_dic = prog_output
    except Exception:
        model_specific_tqdm_metrics_dic = {}

    # ---------------
    # EXTRACT LOSS
    # ---------------
    # if output dict doesn't have the keyword loss
    # then assume the output=loss if scalar
    try:
        loss = output['loss']
    except Exception:
        if not isinstance(output, torch.Tensor):
            # fail with a clear message rather than on an unbound local
            raise RuntimeError(
                'No `loss` value in the dictionary returned from `model.training_step()`.'
            )
        loss = output

    # when using dp need to reduce the loss
    if self.use_dp:
        loss = reduce_distributed_output(loss, len(self.data_parallel_device_ids))

    return loss, model_specific_tqdm_metrics_dic
|
2019-05-14 00:40:07 +00:00
|
|
|
|
2019-08-13 13:32:45 +00:00
|
|
|
def __clip_gradients(self):
    """
    Clip the gradient norm of the model parameters to self.gradient_clip.
    Does nothing unless self.gradient_clip is greater than 0.
    :return:
    """
    if not self.gradient_clip > 0:
        return

    params = self.__get_model().parameters()
    torch.nn.utils.clip_grad_norm_(params, self.gradient_clip)
|
2019-07-21 22:23:48 +00:00
|
|
|
|
2019-08-13 13:32:45 +00:00
|
|
|
def __print_nan_grads(self):
    """
    Print the sum of every parameter gradient when self.print_nan_grads is
    set, so that a nan gradient shows up as a nan sum.
    :return:
    """
    if not self.print_nan_grads:
        return

    model = self.__get_model()
    for param in model.parameters():
        # parameters that did not take part in backward have no gradient
        if param.grad is None:
            continue
        print(param.grad.float().sum())
|
2019-05-16 19:58:06 +00:00
|
|
|
|
2019-08-13 13:32:45 +00:00
|
|
|
def __run_tng_batch(self, data_batch, batch_nb):
    """
    Run forward, backward and (when due) the optimizer step for one batch,
    once per configured optimizer.
    :param data_batch: the batch to train on; None is skipped
    :param batch_nb: index of the batch within the epoch
    :return: -1 when the on_batch_start hook returned -1, otherwise 0
    """
    # nothing to train on
    if data_batch is None:
        return 0

    # hook
    if self.__is_function_implemented('on_batch_start'):
        model_ref = self.__get_model()
        response = model_ref.on_batch_start(data_batch)

        # the hook vetoes this batch by returning -1
        if response == -1:
            return -1

    if self.progress_bar:
        self.prog_bar.update(1)

    # call training_step once per optimizer
    for opt_idx, optimizer in enumerate(self.optimizers):

        # forward pass
        loss, model_specific_tqdm_metrics = self.__tng_forward(data_batch, batch_nb, opt_idx)

        # track metrics
        self.__add_tqdm_metrics(model_specific_tqdm_metrics)

        # accumulate loss
        # (if accumulate_grad_batches = 1 no effect)
        loss = loss / self.accumulate_grad_batches

        # backward pass
        if self.use_amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # insert after step hook
        if self.__is_function_implemented('on_after_backward'):
            model_ref = self.__get_model()
            model_ref.on_after_backward()

        # nan grads
        self.__print_nan_grads()

        # track total loss for logging (avoid mem leaks)
        # .item() stores a python number rather than the tensor itself
        self.batch_loss_value += loss.item()

        # gradient update with accumulated gradients
        # only every accumulate_grad_batches-th batch triggers a step
        if (self.batch_nb + 1) % self.accumulate_grad_batches == 0:
            # clip gradients
            self.__clip_gradients()

            # calls .step(), .zero_grad()
            # override function to modify this behavior
            model = self.__get_model()
            model.optimizer_step(self.current_epoch, batch_nb, optimizer, opt_idx)

            # calculate running loss for display
            # avg_loss is the mean over the last 100 recorded values
            self.running_loss.append(self.batch_loss_value)
            self.batch_loss_value = 0
            self.avg_loss = np.mean(self.running_loss[-100:])

            # update progbar
            if self.progress_bar:
                # add model specific metrics
                tqdm_metrics = self.__tng_tqdm_dic
                self.prog_bar.set_postfix(**tqdm_metrics)

    # activate batch end hook
    if self.__is_function_implemented('on_batch_end'):
        model = self.__get_model()
        model.on_batch_end()

    return 0
|
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
def __run_validation(self):
    """
    Run the validation loop over every validation dataloader and then
    trigger the checkpoint callback.

    Skipped entirely on epochs that are not a multiple of
    self.check_val_every_n_epoch, unless self.fast_dev_run is set, in which
    case a single batch per dataloader is validated.
    :return:
    """
    # decide if can check epochs
    can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
    if self.fast_dev_run:
        print('skipping to check performance bc of --fast_dev_run')
    elif not can_check_epoch:
        return

    # validate only if model has validation_step defined
    if self.__is_overriden('validation_step'):
        # hook
        if self.__is_function_implemented('on_pre_performance_check'):
            self.__get_model().on_pre_performance_check()

        # a single batch in fast_dev_run, otherwise no limit is passed
        max_batches = 1 if self.fast_dev_run else None
        for ds_i, dataloader in enumerate(self.val_dataloader):
            val_out_metrics = self.validate(self.model, dataloader, max_batches, ds_i)
            self.__add_tqdm_metrics(val_out_metrics)

        # hook
        if self.__is_function_implemented('on_post_performance_check'):
            self.__get_model().on_post_performance_check()

        if self.progress_bar:
            # add model specific metrics
            self.prog_bar.set_postfix(**self.__tng_tqdm_dic)

    # model checkpointing (only the rank 0 process saves)
    if self.proc_rank == 0 and self.checkpoint_callback is not None:
        print('save callback...')
        self.checkpoint_callback.on_epoch_end(
            epoch=self.current_epoch,
            logs=self.__tng_tqdm_dic,
        )
|