# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
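
"""Trainer to automate training, validation, and testing of a LightningModule."""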

import os
import warnings
from typing import Dict, Iterable, List, Optional, Union

import torch
from torch.utils.data import DataLoader

from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint
from pytorch_lightning.core.datamodule import LightningDataModule
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.core.step_result import EvalResult
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.profiler import BaseProfiler
from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
from pytorch_lightning.trainer.configuration_validator import ConfigValidator
from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
from pytorch_lightning.trainer.states import TrainerState, trainer_state
from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.debugging import InternalDebugger
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.trainer.evaluation_loop import EvaluationLoop
from pytorch_lightning.trainer.training_loop import TrainLoop
from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector
from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
from pytorch_lightning.trainer.connectors.model_connector import ModelConnector
from pytorch_lightning.trainer.connectors.debugging_connector import DebuggingConnector
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector
from pytorch_lightning import _logger as log
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector
from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector
from pytorch_lightning.trainer.connectors.data_connector import DataConnector
from pytorch_lightning.utilities.model_utils import is_overridden
from pytorch_lightning.trainer import docstrings
from pytorch_lightning.trainer.properties import TrainerProperties

# warnings to ignore in trainer
warnings.filterwarnings(
    'ignore', message='torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead'
)
os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning'

try:
    from apex import amp
except ImportError:
    amp = None


class Trainer(
    TrainerProperties,
    TrainerCallbackHookMixin,
    TrainerModelHooksMixin,
    TrainerOptimizersMixin,
    TrainerLoggingMixin,
    TrainerTrainingTricksMixin,
    TrainerDataLoadingMixin,
):
    def __init__(
        self,
        logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
        checkpoint_callback: Union[ModelCheckpoint, bool] = True,
        early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
        callbacks: Optional[List[Callback]] = None,
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        process_position: int = 0,
        num_nodes: int = 1,
        num_processes: int = 1,
        gpus: Optional[Union[List[int], str, int]] = None,
        auto_select_gpus: bool = False,
        tpu_cores: Optional[Union[List[int], str, int]] = None,
        log_gpu_memory: Optional[str] = None,
        progress_bar_refresh_rate: int = 1,
        overfit_batches: Union[int, float] = 0.0,
        track_grad_norm: Union[int, float, str] = -1,
        check_val_every_n_epoch: int = 1,
        fast_dev_run: bool = False,
        accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
        max_epochs: int = 1000,
        min_epochs: int = 1,
        max_steps: Optional[int] = None,
        min_steps: Optional[int] = None,
        limit_train_batches: Union[int, float] = 1.0,
        limit_val_batches: Union[int, float] = 1.0,
        limit_test_batches: Union[int, float] = 1.0,
        val_check_interval: Union[int, float] = 1.0,
        log_save_interval: int = 100,
        row_log_interval: int = 50,
        distributed_backend: Optional[str] = None,
        sync_batchnorm: bool = False,
        precision: int = 32,
        weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT,
        weights_save_path: Optional[str] = None,
        num_sanity_val_steps: int = 2,
        truncated_bptt_steps: Optional[int] = None,
        resume_from_checkpoint: Optional[str] = None,
        profiler: Optional[Union[BaseProfiler, bool]] = None,
        benchmark: bool = False,
        deterministic: bool = False,
        reload_dataloaders_every_epoch: bool = False,
        auto_lr_find: Union[bool, str] = False,
        replace_sampler_ddp: bool = True,
        terminate_on_nan: bool = False,
        auto_scale_batch_size: Union[str, bool] = False,
        prepare_data_per_node: bool = True,
        amp_backend: str = 'native',
        amp_level: str = 'O2',  # backward compatible, todo: remove in v1.0.0
        overfit_pct: Optional[float] = None,  # backward compatible, todo: remove in v1.0.0
    ):
        super().__init__()

        # init connectors
        self.dev_debugger = InternalDebugger(self)
        self.config_validator = ConfigValidator(self)
        self.data_connector = DataConnector(self)
        self.optimizer_connector = OptimizerConnector(self)
        self.accelerator_connector = AcceleratorConnector(self)
        self.logger_connector = LoggerConnector(self)
        self.model_connector = ModelConnector(self)
        self.precision_connector = PrecisionConnector(self)
        self.callback_connector = CallbackConnector(self)
        self.debugging_connector = DebuggingConnector(self)
        self.training_tricks_connector = TrainingTricksConnector(self)
        self.profile_connector = ProfilerConnector(self)
        self.checkpoint_connector = CheckpointConnector(self)
        self.slurm_connector = SLURMConnector(self)
        self.tuner = Tuner(self)
        self.accelerator_backend = None
        self.evaluation_loop = EvaluationLoop(self)
        self.train_loop = TrainLoop(self)

        # training state
        self.weights_summary = weights_summary
        self.model = None
        self.shown_warnings = set()

        # init callbacks
        self.callback_connector.on_trainer_init(
            callbacks,
            early_stop_callback,
            checkpoint_callback,
            progress_bar_refresh_rate,
            process_position,
            default_root_dir,
            weights_save_path,
            resume_from_checkpoint
        )

        # hook
        self.on_init_start()

        # init optimizer + lr scheduler related flags
        self.optimizer_connector.on_trainer_init()

        # init data flags
        self.data_connector.on_trainer_init(
            check_val_every_n_epoch,
            reload_dataloaders_every_epoch,
            prepare_data_per_node
        )

        # init training tricks
        self.training_tricks_connector.on_trainer_init(
            gradient_clip_val,
            track_grad_norm,
            accumulate_grad_batches,
            truncated_bptt_steps,
            terminate_on_nan
        )

        # init accelerator related flags
        self.accelerator_connector.on_trainer_init(
            num_processes,
            tpu_cores,
            distributed_backend,
            auto_select_gpus,
            gpus,
            num_nodes,
            log_gpu_memory,
            sync_batchnorm,
            benchmark,
            replace_sampler_ddp,
            deterministic
        )

        # init train loop related flags
        self.train_loop.on_trainer_init(max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps)
        self.evaluation_loop.on_trainer_init()

        # configure tuner
        self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size)

        # configure profiler
        self.profile_connector.on_trainer_init(profiler)

        # init logger flags
        self.logger_connector.on_trainer_init(logger, log_save_interval, row_log_interval)

        # init debugging flags
        self.debugging_connector.on_init_start(
            overfit_pct,
            limit_train_batches,
            limit_val_batches,
            limit_test_batches,
            val_check_interval,
            overfit_batches,
            fast_dev_run
        )

        # set precision
        self.precision_connector.on_trainer_init(precision, amp_level, amp_backend)

        # Callback system
        self.on_init_end()

    def tune(
        self,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
        datamodule: Optional[LightningDataModule] = None,
    ):
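        """Run the enabled tuning routines (batch size scaling and learning rate finding) on the model.

        Takes the same dataloader/datamodule arguments as ``fit``; nothing runs unless
        ``auto_scale_batch_size`` or ``auto_lr_find`` was set on the Trainer.
        """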
        # TODO: temporary, need to decide if tune or separate object

        # setup data, etc...
        self.train_loop.setup_fit(model, train_dataloader, val_dataloaders, datamodule)

        # hook
        self.data_connector.prepare_data(model)

        # Run auto batch size scaling
        if self.auto_scale_batch_size:
            if isinstance(self.auto_scale_batch_size, bool):
                self.auto_scale_batch_size = 'power'
            self.tuner.scale_batch_size(
                model,
                mode=self.auto_scale_batch_size,
                train_dataloader=train_dataloader,
                val_dataloaders=val_dataloaders,
                datamodule=datamodule,
            )
            model.logger = self.logger  # reset logger binding

        # Run learning rate finder:
        if self.auto_lr_find:
            self.tuner.internal_find_lr(self, model)
            model.logger = self.logger  # reset logger binding

    # -----------------------------
    # MODEL TRAINING
    # -----------------------------
    def fit(
        self,
        model: LightningModule,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
        datamodule: Optional[LightningDataModule] = None,
    ):
        self._state = TrainerState.RUNNING

        # setup data, etc...
        self.train_loop.setup_fit(model, train_dataloader, val_dataloaders, datamodule)

        # hook
        self.call_hook('on_fit_start', model)

        # hook
        self.data_connector.prepare_data(model)

        # set testing if set in environ
        self.testing = os.environ.get('PL_TESTING_MODE', self.testing)

        # -------------------------
        # TRAIN
        # -------------------------
        self.accelerator_backend = self.accelerator_connector.select_accelerator()
        self.accelerator_backend.setup(model)
        results = self.accelerator_backend.train()
        self.accelerator_backend.teardown()

        # -------------------------
        # POST-Training
        # -------------------------
        # hook
        self.call_hook('on_fit_end')

        # hook
        self.teardown('fit')
        if self.is_function_implemented('teardown'):
            model.teardown('fit')

        # return 1 when finished
        # used for testing or when we need to know that training succeeded
        if self._state != TrainerState.INTERRUPTED:
            self._state = TrainerState.FINISHED
        return results or 1

    def train(self):
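        """Run the sanity check, then loop over epochs until ``max_epochs``/``max_steps`` or an
        early-stop signal ends training; a ``KeyboardInterrupt`` triggers a graceful shutdown."""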
        self.run_sanity_check(self.get_model())

        # enable train mode
        model = self.get_model()
        model.train()
        torch.set_grad_enabled(True)

        # reload data when needed
        self.train_loop.reset_train_val_dataloaders(model)

        # hook
        self.train_loop.on_train_start()

        try:
            # run all epochs
            for epoch in range(self.current_epoch, self.max_epochs):

                # reset train dataloader
                if self.reload_dataloaders_every_epoch:
                    self.reset_train_dataloader(model)

                # hook
                self.train_loop.on_train_epoch_start(epoch)

                # run train epoch
                self.train_loop.run_training_epoch()

                if self.max_steps and self.max_steps <= self.global_step:

                    # hook
                    self.train_loop.on_train_end()
                    return

                # update LR schedulers
                self.optimizer_connector.update_learning_rates(interval='epoch')

                # early stopping
                met_min_epochs = epoch >= self.min_epochs - 1
                met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

                if self.should_stop:
                    if met_min_epochs and met_min_steps:
                        self.train_loop.on_train_end()
                        return
                    else:
                        log.info('Trainer was signaled to stop but required minimum epochs'
                                 f' ({self.min_epochs}) or minimum steps ({self.min_steps}) have'
                                 ' not been met. Training will continue...')

            # hook
            self.train_loop.on_train_end()

        except KeyboardInterrupt:
            rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

            # user could press ctrl+c many times... only shutdown once
            if not self.interrupted:
                self.interrupted = True
                self._state = TrainerState.INTERRUPTED
                self.on_keyboard_interrupt()

            # hook
            self.train_loop.on_train_end()

    def run_evaluation(self, test_mode: bool = False, max_batches=None):
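        """Run a full evaluation pass (validation or test) over the evaluation dataloaders.

        Returns a tuple of ``(eval_loop_results, eval_results)``: the metrics logged for the epoch
        and the raw outputs of the evaluation epoch end.
        """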
        # bookkeeping
        self.evaluation_loop.testing = test_mode
        dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders(max_batches)
        if self.evaluation_loop.should_skip_evaluation(dataloaders, max_batches):
            return [], []

        # enable eval mode + no grads
        model = self.get_model()
        model.zero_grad()
        model.eval()
        torch.set_grad_enabled(False)

        # hook
        self.evaluation_loop.on_evaluation_start()

        # set up the eval loop
        self.evaluation_loop.setup(model, max_batches, dataloaders)

        # hook
        # TODO: should this be inside the dataloader loop?
        self.evaluation_loop.on_evaluation_epoch_start()

        # run validation/testing
        for dataloader_idx, dataloader in enumerate(dataloaders):
            # bookkeeping
            dl_outputs = []
            dataloader = self.accelerator_backend.process_dataloader(dataloader)
            dl_max_batches = self.evaluation_loop.max_batches[dataloader_idx]

            for batch_idx, batch in enumerate(dataloader):
                if batch is None:
                    continue

                # stop short when running on limited batches
                if batch_idx >= dl_max_batches:
                    break

                # hook
                self.evaluation_loop.on_evaluation_batch_start(batch, batch_idx, dataloader_idx)

                # lightning module methods
                output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx)
                output = self.evaluation_loop.evaluation_step_end(output)

                # hook
                self.evaluation_loop.on_evaluation_batch_end(batch, batch_idx, dataloader_idx)

                # clean up
                self.evaluation_loop.evaluation_batch_end_cleanup(output, batch_idx, dataloader_idx)
                self.evaluation_loop.log_step_metrics(output, batch_idx)

                # track epoch level metrics
                if output is not None:
                    dl_outputs.append(output)

            self.evaluation_loop.outputs.append(dl_outputs)

        # lightning module method
        eval_results = self.evaluation_loop.evaluation_epoch_end(num_dataloaders=len(dataloaders))

        # bookkeeping
        eval_loop_results = self.evaluation_loop.log_epoch_metrics(eval_results, test_mode)
        self.evaluation_loop.predictions.to_disk()

        # hook
        self.evaluation_loop.on_evaluation_epoch_end()

        # enable train mode again
        model.train()
        torch.set_grad_enabled(True)

        # hook
        self.evaluation_loop.on_evaluation_end()

        return eval_loop_results, eval_results

    def run_test(self):
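        """Run evaluation in test mode and return the results with any tensors converted to CPU scalars."""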
        # only load test dataloader for testing
        # self.reset_test_dataloader(ref_model)
        eval_loop_results, _ = self.run_evaluation(test_mode=True)

        if len(eval_loop_results) == 0:
            return 1

        # remove the tensors from the eval results
        for result in eval_loop_results:
            if isinstance(result, dict):
                for k, v in result.items():
                    if isinstance(v, torch.Tensor):
                        result[k] = v.cpu().item()

        return eval_loop_results

    def run_sanity_check(self, ref_model):
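        """Run a small number of validation batches before training to catch errors in the validation path early."""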
        using_val_step = ref_model.val_dataloader is not None and is_overridden('validation_step', ref_model)
        should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0

        # run tiny validation (if validation defined)
        # to make sure program won't crash during val
        if should_sanity_check:
            self.reset_val_dataloader(ref_model)
            self.num_sanity_val_batches = [
                min(self.num_sanity_val_steps, val_batches) for val_batches in self.num_val_batches
            ]

            # hook and callback
            self.running_sanity_check = True
            self.on_sanity_check_start()

            # run eval step
            _, eval_results = self.run_evaluation(test_mode=False, max_batches=self.num_sanity_val_batches)

            # allow no returns from eval
            if eval_results is not None and len(eval_results) > 0:
                # when we get a list back, use only the last item
                if isinstance(eval_results, list):
                    eval_results = eval_results[-1]

                if isinstance(eval_results, EvalResult):
                    callback_metrics = eval_results.callback_metrics
                else:
                    _, _, _, callback_metrics, _ = self.process_dict_result(eval_results)
                self.logger_connector.callback_metrics = callback_metrics

            self.on_sanity_check_end()
            self.running_sanity_check = False

    @trainer_state(entering=TrainerState.RUNNING, exiting=TrainerState.FINISHED)
    def test(
        self,
        model: Optional[LightningModule] = None,
        test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
        ckpt_path: Optional[str] = 'best',
        verbose: bool = True,
        datamodule: Optional[LightningDataModule] = None,
    ):
        # --------------------
        # SETUP HOOK
        # --------------------
        self.verbose_test = verbose

        if self.global_rank != 0:
            return

        # If you supply a datamodule you can't also supply test_dataloaders
        if test_dataloaders and datamodule:
            raise MisconfigurationException(
                'You cannot pass test_dataloaders to trainer.test if you supply a datamodule'
            )

        # Attach the datamodule so setup/prepare_data are added to the model before the call below
        self.data_connector.attach_datamodule(model or self.get_model(), datamodule, 'test')

        if model is not None:
            results = self.__test_given_model(model, test_dataloaders)
        else:
            results = self.__test_using_best_weights(ckpt_path, test_dataloaders)

        self.teardown('test')

        return results

    def __test_using_best_weights(self, ckpt_path, test_dataloaders):
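        """Load the requested checkpoint weights (``'best'`` by default) into the current model and run the test loop."""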
        model = self.get_model()

        # if user requests the best checkpoint but we don't have it, error
        if ckpt_path == 'best' and self.checkpoint_callback.save_top_k <= 0:
            raise MisconfigurationException(
                'ckpt_path is "best", but ModelCheckpoint is not configured to save the best model.'
            )

        # load best weights
        if ckpt_path is not None:
            # ckpt_path is 'best' so load the best model
            if ckpt_path == 'best':
                ckpt_path = self.checkpoint_callback.best_model_path

            if len(ckpt_path) == 0:
                rank_zero_warn(
                    f'.test() found no path for the best weights, {ckpt_path}. Please '
                    'specify a path for a checkpoint .test(ckpt_path=PATH)'
                )
                return {}

            ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
            model.load_state_dict(ckpt['state_dict'])

        # attach dataloaders
        if test_dataloaders is not None:
            self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders)

        # run tests
        self.tested_ckpt_path = ckpt_path
        self.testing = True
        os.environ['PL_TESTING_MODE'] = '1'
        self.model = model
        results = self.fit(model)
        self.testing = False
        del os.environ['PL_TESTING_MODE']

        # teardown
        if self.is_function_implemented('teardown'):
            model_ref = self.get_model()
            model_ref.teardown('test')

        return results

    def __test_given_model(self, model, test_dataloaders):
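        """Run the test loop on an explicitly provided model, optionally attaching the given test dataloaders."""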
        # attach data
        if test_dataloaders is not None:
            self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders)

        # run test
        # sets up testing so we short-circuit to eval
        self.testing = True
        self.model = model
        results = self.fit(model)
        self.testing = False

        # teardown
        if self.is_function_implemented('teardown'):
            model.teardown('test')

        return results

    def call_setup_hook(self, model):
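        """Call the ``setup`` hook on the datamodule (if present and not yet set up), the trainer, and the
        model for the current stage ('fit' or 'test')."""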
        # call setup after the ddp process has connected
        stage_name = 'test' if self.testing else 'fit'
        if self.datamodule is not None:
            called = self.datamodule.has_setup_test if self.testing else self.datamodule.has_setup_fit
            if not called:
                self.datamodule.setup(stage_name)
        self.setup(stage_name)
        model.setup(stage_name)

    def call_hook(self, hook_name, *args, **kwargs):
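        """Call ``hook_name`` on the trainer, then on the LightningModule (falling back to the accelerator
        backend), profiling the call and returning any hook output."""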
        # always profile hooks
        with self.profiler.profile(hook_name):

            # first call trainer hook
            if hasattr(self, hook_name):
                trainer_hook = getattr(self, hook_name)
                trainer_hook(*args, **kwargs)

            # next call hook in the LightningModule
            output = None
            model_ref = self.get_model()
            if is_overridden(hook_name, model_ref):
                hook_fx = getattr(model_ref, hook_name)
                output = hook_fx(*args, **kwargs)

            # if the PL module doesn't have the hook then call the accelerator
            # used to auto-reduce things for the user with Results obj
            elif hasattr(self.accelerator_backend, hook_name):
                accelerator_hook = getattr(self.accelerator_backend, hook_name)
                output = accelerator_hook(*args, **kwargs)

            return output


# add docstrings
Trainer.__init__.__doc__ = docstrings.trainer.init
Trainer.fit.__doc__ = docstrings.trainer.fit
Trainer.test.__doc__ = docstrings.trainer.test