2020-03-12 16:47:23 +00:00
|
|
|
from typing import Any
|
2019-11-28 17:48:55 +00:00
|
|
|
|
2019-03-31 01:45:16 +00:00
|
|
|
import torch
|
2020-03-12 16:47:23 +00:00
|
|
|
from torch import Tensor
|
2020-06-10 13:43:12 +00:00
|
|
|
from torch.nn import Module
|
2020-03-12 16:47:23 +00:00
|
|
|
from torch.optim.optimizer import Optimizer
|
2020-06-27 01:45:13 +00:00
|
|
|
from pytorch_lightning.utilities import move_data_to_device, NATIVE_AMP_AVALAIBLE
|
2020-06-03 01:45:19 +00:00
|
|
|
|
2019-08-04 18:08:14 +00:00
|
|
|
|
2019-10-24 11:56:56 +00:00
|
|
|
# Optional dependency: NVIDIA apex. ``APEX_AVAILABLE`` records whether the
# import succeeded; ``amp`` is only bound in this module when it did.
APEX_AVAILABLE = True
try:
    from apex import amp
except ImportError:
    APEX_AVAILABLE = False
|
2019-10-24 11:56:56 +00:00
|
|
|
|
|
|
|
|
2020-06-10 13:43:12 +00:00
|
|
|
class ModelHooks(Module):
|
2019-08-07 11:51:55 +00:00
|
|
|
|
2020-06-18 11:21:44 +00:00
|
|
|
def setup(self, stage: str):
|
2020-06-17 23:49:58 +00:00
|
|
|
"""
|
|
|
|
Called at the beginning of fit and test.
|
2020-06-19 06:38:10 +00:00
|
|
|
This is a good hook when you need to build models dynamically or adjust something about them.
|
|
|
|
This hook is called on every process when using DDP.
|
2020-06-17 23:49:58 +00:00
|
|
|
|
|
|
|
Args:
|
2020-06-18 12:29:18 +00:00
|
|
|
stage: either 'fit' or 'test'
|
2020-06-19 06:38:10 +00:00
|
|
|
|
|
|
|
Example::
|
|
|
|
|
|
|
|
class LitModel(...):
|
|
|
|
def __init__(self):
|
|
|
|
self.l1 = None
|
|
|
|
|
|
|
|
def prepare_data(self):
|
|
|
|
download_data()
|
|
|
|
tokenize()
|
|
|
|
|
|
|
|
# don't do this
|
|
|
|
self.something = else
|
|
|
|
|
2020-07-09 15:54:38 +00:00
|
|
|
def setup(stage):
|
2020-06-19 06:38:10 +00:00
|
|
|
data = Load_data(...)
|
|
|
|
self.l1 = nn.Linear(28, data.num_classes)
|
|
|
|
|
2020-06-17 23:49:58 +00:00
|
|
|
"""
|
|
|
|
|
2020-06-18 11:21:44 +00:00
|
|
|
def teardown(self, stage: str):
|
2020-06-17 23:49:58 +00:00
|
|
|
"""
|
|
|
|
Called at the end of fit and test.
|
|
|
|
|
|
|
|
Args:
|
2020-06-19 04:42:20 +00:00
|
|
|
stage: either 'fit' or 'test'
|
2020-06-17 23:49:58 +00:00
|
|
|
"""
|
|
|
|
|
2020-06-17 11:37:16 +00:00
|
|
|
def on_fit_start(self):
    """
    Hook invoked at the very start of fit.

    With DDP, every process calls this hook.
    The default implementation does nothing.
    """
|
|
|
|
|
|
|
|
def on_fit_end(self):
    """
    Hook invoked at the very end of fit.

    With DDP, every process calls this hook.
    The default implementation does nothing.
    """
|
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_train_start(self) -> None:
    """
    Hook invoked when training begins, before the sanity check.

    The default implementation does nothing; override it to run custom
    logic at the start of training.
    """
|
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_train_end(self) -> None:
    """
    Hook invoked when training ends, before the logger experiment is closed.

    The default implementation does nothing; override it to run custom
    logic at the end of training.
    """
|
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_batch_start(self, batch: Any) -> None:
    """
    Hook invoked in the training loop before anything is done with the batch.

    Returning -1 from an override skips training for the remainder of the
    current epoch. The default implementation does nothing.

    Args:
        batch: The batched data as it is returned by the training DataLoader.
    """
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_batch_end(self) -> None:
    """
    Hook invoked in the training loop once the batch has been processed.

    The default implementation does nothing; override it to run custom
    logic when the batch ends.
    """
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_epoch_start(self) -> None:
    """
    Hook invoked in the training loop at the very start of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch starts.
    """
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_epoch_end(self) -> None:
    """
    Hook invoked in the training loop at the very end of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch ends.
    """
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2020-07-20 23:00:20 +00:00
|
|
|
def on_train_epoch_start(self) -> None:
    """
    Hook invoked in the training loop at the very start of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch starts.
    """
|
|
|
|
|
|
|
|
def on_train_epoch_end(self) -> None:
    """
    Hook invoked in the training loop at the very end of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch ends.
    """
|
|
|
|
|
|
|
|
def on_validation_epoch_start(self) -> None:
    """
    Hook invoked in the validation loop at the very start of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch starts.
    """
|
|
|
|
|
|
|
|
def on_validation_epoch_end(self) -> None:
    """
    Hook invoked in the validation loop at the very end of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch ends.
    """
|
|
|
|
|
|
|
|
def on_test_epoch_start(self) -> None:
    """
    Hook invoked in the test loop at the very start of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch starts.
    """
|
|
|
|
|
|
|
|
def on_test_epoch_end(self) -> None:
    """
    Hook invoked in the test loop at the very end of the epoch.

    The default implementation does nothing; override it to run custom
    logic when the epoch ends.
    """
|
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_pre_performance_check(self) -> None:
    """
    Hook invoked at the very start of the validation loop.

    The default implementation does nothing; override it to run custom
    logic before validation starts.
    """
|
2019-03-31 01:45:16 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_post_performance_check(self) -> None:
    """
    Hook invoked at the very end of the validation loop.

    The default implementation does nothing; override it to run custom
    logic as validation finishes.
    """
|
2019-04-21 16:26:35 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_before_zero_grad(self, optimizer: Optimizer) -> None:
    """
    Hook invoked after ``optimizer.step()`` and before ``optimizer.zero_grad()``.

    It runs in the training loop once an optimizer step has been taken and
    before the gradients are zeroed, which makes it a good place to inspect
    weight information with the weights already updated.

    The default implementation does nothing.

    This is where it is called::

        for optimizer in optimizers:
            optimizer.step()
            model.on_before_zero_grad(optimizer)  # < ---- called here
            optimizer.zero_grad()

    Args:
        optimizer: The optimizer for which grads should be zeroed.
    """
|
2019-07-21 22:15:58 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def on_after_backward(self) -> None:
    """
    Hook invoked in the training loop after ``loss.backward()`` and before
    the optimizers do anything.

    This is the ideal place to inspect or log gradient information.
    The default implementation does nothing.

    Example::

        def on_after_backward(self):
            # example to inspect gradient information in tensorboard
            if self.trainer.global_step % 25 == 0:  # don't make the tf file huge
                for name, param in self.named_parameters():
                    self.logger.experiment.add_histogram(
                        tag=name,
                        values=param.grad,
                        global_step=self.trainer.global_step,
                    )

    """
|
2019-10-24 11:56:56 +00:00
|
|
|
|
2020-03-12 16:47:23 +00:00
|
|
|
def backward(self, trainer, loss: Tensor, optimizer: Optimizer, optimizer_idx: int) -> None:
    """
    Perform the backward step; override it if you need your own implementation.

    The loss handed in has already been scaled for accumulated gradients
    if that was requested. The default implementation only calls
    ``loss.backward()``; ``trainer``, ``optimizer`` and ``optimizer_idx``
    are not used by it.

    Args:
        trainer: Pointer to the trainer
        loss: Loss is already scaled by accumulated grads
        optimizer: Current optimizer being used
        optimizer_idx: Index of the current optimizer being used

    Example::

        def backward(self, trainer, loss, optimizer, optimizer_idx):
            loss.backward()

    """
    # Default behaviour: plain autograd backward pass on the given loss.
    loss.backward()
|
|
|
|
|
|
|
|
def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
    """
    Scale a loss for mixed-precision training.

    When ``NATIVE_AMP_AVALAIBLE`` is truthy the loss is passed to
    ``self.trainer.scaler.scale``; otherwise it is passed, together with the
    optimizer, to apex's ``amp.scale_loss``.

    Args:
        unscaled_loss: The loss to scale.
        optimizer: Optimizer forwarded to ``amp.scale_loss`` on the apex path;
            unused on the native path.
        optimizer_idx: Not used by this implementation.

    Returns:
        Whatever the selected backend call returns.
        NOTE(review): apex's ``amp.scale_loss`` is documented as a context
        manager, so the two branches presumably yield different kinds of
        objects -- confirm against the callers.
    """
    if not NATIVE_AMP_AVALAIBLE:
        # apex path: ``amp`` is only bound if the module-level apex import succeeded.
        return amp.scale_loss(unscaled_loss, optimizer)

    # Native path: use the scaler held by the trainer.
    return self.trainer.scaler.scale(unscaled_loss)
|
2020-06-03 01:45:19 +00:00
|
|
|
|
|
|
|
def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any:
    """
    Move a batch to the given device.

    Override this hook if your :class:`~torch.utils.data.DataLoader` returns
    tensors wrapped in a custom data structure.

    These data types (and any arbitrary nesting of them) are supported out of the box:

    - :class:`torch.Tensor` or anything that implements `.to(...)`
    - :class:`list`
    - :class:`dict`
    - :class:`tuple`
    - :class:`torchtext.data.batch.Batch`

    For anything else, you need to define how the data is moved to the target
    device (CPU, GPU, TPU, ...).

    Example::

        def transfer_batch_to_device(self, batch, device):
            if isinstance(batch, CustomBatch):
                # move all tensors in your custom data structure to the device
                batch.samples = batch.samples.to(device)
                batch.targets = batch.targets.to(device)
            else:
                batch = super().transfer_batch_to_device(batch, device)
            return batch

    Args:
        batch: A batch of data that needs to be transferred to a new device.
        device: The target device as defined in PyTorch.

    Returns:
        A reference to the data on the new device.

    Note:
        This hook should only transfer the data and not modify it, nor should
        it move the data to any other device than the one passed in as
        argument (unless you know what you are doing).
        The :class:`~pytorch_lightning.trainer.trainer.Trainer` already takes
        care of splitting the batch and determines the target devices.

    See Also:
        - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`
        - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`
    """
    # Default behaviour: delegate entirely to the shared utility.
    moved_batch = move_data_to_device(batch, device)
    return moved_batch
|