354 lines
12 KiB
Python
354 lines
12 KiB
Python
# Copyright The PyTorch Lightning team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import inspect
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Dict, Generic, Optional, Type, TypeVar, Union
|
|
|
|
from deprecate import void
|
|
from torchmetrics import Metric
|
|
|
|
import pytorch_lightning as pl
|
|
from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection
|
|
from pytorch_lightning.trainer.progress import BaseProgress
|
|
from pytorch_lightning.utilities.exceptions import MisconfigurationException
|
|
from pytorch_lightning.utilities.imports import _fault_tolerant_training
|
|
|
|
T = TypeVar("T") # the output type of `run`
|
|
|
|
|
|
class Loop(ABC, Generic[T]):
|
|
"""Basic Loops interface. All classes derived from this must implement the following properties and methods:
|
|
|
|
* :attr:`done` (property): Condition to break the loop
|
|
* :attr:`reset` (method): Resets the internal state between multiple calls of :attr:`run`
|
|
* :attr:`advance` (method): Implements one step of the loop
|
|
|
|
This class implements the following loop structure:
|
|
|
|
.. code-block:: python
|
|
|
|
on_run_start()
|
|
|
|
while not done:
|
|
on_advance_start()
|
|
advance()
|
|
on_advance_end()
|
|
|
|
on_run_end()
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
self._restarting = False
|
|
self._trainer: Optional["pl.Trainer"] = None
|
|
|
|
@property
|
|
def trainer(self) -> "pl.Trainer":
|
|
if self._trainer is None:
|
|
raise RuntimeError("The loop is not attached to a Trainer.")
|
|
return self._trainer
|
|
|
|
@trainer.setter
|
|
def trainer(self, trainer: "pl.Trainer") -> None:
|
|
"""Connects this loop's trainer and its children."""
|
|
if not isinstance(trainer, pl.Trainer):
|
|
raise MisconfigurationException(
|
|
f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}."
|
|
)
|
|
self._trainer = trainer
|
|
for v in self.__dict__.values():
|
|
if isinstance(v, Loop):
|
|
v.trainer = trainer
|
|
|
|
@property
|
|
def restarting(self) -> bool:
|
|
"""Whether the state of this loop was reloaded and it needs to restart."""
|
|
return self._restarting
|
|
|
|
@restarting.setter
|
|
def restarting(self, restarting: bool) -> None:
|
|
"""Connects this loop's restarting value and its children."""
|
|
self._restarting = restarting
|
|
for loop in vars(self).values():
|
|
if isinstance(loop, Loop):
|
|
loop.restarting = restarting
|
|
|
|
@property
|
|
@abstractmethod
|
|
def done(self) -> bool:
|
|
"""Property indicating when the loop is finished.
|
|
|
|
Example::
|
|
|
|
@property
|
|
def done(self):
|
|
return self.trainer.global_step >= self.trainer.max_steps
|
|
"""
|
|
|
|
@property
|
|
def skip(self) -> bool:
|
|
"""Determine whether to return immediately from the call to :meth:`run`.
|
|
|
|
Example::
|
|
|
|
@property
|
|
def skip(self):
|
|
return len(self.trainer.train_dataloader) == 0
|
|
"""
|
|
return False
|
|
|
|
def connect(self, **kwargs: "Loop") -> None:
|
|
"""Optionally connect one or multiple loops to this one.
|
|
|
|
Linked loops should form a tree.
|
|
"""
|
|
|
|
def replace(self, **loops: Union["Loop", Type["Loop"]]) -> None:
|
|
"""Optionally replace one or multiple of this loop's sub-loops.
|
|
|
|
This methods takes care of instantiating the class (if necessary) with all existing arguments, connecting all
|
|
sub-loops of the old loop to the new instance, setting the ``Trainer`` reference, and connecting the new loop to
|
|
the parent.
|
|
|
|
Args:
|
|
**loops: ``Loop`` subclasses or instances. The name used should match the loop attribute name you want to
|
|
replace.
|
|
|
|
Raises:
|
|
MisconfigurationException: When passing a ``Loop`` class, if the ``__init__`` arguments do not match those
|
|
of the Loop class it replaces.
|
|
"""
|
|
new_loops = {}
|
|
|
|
for name, type_or_object in loops.items():
|
|
old_loop = getattr(self, name)
|
|
|
|
if isinstance(type_or_object, type):
|
|
# compare the signatures
|
|
old_parameters = inspect.signature(old_loop.__class__.__init__).parameters
|
|
current_parameters = inspect.signature(type_or_object.__init__).parameters
|
|
if old_parameters != current_parameters:
|
|
raise MisconfigurationException(
|
|
f"`{self.__class__.__name__}.replace({type_or_object.__name__})` can only be used if the"
|
|
f" `__init__` signatures match but `{old_loop.__class__.__name__}` does not."
|
|
)
|
|
# instantiate the loop
|
|
kwargs = {p: getattr(old_loop, p) for p in old_parameters if p != "self"}
|
|
loop = type_or_object(**kwargs)
|
|
else:
|
|
loop = type_or_object
|
|
|
|
# connect sub-loops
|
|
kwargs = {n: l for n, l in old_loop.__dict__.items() if isinstance(l, Loop)}
|
|
loop.connect(**kwargs)
|
|
# set the trainer reference
|
|
loop.trainer = self.trainer
|
|
|
|
new_loops[name] = loop
|
|
# connect to self
|
|
self.connect(**new_loops)
|
|
|
|
def on_skip(self) -> T:
|
|
"""The function to run when :meth:`run` should be skipped, determined by the condition in :attr:`skip`.
|
|
|
|
Returns:
|
|
the default output value of :meth:`on_run_end`
|
|
"""
|
|
|
|
def run(self, *args: Any, **kwargs: Any) -> T:
|
|
"""The main entry point to the loop.
|
|
|
|
Will frequently check the :attr:`done` condition and calls :attr:`advance`
|
|
until :attr:`done` evaluates to ``True``.
|
|
|
|
Override this if you wish to change the default behavior. The default implementation is:
|
|
|
|
Example::
|
|
|
|
def run(self, *args, **kwargs):
|
|
if self.skip:
|
|
return self.on_skip()
|
|
|
|
self.reset()
|
|
self.on_run_start(*args, **kwargs)
|
|
|
|
while not self.done:
|
|
self.advance(*args, **kwargs)
|
|
|
|
output = self.on_run_end()
|
|
return output
|
|
|
|
Returns:
|
|
The output of :attr:`on_run_end` (often outputs collected from each step of the loop)
|
|
"""
|
|
if self.skip:
|
|
return self.on_skip()
|
|
|
|
self.reset()
|
|
|
|
self.on_run_start(*args, **kwargs)
|
|
|
|
while not self.done:
|
|
try:
|
|
self.on_advance_start(*args, **kwargs)
|
|
self.advance(*args, **kwargs)
|
|
self.on_advance_end()
|
|
self._restarting = False
|
|
except StopIteration:
|
|
break
|
|
self._restarting = False
|
|
|
|
output = self.on_run_end()
|
|
return output
|
|
|
|
@abstractmethod
|
|
def reset(self) -> None:
|
|
"""Resets the internal state of the loop at the beginning of each call to :attr:`run`.
|
|
|
|
Example::
|
|
|
|
def reset(self):
|
|
# reset your internal state or add custom logic
|
|
# if you expect run() to be called multiple times
|
|
self.current_iteration = 0
|
|
self.outputs = []
|
|
"""
|
|
|
|
def on_run_start(self, *args: Any, **kwargs: Any) -> None:
|
|
"""Hook to be called as the first thing after entering :attr:`run` (except the state reset).
|
|
|
|
Accepts all arguments passed to :attr:`run`.
|
|
"""
|
|
void(*args, **kwargs)
|
|
|
|
def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
|
|
"""Hook to be called each time before :attr:`advance` is called.
|
|
|
|
Accepts all arguments passed to :attr`run`.
|
|
"""
|
|
void(*args, **kwargs)
|
|
|
|
@abstractmethod
|
|
def advance(self, *args: Any, **kwargs: Any) -> None:
|
|
"""Performs a single step.
|
|
|
|
Accepts all arguments passed to :attr:`run`.
|
|
|
|
Example::
|
|
|
|
def advance(self, iterator):
|
|
batch = next(iterator)
|
|
loss = self.trainer.lightning_module.training_step(batch, batch_idx)
|
|
...
|
|
"""
|
|
|
|
def on_advance_end(self) -> None:
|
|
"""Hook to be called each time after :attr:`advance` is called."""
|
|
|
|
def on_run_end(self) -> T:
|
|
"""Hook to be called at the end of the run.
|
|
|
|
Its return argument is returned from :attr:`run`.
|
|
"""
|
|
|
|
def teardown(self) -> None:
|
|
"""Use to release memory etc."""
|
|
|
|
def on_save_checkpoint(self) -> Dict:
|
|
"""Called when saving a model checkpoint, use to persist loop state.
|
|
|
|
Returns:
|
|
The current loop state.
|
|
"""
|
|
return {}
|
|
|
|
def on_load_checkpoint(self, state_dict: Dict) -> None:
|
|
"""Called when loading a model checkpoint, use to reload loop state."""
|
|
|
|
def state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Dict:
|
|
"""The state dict is determined by the state and progress of this loop and all its children.
|
|
|
|
Args:
|
|
destination: An existing dictionary to update with this loop's state. By default a new dictionary
|
|
is returned.
|
|
prefix: A prefix for each key in the state dictionary
|
|
"""
|
|
if destination is None:
|
|
destination = {}
|
|
|
|
destination[prefix + "state_dict"] = self.on_save_checkpoint()
|
|
|
|
# do not get the mode from `self.trainer` because it might not have been attached yet
|
|
ft_enabled = _fault_tolerant_training()
|
|
for k, v in self.__dict__.items():
|
|
key = prefix + k
|
|
if isinstance(v, BaseProgress):
|
|
destination[key] = v.state_dict()
|
|
elif isinstance(v, Loop):
|
|
v.state_dict(destination, key + ".")
|
|
elif ft_enabled and isinstance(v, _ResultCollection):
|
|
# sync / unsync metrics
|
|
v.sync()
|
|
destination[key] = v.state_dict()
|
|
v.unsync()
|
|
|
|
return destination
|
|
|
|
def load_state_dict(
|
|
self,
|
|
state_dict: Dict,
|
|
prefix: str = "",
|
|
metrics: Optional[Dict[str, Metric]] = None,
|
|
) -> None:
|
|
"""Loads the state of this loop and all its children."""
|
|
self._load_from_state_dict(state_dict.copy(), prefix, metrics)
|
|
for k, v in self.__dict__.items():
|
|
if isinstance(v, Loop):
|
|
v.load_state_dict(state_dict.copy(), prefix + k + ".")
|
|
self.restarting = True
|
|
|
|
def _load_from_state_dict(self, state_dict: Dict, prefix: str, metrics: Optional[Dict[str, Metric]] = None) -> None:
|
|
for k, v in self.__dict__.items():
|
|
key = prefix + k
|
|
if key not in state_dict:
|
|
# compatibility with old checkpoints
|
|
continue
|
|
|
|
if isinstance(v, BaseProgress):
|
|
v.load_state_dict(state_dict[key])
|
|
elif (
|
|
isinstance(v, _ResultCollection)
|
|
and self.trainer is not None
|
|
and self.trainer.lightning_module is not None
|
|
):
|
|
metric_attributes = {
|
|
name: module
|
|
for name, module in self.trainer.lightning_module.named_modules()
|
|
if isinstance(module, Metric)
|
|
}
|
|
if metrics:
|
|
metric_attributes.update(metrics)
|
|
|
|
# The `_ResultCollection` objects have 2 types of metrics: `Tensor` and `torchmetrics.Metric`.
|
|
# When creating a checkpoint, the `Metric`s are dropped from the loop `state_dict` to serialize only
|
|
# Python primitives. However, their states are saved with the model's `state_dict`.
|
|
# On reload, we need to re-attach the `Metric`s back to the `_ResultCollection`.
|
|
# The references are provided through the `metric_attributes` dictionary.
|
|
v.load_state_dict(state_dict[key], metrics=metric_attributes, sync_fn=self.trainer.strategy.reduce)
|
|
|
|
if not self.trainer.is_global_zero:
|
|
v.reset(metrics=False)
|
|
|
|
if prefix + "state_dict" in state_dict: # compatibility with old checkpoints
|
|
self.on_load_checkpoint(state_dict[prefix + "state_dict"])
|