# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Model Checkpointing
===================

Automatically save model checkpoints during training.

"""

import os
import re
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Optional, Union

import numpy as np
import torch
import yaml

from pytorch_lightning import _logger as log
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.warnings import WarningCache

warning_cache = WarningCache()


class ModelCheckpoint(Callback):
    r"""
    Save the model after every epoch by monitoring a quantity.

    After training finishes, use :attr:`best_model_path` to retrieve the path to the
    best checkpoint file and :attr:`best_model_score` to retrieve its score.

    Args:
        dirpath: directory to save the model file.

            Example::

                # custom path
                # saves a file like: my/path/epoch=0-step=10.ckpt
                >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/')

            By default, dirpath is ``None`` and will be set at runtime to the location
            specified by :class:`~pytorch_lightning.trainer.trainer.Trainer`'s
            :paramref:`~pytorch_lightning.trainer.trainer.Trainer.default_root_dir` or
            :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_save_path` arguments,
            and if the Trainer uses a logger, the path will also contain logger name and version.

        filename: checkpoint filename. Can contain named formatting options to be auto-filled.

            Example::

                # save any arbitrary metrics like `val_loss`, etc. in name
                # saves a file like: my/path/epoch=2-val_loss=0.02-other_metric=0.03.ckpt
                >>> checkpoint_callback = ModelCheckpoint(
                ...     dirpath='my/path',
                ...     filename='{epoch}-{val_loss:.2f}-{other_metric:.2f}'
                ... )

            By default, filename is ``None`` and will be set to ``'{epoch}-{step}'``.

        monitor: quantity to monitor. By default it is ``None`` which saves a checkpoint only for the last epoch.
        verbose: verbosity mode. Default: ``False``.
        save_last: When ``True``, always saves the model at the end of the epoch to
            a file `last.ckpt`. Default: ``None``.
        save_top_k: if ``save_top_k == k``,
            the best k models according to
            the quantity monitored will be saved.
            if ``save_top_k == 0``, no models are saved.
            if ``save_top_k == -1``, all models are saved.
            Please note that the monitors are checked every ``period`` epochs.
            if ``save_top_k >= 2`` and the callback is called multiple
            times inside an epoch, the name of the saved file will be
            appended with a version count starting with ``v1``.
        mode: one of {auto, min, max}.
            If ``save_top_k != 0``, the decision
            to overwrite the current save file is made
            based on either the maximization or the
            minimization of the monitored quantity. For `val_acc`,
            this should be `max`, for `val_loss` this should
            be `min`, etc. In `auto` mode, the direction is
            automatically inferred from the name of the monitored quantity.

            .. warning::
               Setting ``mode='auto'`` has been deprecated in v1.1 and will be removed in v1.3.

        save_weights_only: if ``True``, then only the model's weights will be
            saved (``model.save_weights(filepath)``), else the full model
            is saved (``model.save(filepath)``).
        period: Interval (number of epochs) between checkpoints.
        prefix: A string to put at the beginning of checkpoint filename.

            .. warning::
               This argument has been deprecated in v1.1 and will be removed in v1.3

    Note:
        For extra customization, ModelCheckpoint includes the following attributes:

        - ``CHECKPOINT_JOIN_CHAR = "-"``
        - ``CHECKPOINT_NAME_LAST = "last"``
        - ``FILE_EXTENSION = ".ckpt"``
        - ``STARTING_VERSION = 1``

        For example, you can change the default last checkpoint name by doing
        ``checkpoint_callback.CHECKPOINT_NAME_LAST = "{epoch}-last"``

    Raises:
        MisconfigurationException:
            If ``save_top_k`` is neither ``None`` nor more than or equal to ``-1``,
            if ``monitor`` is ``None`` and ``save_top_k`` is none of ``None``, ``-1``, and ``0``, or
            if ``mode`` is none of ``"min"``, ``"max"``, and ``"auto"``.
        ValueError:
            If ``trainer.save_checkpoint`` is ``None``.

    Example::

        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import ModelCheckpoint

        # saves checkpoints to 'my/path/' at every epoch
        >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
        >>> trainer = Trainer(callbacks=[checkpoint_callback])

        # save epoch and val_loss in name
        # saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt
        >>> checkpoint_callback = ModelCheckpoint(
        ...     monitor='val_loss',
        ...     dirpath='my/path/',
        ...     filename='sample-mnist-{epoch:02d}-{val_loss:.2f}'
        ... )

        # retrieve the best checkpoint after training
        checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
        trainer = Trainer(callbacks=[checkpoint_callback])
        model = ...
        trainer.fit(model)
        checkpoint_callback.best_model_path

    """

    CHECKPOINT_JOIN_CHAR = "-"
    CHECKPOINT_NAME_LAST = "last"
    FILE_EXTENSION = ".ckpt"
    STARTING_VERSION = 1

    def __init__(
        self,
        dirpath: Optional[Union[str, Path]] = None,
        filename: Optional[str] = None,
        monitor: Optional[str] = None,
        verbose: bool = False,
        save_last: Optional[bool] = None,
        save_top_k: Optional[int] = None,
        save_weights_only: bool = False,
        mode: str = "auto",
        period: int = 1,
        prefix: str = "",
    ):
        super().__init__()
        self.monitor = monitor
        self.verbose = verbose
        self.save_last = save_last
        self.save_top_k = save_top_k
        self.save_weights_only = save_weights_only
        self.period = period
        self._last_global_step_saved = -1
        self.prefix = prefix
        self.current_score = None
        self.best_k_models = {}
        self.kth_best_model_path = ""
        self.best_model_score = None
        self.best_model_path = ""
        self.last_model_path = ""
        self.save_function = None
        self.warned_result_obj = False

        if prefix:
            rank_zero_warn(
                'Argument `prefix` is deprecated in v1.1 and will be removed in v1.3.'
                ' Please prepend your prefix in `filename` instead.', DeprecationWarning
            )

        self.__init_monitor_mode(monitor, mode)
        self.__init_ckpt_dir(dirpath, filename, save_top_k)
        self.__validate_init_configuration()

    def on_pretrain_routine_start(self, trainer, pl_module):
        """
        When the pretrain routine starts we build the ckpt dir on the fly.
        """
        self.__resolve_ckpt_dir(trainer)
        self.save_function = trainer.save_checkpoint

    def on_validation_end(self, trainer, pl_module):
        """
        Checkpoints can be saved at the end of the val loop.
        """
        self.save_checkpoint(trainer, pl_module)

    def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]:
        return {
            "monitor": self.monitor,
            "best_model_score": self.best_model_score,
            "best_model_path": self.best_model_path,
            "current_score": self.current_score,
            "dirpath": self.dirpath
        }

    def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]):
        self.best_model_score = checkpointed_state["best_model_score"]
        self.best_model_path = checkpointed_state["best_model_path"]
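
    # Illustrative sketch (kept as a comment so module behaviour is unchanged): the two hooks
    # above round-trip the callback state through the checkpoint dict. Roughly, the trainer
    # persists the dict returned by ``on_save_checkpoint`` and later hands it back to
    # ``on_load_checkpoint``, e.g.
    #
    #     state = checkpoint_callback.on_save_checkpoint(trainer, pl_module)
    #     # -> {"monitor": ..., "best_model_score": ..., "best_model_path": ...,
    #     #     "current_score": ..., "dirpath": ...}
    #     checkpoint_callback.on_load_checkpoint(state)
    #     assert checkpoint_callback.best_model_path == state["best_model_path"]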

    def save_checkpoint(self, trainer, pl_module):
        """
        Performs the main logic around saving a checkpoint.
        This method runs on all ranks; it is the responsibility of `self.save_function`
        to handle correct behaviour in distributed training, i.e., saving only on rank 0.
        """
        epoch = trainer.current_epoch
        global_step = trainer.global_step

        if (
            trainer.fast_dev_run  # disable checkpointing with fast_dev_run
            or self.save_top_k == 0  # no models are saved
            or self.period < 1  # no models are saved
            or (epoch + 1) % self.period  # skip epoch
            or trainer.running_sanity_check  # don't save anything during sanity check
            or self._last_global_step_saved == global_step  # already saved at the last step
        ):
            return

        self._add_backward_monitor_support(trainer)
        self._validate_monitor_key(trainer)

        # track the global step at which the ckpt was last checked
        self._last_global_step_saved = global_step

        # what can be monitored
        monitor_candidates = self._monitor_candidates(trainer)

        # callback supports multiple simultaneous modes
        # here we call each mode sequentially
        # Mode 1: save all checkpoints OR only the top k
        if self.save_top_k:
            self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates)

        # Mode 2: save the last checkpoint
        self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
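
    # A small worked sketch of the skip condition above (comment only, not executed):
    # with ``period=2`` the callback saves on epochs 1, 3, 5, ... because
    # ``(epoch + 1) % self.period`` is truthy (non-zero) on the epochs that get skipped:
    #
    #     period = 2
    #     [(epoch, (epoch + 1) % period == 0) for epoch in range(4)]
    #     # -> [(0, False), (1, True), (2, False), (3, True)]   # (epoch, saves?)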

    def __validate_init_configuration(self):
        if self.save_top_k is not None and self.save_top_k < -1:
            raise MisconfigurationException(f'Invalid value for save_top_k={self.save_top_k}. Must be None or >= -1')
        if self.monitor is None:
            # None: save last epoch, -1: save all epochs, 0: nothing is saved
            if self.save_top_k not in [None, -1, 0]:
                raise MisconfigurationException(
                    f'ModelCheckpoint(save_top_k={self.save_top_k}, monitor=None) is not a valid'
                    ' configuration. No quantity for top_k to track.'
                )
            if self.save_last:
                rank_zero_warn(
                    'ModelCheckpoint(save_last=True, monitor=None) is a redundant configuration.'
                    ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None).'
                )
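
    # Configuration sketch (comment only): combinations the validation above accepts or rejects.
    # These calls are illustrative, not part of the module:
    #
    #     ModelCheckpoint(monitor=None, save_top_k=None)       # ok: save only the last epoch
    #     ModelCheckpoint(monitor=None, save_top_k=-1)         # ok: save every epoch
    #     ModelCheckpoint(monitor=None, save_top_k=3)          # raises MisconfigurationException
    #     ModelCheckpoint(monitor='val_loss', save_top_k=-2)   # raises MisconfigurationException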

    def __init_ckpt_dir(self, dirpath, filename, save_top_k):
        self._fs = get_filesystem(str(dirpath) if dirpath else '')

        if (
            save_top_k is not None and save_top_k > 0 and dirpath is not None and self._fs.isdir(dirpath)
            and len(self._fs.ls(dirpath)) > 0
        ):
            rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

        if dirpath and self._fs.protocol == 'file':
            dirpath = os.path.realpath(dirpath)

        self.dirpath: Union[str, None] = dirpath or None
        self.filename = filename or None

    def __init_monitor_mode(self, monitor, mode):
        torch_inf = torch.tensor(np.Inf)
        mode_dict = {
            "min": (torch_inf, "min"),
            "max": (-torch_inf, "max"),
        }

        if mode not in mode_dict and mode != 'auto':
            raise MisconfigurationException(f"`mode` can be auto, {', '.join(mode_dict.keys())}, got {mode}")

        # TODO: Update with MisconfigurationException when auto mode is removed in v1.3
        if mode == 'auto':
            rank_zero_warn(
                "mode='auto' is deprecated in v1.1 and will be removed in v1.3."
                " Default value for mode will be 'min' in v1.3.", DeprecationWarning
            )

            _condition = monitor is not None and ("acc" in monitor or monitor.startswith("fmeasure"))
            mode_dict['auto'] = ((-torch_inf, "max") if _condition else (torch_inf, "min"))

        self.kth_value, self.mode = mode_dict[mode]
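
    # Sketch of the resulting (kth_value, mode) pairs (comment only):
    #
    #     mode='min'  -> kth_value=inf,  lower monitored values are better
    #     mode='max'  -> kth_value=-inf, higher monitored values are better
    #     mode='auto' -> behaves like 'max' if the monitor name contains "acc" or starts with
    #                    "fmeasure", otherwise like 'min' (deprecated, see the warning above)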

    @rank_zero_only
    def _del_model(self, filepath: str):
        if self._fs.exists(filepath):
            self._fs.rm(filepath)
            log.debug(f"Removed checkpoint: {filepath}")

    def _save_model(self, filepath: str, trainer, pl_module):
        # Todo: required argument `pl_module` is not used
        # in debugging, track when we save checkpoints
        trainer.dev_debugger.track_checkpointing_history(filepath)

        # make paths
        if trainer.is_global_zero:
            self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)

        # delegate the saving to the trainer
        if self.save_function is not None:
            self.save_function(filepath, self.save_weights_only)
        else:
            raise ValueError(".save_function() not set")

    def check_monitor_top_k(self, current) -> bool:
        if current is None:
            return False

        if self.save_top_k == -1:
            return True

        less_than_k_models = len(self.best_k_models) < self.save_top_k
        if less_than_k_models:
            return True

        if not isinstance(current, torch.Tensor):
            rank_zero_warn(
                f"{current} is supposed to be a `torch.Tensor`. Saving checkpoint may not work correctly."
                f" HINT: check the value of {self.monitor} in your validation loop",
                RuntimeWarning,
            )
            current = torch.tensor(current)

        monitor_op = {"min": torch.lt, "max": torch.gt}[self.mode]
        return monitor_op(current, self.best_k_models[self.kth_best_model_path]).item()
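
    # Worked example for the comparison above (comment only): with ``mode='min'`` and the
    # current worst of the best-k checkpoints at 0.30, a new ``val_loss`` of 0.25 displaces it:
    #
    #     monitor_op = torch.lt                                       # 'min' -> lower is better
    #     monitor_op(torch.tensor(0.25), torch.tensor(0.30)).item()   # -> True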

    @classmethod
    def _format_checkpoint_name(
        cls,
        filename: Optional[str],
        epoch: int,
        step: int,
        metrics: Dict[str, Any],
        prefix: str = "",
    ) -> str:
        if not filename:
            # filename is not set, use default name
            filename = "{epoch}" + cls.CHECKPOINT_JOIN_CHAR + "{step}"

        # check and parse user passed keys in the string
        groups = re.findall(r"(\{.*?)[:\}]", filename)
        if len(groups) >= 0:
            metrics.update({"epoch": epoch, 'step': step})
            for group in groups:
                name = group[1:]
                filename = filename.replace(group, name + "={" + name)
                if name not in metrics:
                    metrics[name] = 0
            filename = filename.format(**metrics)

        if prefix:
            filename = cls.CHECKPOINT_JOIN_CHAR.join([prefix, filename])

        return filename
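
    # Sketch of the template rewriting performed above (comment only): each ``{name}`` group is
    # turned into ``name={name}`` before ``str.format`` is applied, and any key missing from
    # ``metrics`` is filled with 0. For example:
    #
    #     '{epoch}-{val_loss:.2f}'  ->  'epoch={epoch}-val_loss={val_loss:.2f}'
    #                               ->  'epoch=2-val_loss=0.12'   (with epoch=2, val_loss=0.123)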

    def format_checkpoint_name(self, epoch: int, step: int, metrics: Dict[str, Any], ver: Optional[int] = None) -> str:
        """Generate a filename according to the defined template.

        Example::

            >>> tmpdir = os.path.dirname(__file__)
            >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}')
            >>> os.path.basename(ckpt.format_checkpoint_name(0, 1, metrics={}))
            'epoch=0.ckpt'
            >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch:03d}')
            >>> os.path.basename(ckpt.format_checkpoint_name(5, 2, metrics={}))
            'epoch=005.ckpt'
            >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}-{val_loss:.2f}')
            >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456)))
            'epoch=2-val_loss=0.12.ckpt'
            >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{missing:d}')
            >>> os.path.basename(ckpt.format_checkpoint_name(0, 4, metrics={}))
            'missing=0.ckpt'
            >>> ckpt = ModelCheckpoint(filename='{step}')
            >>> os.path.basename(ckpt.format_checkpoint_name(0, 0, {}))
            'step=0.ckpt'

        """
        filename = self._format_checkpoint_name(self.filename, epoch, step, metrics, prefix=self.prefix)
        if ver is not None:
            filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}"))

        ckpt_name = f"{filename}{self.FILE_EXTENSION}"
        return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name

    def __resolve_ckpt_dir(self, trainer):
        """
        Determines model checkpoint save directory at runtime. References attributes from the
        trainer's logger to determine where to save checkpoints.
        The base path for saving weights is set in this priority:

        1.  Checkpoint callback's path (if passed in)
        2.  The default_root_dir from trainer if trainer has no logger
        3.  The weights_save_path from trainer, if user provides it
        4.  User provided weights_save_path

        The base path gets extended with logger name and version (if these are available)
        and subfolder "checkpoints".
        """
        if self.dirpath is not None:
            return  # short circuit

        if trainer.logger is not None:
            if trainer.weights_save_path != trainer.default_root_dir:
                # the user has changed weights_save_path, it overrides anything
                save_dir = trainer.weights_save_path
            else:
                save_dir = trainer.logger.save_dir or trainer.default_root_dir

            version = (
                trainer.logger.version
                if isinstance(trainer.logger.version, str) else f"version_{trainer.logger.version}"
            )
            version, name = trainer.training_type_plugin.broadcast((version, trainer.logger.name))

            ckpt_path = os.path.join(save_dir, str(name), version, "checkpoints")
        else:
            ckpt_path = os.path.join(trainer.weights_save_path, "checkpoints")

        self.dirpath = ckpt_path

        if not trainer.fast_dev_run and trainer.is_global_zero:
            self._fs.makedirs(self.dirpath, exist_ok=True)
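
    # Resulting layout sketch (comment only), following the join above when a logger is attached
    # and no custom ``dirpath`` was given; the concrete names depend on the logger in use:
    #
    #     <save_dir>/<logger name>/version_<n>/checkpoints/epoch=0-step=9.ckpt
    #
    # Without a logger, checkpoints go to <weights_save_path>/checkpoints/ instead.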

    def _add_backward_monitor_support(self, trainer):
        metrics = trainer.logger_connector.callback_metrics
        deprecation_warning = False

        if self.monitor is None and 'val_loss' in metrics:
            self.monitor = 'val_loss'
            deprecation_warning = True

        if self.save_top_k is None and self.monitor is not None:
            # TODO: Remove `Optional` from `save_top_k` when this is deleted in v1.4
            self.save_top_k = 1

        if deprecation_warning:
            warning_cache.warn(
                "Relying on `self.log('val_loss', ...)` to set the ModelCheckpoint monitor is deprecated in v1.2"
                " and will be removed in v1.4. Please, create your own `mc = ModelCheckpoint(monitor='your_monitor')`"
                " and use it as `Trainer(callbacks=[mc])`.", DeprecationWarning
            )

    def _validate_monitor_key(self, trainer):
        metrics = trainer.logger_connector.callback_metrics

        # validate metric
        if self.monitor is not None and not self._is_valid_monitor_key(metrics):
            m = (
                f"ModelCheckpoint(monitor='{self.monitor}') not found in the returned metrics:"
                f" {list(metrics.keys())}. "
                f"HINT: Did you call self.log('{self.monitor}', tensor) in the LightningModule?"
            )
            raise MisconfigurationException(m)

    def _get_metric_interpolated_filepath_name(
        self,
        ckpt_name_metrics: Dict[str, Any],
        epoch: int,
        step: int,
        trainer,
        del_filepath: Optional[str] = None,
    ) -> str:
        filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics)

        version_cnt = self.STARTING_VERSION
        while self.file_exists(filepath, trainer) and filepath != del_filepath:
            filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt)
            version_cnt += 1

        return filepath
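
    # Collision-handling sketch (comment only): with ``filename='{epoch}'``, if 'epoch=2.ckpt'
    # already exists on disk, the loop above keeps probing versioned names until a free one is
    # found:
    #
    #     epoch=2.ckpt        # exists -> try ver=1
    #     epoch=2-v1.ckpt     # exists -> try ver=2
    #     epoch=2-v2.ckpt     # free   -> returned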

    def _monitor_candidates(self, trainer):
        monitor_candidates = deepcopy(trainer.logger_connector.callback_metrics)
        monitor_candidates.update(step=trainer.global_step, epoch=trainer.current_epoch)
        return monitor_candidates
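
    # Sketch of the candidate dict built above (comment only): a copy of the logged callback
    # metrics plus the bookkeeping keys used by the filename template, e.g.
    #
    #     {'val_loss': tensor(0.3731), ..., 'step': 999, 'epoch': 4}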

    def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
        should_save_last = self.monitor is None or self.save_last
        if not should_save_last:
            return

        # when user ALSO asked for the 'last.ckpt' change the name
        if self.save_last:
            last_filepath = self._format_checkpoint_name(
                self.CHECKPOINT_NAME_LAST,
                trainer.current_epoch,
                trainer.global_step,
                ckpt_name_metrics,
                prefix=self.prefix
            )
            last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}")
        else:
            last_filepath = self._get_metric_interpolated_filepath_name(
                ckpt_name_metrics,
                trainer.current_epoch,
                trainer.global_step,
                trainer,
            )

        if trainer.training_type_plugin.rpc_enabled:
            # RPCPlugin manages saving all model states
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-12 20:48:56 +00:00
|
|
|
            trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module)
        else:
            self._save_model(last_filepath, trainer, pl_module)

        if (
            self.last_model_path and self.last_model_path != last_filepath
            and (self.save_top_k != -1 or self.save_last) and trainer.is_global_zero
        ):
            self._del_model(self.last_model_path)

        self.last_model_path = last_filepath

        if self.monitor is None:
            self.best_model_path = self.last_model_path
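
    # Illustrative usage sketch, kept as a comment so nothing executes at class definition
    # time. The branch above maintains a single rolling "last" checkpoint (typically
    # ``last.ckpt``) by deleting the previously written one. The ``my/path/`` directory is
    # an example value, not a default:
    #
    #     >>> from pytorch_lightning.callbacks import ModelCheckpoint
    #     >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/', save_last=True)
    #     >>> # after each save, `checkpoint_callback.last_model_path` points to the
    #     >>> # most recent "last" checkpoint; older ones are removed as shown above
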
    def _save_top_k_checkpoints(self, trainer, pl_module, metrics):
        current = metrics.get(self.monitor)
        epoch = metrics.get("epoch")
        step = metrics.get("step")

        # when `val_loss` is being logged and no ModelCheckpoint is provided,
        # `val_loss` will be selected as the monitor and needs to be reduced
        # to prevent the processes from diverging
        # TODO: move this logic to logger_connector. This also needs to be fixed for any
        # other monitored value which is not produced by a Metric.
        if self.monitor == "val_loss":
            current = trainer.training_type_plugin.reduce(current, reduce_op="mean")

        if self.check_monitor_top_k(current):
            self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics)
        elif self.verbose:
            rank_zero_info(f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}")
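
    # Illustrative configuration sketch (commented out; `my/path/` and `val_loss` are
    # example values). `_save_top_k_checkpoints` only persists a checkpoint when
    # `check_monitor_top_k` accepts the monitored value, so a typical top-k setup is:
    #
    #     >>> checkpoint_callback = ModelCheckpoint(
    #     ...     monitor='val_loss',
    #     ...     dirpath='my/path/',
    #     ...     filename='{epoch}-{val_loss:.2f}',
    #     ...     save_top_k=3,
    #     ...     mode='min',
    #     ... )
    #     >>> # trainer = Trainer(callbacks=[checkpoint_callback])
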
    def _is_valid_monitor_key(self, metrics):
        return self.monitor in metrics or len(metrics) == 0
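
    # For example, with `monitor="val_loss"` this returns True for
    # `{"val_loss": 0.2, "epoch": 1}`; an empty metrics dict is also treated as valid
    # so the check does not fail before any metrics have been logged.
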
    def _update_best_and_save(
        self, current: torch.Tensor, epoch: int, step: int, trainer, pl_module, ckpt_name_metrics
    ):
        k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k

        del_filepath = None
        if len(self.best_k_models) == k and k > 0:
            del_filepath = self.kth_best_model_path
            self.best_k_models.pop(del_filepath)

        # do not save nan, replace with +/- inf
        if isinstance(current, torch.Tensor) and torch.isnan(current):
            current = torch.tensor(float('inf' if self.mode == "min" else '-inf'))

        filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, trainer, del_filepath)

        # save the current score
        self.current_score = current
        self.best_k_models[filepath] = current

        if len(self.best_k_models) == k:
            # monitor dict has reached k elements
            _op = max if self.mode == "min" else min
            self.kth_best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
            self.kth_value = self.best_k_models[self.kth_best_model_path]

        _op = min if self.mode == "min" else max
        self.best_model_path = _op(self.best_k_models, key=self.best_k_models.get)
        self.best_model_score = self.best_k_models[self.best_model_path]

        if self.verbose:
            rank_zero_info(
                f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}"
                f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}'
            )
        self._save_model(filepath, trainer, pl_module)

        if del_filepath is not None and filepath != del_filepath:
            self._del_model(del_filepath)
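
    # Minimal sketch of the top-k bookkeeping above, using a plain dict in place of
    # `best_k_models` (the paths and scores are made up for illustration):
    #
    #     >>> best_k_models = {'a.ckpt': torch.tensor(0.30), 'b.ckpt': torch.tensor(0.25)}
    #     >>> mode = "min"
    #     >>> _op = max if mode == "min" else min
    #     >>> kth_best_model_path = _op(best_k_models, key=best_k_models.get)  # 'a.ckpt' (worst kept)
    #     >>> _op = min if mode == "min" else max
    #     >>> best_model_path = _op(best_k_models, key=best_k_models.get)      # 'b.ckpt' (best so far)
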
    def to_yaml(self, filepath: Optional[Union[str, Path]] = None):
        """
        Saves the `best_k_models` dict containing the checkpoint
        paths with the corresponding scores to a YAML file.
        """
        best_k = {k: v.item() for k, v in self.best_k_models.items()}
        if filepath is None:
            filepath = os.path.join(self.dirpath, "best_k_models.yaml")
        with self._fs.open(filepath, "w") as fp:
            yaml.dump(best_k, fp)
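
    # Usage sketch (commented; assumes training has already populated `best_k_models`):
    #
    #     >>> # trainer.fit(model)
    #     >>> # checkpoint_callback.to_yaml()              # writes <dirpath>/best_k_models.yaml
    #     >>> # checkpoint_callback.to_yaml('scores.yaml')  # or an explicit path
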
    def file_exists(self, filepath: Union[str, Path], trainer) -> bool:
        """
        Checks if a file exists on rank 0 and broadcasts the result to all other ranks, preventing
        the internal state from diverging between ranks.
        """
        exists = self._fs.exists(filepath)
        exists = trainer.training_type_plugin.broadcast(exists)
        return exists
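
    # Sketch of why the broadcast matters (commented illustration, not executable here):
    # every rank must arrive at the same answer, otherwise one rank could e.g. raise on an
    # existing file while the others continue, leaving collective ops waiting forever.
    #
    #     >>> # exists = self._fs.exists(filepath)            # may differ across ranks
    #     >>> # exists = self.file_exists(filepath, trainer)  # rank 0's answer, on every rank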