# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Weights and Biases Logger
-------------------------
"""
import operator
import os
from argparse import Namespace
from pathlib import Path
from typing import Any, Dict, Optional, Union
from weakref import ReferenceType

import torch.nn as nn

from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
from pytorch_lightning.utilities import _module_available, rank_zero_only
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _compare_version
from pytorch_lightning.utilities.warnings import rank_zero_warn

_WANDB_AVAILABLE = _module_available("wandb")
_WANDB_GREATER_EQUAL_0_10_22 = _compare_version("wandb", operator.ge, "0.10.22")

try:
    import wandb
    from wandb.wandb_run import Run
except ImportError:
    # needed for test mocks, these tests shall be updated
    wandb, Run = None, None


class WandbLogger(LightningLoggerBase):
    r"""
    Log using `Weights and Biases <https://docs.wandb.ai/integrations/lightning>`_.

    Install it with pip:

    .. code-block:: bash

        pip install wandb

    Args:
        name: Display name for the run.
        save_dir: Path where data is saved (wandb dir by default).
        offline: Run offline (data can be streamed later to wandb servers).
        id: Sets the version, mainly used to resume a previous run.
        version: Same as id.
        anonymous: Enables or explicitly disables anonymous logging.
        project: The name of the project to which this run will belong.
        log_model: Log checkpoints created by :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint`
            as W&B artifacts.

            * if ``log_model == 'all'``, checkpoints are logged during training.
            * if ``log_model == True``, checkpoints are logged at the end of training, except when
              :paramref:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint.save_top_k` ``== -1``
              which also logs every checkpoint during training.
            * if ``log_model == False`` (default), no checkpoint is logged.

        prefix: A string to put at the beginning of metric keys.
        experiment: WandB experiment object. Automatically set when creating a run.
        \**kwargs: Arguments passed to :func:`wandb.init` like `entity`, `group`, `tags`, etc.

    Raises:
        ImportError:
            If required WandB package is not installed on the device.
        MisconfigurationException:
            If both ``log_model`` and ``offline`` are set to ``True``.

    Example::

        from pytorch_lightning.loggers import WandbLogger
        from pytorch_lightning import Trainer

        # instrument experiment with W&B
        wandb_logger = WandbLogger(project='MNIST', log_model='all')
        trainer = Trainer(logger=wandb_logger)

        # log gradients and model topology
        wandb_logger.watch(model)

    See Also:
        - `Demo in Google Colab <http://wandb.me/lightning>`__ with model logging
        - `W&B Documentation <https://docs.wandb.ai/integrations/lightning>`__

    """
    LOGGER_JOIN_CHAR = "-"

    def __init__(
        self,
        name: Optional[str] = None,
        save_dir: Optional[str] = None,
        offline: Optional[bool] = False,
        id: Optional[str] = None,
        anonymous: Optional[bool] = None,
        version: Optional[str] = None,
        project: Optional[str] = None,
        log_model: Optional[bool] = False,
        experiment=None,
        prefix: Optional[str] = "",
        **kwargs,
    ):
        if wandb is None:
            raise ImportError(
                "You want to use `wandb` logger which is not installed yet,"
                " install it with `pip install wandb`."  # pragma: no-cover
            )

        if offline and log_model:
            raise MisconfigurationException(
                f"Providing log_model={log_model} and offline={offline} is an invalid configuration"
                " since model checkpoints cannot be uploaded in offline mode.\n"
                "Hint: Set `offline=False` to log your model."
            )

        if log_model and not _WANDB_GREATER_EQUAL_0_10_22:
            rank_zero_warn(
                f"Providing log_model={log_model} requires wandb version >= 0.10.22"
                " for logging associated model metadata.\n"
                "Hint: Upgrade with `pip install --upgrade wandb`."
            )

        super().__init__()
        self._offline = offline
        self._log_model = log_model
        self._prefix = prefix
        self._experiment = experiment
        self._logged_model_time = {}
        self._checkpoint_callback = None
        # set wandb init arguments
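        # wandb expects `anonymous="allow"` rather than a boolean, so map
        # True -> "allow" and pass any other value through unchanged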
        anonymous_lut = {True: "allow", False: None}
        self._wandb_init = dict(
            name=name,
            project=project,
            id=version or id,
            dir=save_dir,
            resume="allow",
            anonymous=anonymous_lut.get(anonymous, anonymous),
        )
        self._wandb_init.update(**kwargs)
        # extract parameters
        self._save_dir = self._wandb_init.get("dir")
        self._name = self._wandb_init.get("name")
        self._id = self._wandb_init.get("id")
2020-01-14 03:25:27 +00:00
|
|
|
|
|
|
|
def __getstate__(self):
|
|
|
|
state = self.__dict__.copy()
|
2020-04-03 19:03:00 +00:00
|
|
|
# args needed to reload correct experiment
|
2021-07-26 11:37:35 +00:00
|
|
|
state["_id"] = self._experiment.id if self._experiment is not None else None
|
2020-04-03 19:03:00 +00:00
|
|
|
|
2020-01-14 03:25:27 +00:00
|
|
|
# cannot be pickled
|
2021-07-26 11:37:35 +00:00
|
|
|
state["_experiment"] = None
|
2020-01-14 03:25:27 +00:00
|
|
|
return state

    @property
    @rank_zero_experiment
    def experiment(self) -> Run:
        r"""
        Actual wandb object. To use wandb features in your
        :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.

        Example::

            self.logger.experiment.some_wandb_function()

        """
        if self._experiment is None:
            if self._offline:
                os.environ["WANDB_MODE"] = "dryrun"
            if wandb.run is None:
                self._experiment = wandb.init(**self._wandb_init)
            else:
                rank_zero_warn(
                    "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"
                    " this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`."
                )
                self._experiment = wandb.run

            # define default x-axis (for latest wandb versions)
            if getattr(self._experiment, "define_metric", None):
                self._experiment.define_metric("trainer/global_step")
                self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True)

        return self._experiment

    def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True):
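        # thin wrapper around `wandb.watch`: hooks the model to record gradient
        # and/or parameter histograms every `log_freq` batches and, with
        # `log_graph=True`, the model topology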
        self.experiment.watch(model, log=log, log_freq=log_freq, log_graph=log_graph)

    @rank_zero_only
    def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None:
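        # normalize hyperparameters before writing them to the run config:
        # convert a Namespace to a dict, flatten nested dicts, and replace
        # callables with a readable representation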
        params = self._convert_params(params)
        params = self._flatten_dict(params)
        params = self._sanitize_callable_params(params)
        self.experiment.config.update(params, allow_val_change=True)

    @rank_zero_only
    def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None:
        assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0"

        metrics = self._add_prefix(metrics)
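        # log the trainer step under `trainer/global_step` so it matches the
        # default x-axis configured with `define_metric` in `experiment`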
        if step is not None:
            self.experiment.log({**metrics, "trainer/global_step": step})
        else:
            self.experiment.log(metrics)

    @property
    def save_dir(self) -> Optional[str]:
        """
        Gets the save directory.

        Returns:
            The path to the save directory.
        """
        return self._save_dir

    @property
    def name(self) -> Optional[str]:
        """
        Gets the name of the experiment.

        Returns:
            The name of the experiment if the experiment exists else the name given to the constructor.
        """
        # don't create an experiment if we don't have one
        return self._experiment.project_name() if self._experiment else self._name

    @property
    def version(self) -> Optional[str]:
        """
        Gets the id of the experiment.

        Returns:
            The id of the experiment if the experiment exists else the id given to the constructor.
        """
        # don't create an experiment if we don't have one
        return self._experiment.id if self._experiment else self._id

    def after_save_checkpoint(self, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> None:
        # log checkpoints as artifacts
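        # with `log_model="all"` (or `log_model=True` and an unbounded `save_top_k`)
        # checkpoints are uploaded as soon as they are written; with plain
        # `log_model=True` the callback is stashed and uploaded once in `finalize`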
        if self._log_model == "all" or (self._log_model is True and checkpoint_callback.save_top_k == -1):
            self._scan_and_log_checkpoints(checkpoint_callback)
        elif self._log_model is True:
            self._checkpoint_callback = checkpoint_callback

    @rank_zero_only
    def finalize(self, status: str) -> None:
        # log checkpoints as artifacts
        if self._checkpoint_callback:
            self._scan_and_log_checkpoints(self._checkpoint_callback)

    def _scan_and_log_checkpoints(self, checkpoint_callback: "ReferenceType[ModelCheckpoint]") -> None:
        # get checkpoints to be saved with associated score
        checkpoints = {
            checkpoint_callback.last_model_path: checkpoint_callback.current_score,
            checkpoint_callback.best_model_path: checkpoint_callback.best_model_score,
            **checkpoint_callback.best_k_models,
        }
        checkpoints = sorted((Path(p).stat().st_mtime, p, s) for p, s in checkpoints.items() if Path(p).is_file())
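        # keep only checkpoints that were not logged before, or whose file was
        # rewritten (same path, newer mtime) since the last upload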
        checkpoints = [
            c for c in checkpoints if c[1] not in self._logged_model_time.keys() or self._logged_model_time[c[1]] < c[0]
        ]

        # log iteratively all new checkpoints
        for t, p, s in checkpoints:
            metadata = (
                {
                    "score": s,
                    "original_filename": Path(p).name,
                    "ModelCheckpoint": {
                        k: getattr(checkpoint_callback, k)
                        for k in [
                            "monitor",
                            "mode",
                            "save_last",
                            "save_top_k",
                            "save_weights_only",
                            "_every_n_train_steps",
                            "_every_n_val_epochs",
                        ]
                        # ensure it does not break if `ModelCheckpoint` args change
                        if hasattr(checkpoint_callback, k)
                    },
                }
                if _WANDB_GREATER_EQUAL_0_10_22
                else None
            )
            artifact = wandb.Artifact(name=f"model-{self.experiment.id}", type="model", metadata=metadata)
            artifact.add_file(p, name="model.ckpt")
            aliases = ["latest", "best"] if p == checkpoint_callback.best_model_path else ["latest"]
            self.experiment.log_artifact(artifact, aliases=aliases)
            # remember logged models - timestamp needed in case filename didn't change (last.ckpt or custom name)
            self._logged_model_time[p] = t