From 854d1662907fac135381d42abf9a642a779d4e46 Mon Sep 17 00:00:00 2001 From: "Antonios P. Sarikas" <110358278+adosar@users.noreply.github.com> Date: Sun, 4 Aug 2024 03:56:25 +0300 Subject: [PATCH] Docs: Add note about version counter in `ModelCheckpoint` (#20146) --- .../source-pytorch/common/checkpointing_intermediate.rst | 9 +++++++++ src/lightning/pytorch/callbacks/model_checkpoint.py | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst index ec124d4e6e..81147d11ad 100644 --- a/docs/source-pytorch/common/checkpointing_intermediate.rst +++ b/docs/source-pytorch/common/checkpointing_intermediate.rst @@ -83,6 +83,15 @@ Which filename="sample-mnist-{epoch:02d}-{global_step}", ) + .. note:: + + It is recommended that you pass formatting options to ``filename`` to include the monitored metric as shown + in the example above. Otherwise, if ``save_top_k >= 2`` and ``enable_version_counter=True`` (default), a + version is appended to the ``filename`` to prevent filename collisions. You should not rely on the appended + version to retrieve the top-k model, since there is no relationship between version count and model performance. + For example, ``filename-v2.ckpt`` doesn't necessarily correspond to the top-2 model. + + - You can customize the checkpointing behavior to monitor any quantity of your training or validation steps. For example, if you want to update your checkpoints based on your validation loss: | diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index ba3014274b..9587da0f46 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -94,7 +94,9 @@ class ModelCheckpoint(Checkpoint): Please note that the monitors are checked every ``every_n_epochs`` epochs. 
If ``save_top_k >= 2`` and the callback is called multiple times inside an epoch, and the filename remains unchanged, the name of the saved file will be appended with a version count starting with ``v1`` to avoid - collisions unless ``enable_version_counter`` is set to False. + collisions unless ``enable_version_counter`` is set to False. The version counter is unrelated to the top-k + ranking of the checkpoint, and we recommend formatting the filename to include the monitored metric to avoid + collisions. mode: one of {min, max}. If ``save_top_k != 0``, the decision to overwrite the current save file is made based on either the maximization or the minimization of the monitored quantity.