.. testsetup:: *

    import os
    from pytorch_lightning.trainer.trainer import Trainer
    from pytorch_lightning.core.lightning import LightningModule

.. _weights-loading:

Saving and loading weights
==========================

Lightning can automate saving and loading checkpoints.

Checkpoint saving
-----------------

A Lightning checkpoint has everything needed to restore a training session, including the state listed below (see the inspection sketch after this list):

- 16-bit scaling factor (Apex)
- Current epoch
- Global step
- Model state_dict
- State of all optimizers
- State of all learning rate schedulers
- State of all callbacks
- The hyperparameters used for that model, if passed in as hparams (argparse.Namespace)
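
Under the hood, a checkpoint is an ordinary dictionary serialized with ``torch.save``, so you can open it directly to see what was stored. A minimal sketch (the exact key names, such as ``epoch``, ``global_step`` or ``state_dict``, can vary between Lightning versions, and ``my_checkpoint.ckpt`` is a placeholder path):

.. code-block:: python

    import torch

    # a checkpoint file is a plain dictionary saved with torch.save
    checkpoint = torch.load('my_checkpoint.ckpt', map_location='cpu')

    # list the top-level entries (epoch, global step, state_dict, optimizer states, ...)
    print(checkpoint.keys())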

Automatic saving
^^^^^^^^^^^^^^^^

Checkpointing is enabled by default and saves to the current working directory.
To change the checkpoint path, pass in:

.. code-block:: python

    trainer = Trainer(default_root_dir='/your/path/to/save/checkpoints')

To modify the behavior of checkpointing, pass in your own callback.

.. code-block:: python

    from pytorch_lightning.callbacks import ModelCheckpoint

    # DEFAULTS used by the Trainer
    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd(),
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix=''
    )

    trainer = Trainer(checkpoint_callback=checkpoint_callback)
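
For example, if you log a ``val_loss`` metric, you could keep the three best checkpoints instead of only the latest one. A short sketch (the ``filepath`` format string and the ``my/path`` directory are only illustrative):

.. code-block:: python

    from pytorch_lightning.callbacks import ModelCheckpoint

    # keep the 3 checkpoints with the lowest val_loss seen so far,
    # naming each saved file after the epoch and the metric value
    checkpoint_callback = ModelCheckpoint(
        filepath='my/path/{epoch}-{val_loss:.2f}',
        save_top_k=3,
        monitor='val_loss',
        mode='min',
    )

    trainer = Trainer(checkpoint_callback=checkpoint_callback)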

Or disable checkpointing altogether by passing:

.. testcode::

    trainer = Trainer(checkpoint_callback=False)

The Lightning checkpoint also saves the arguments passed into the LightningModule init
under the `module_arguments` key in the checkpoint.

.. code-block:: python

    class MyLightningModule(LightningModule):

        def __init__(self, learning_rate, *args, **kwargs):
            super().__init__()

    # all init args were saved to the checkpoint
    checkpoint = torch.load(CKPT_PATH)
    print(checkpoint['module_arguments'])
    # {'learning_rate': the_value}

Manual saving
^^^^^^^^^^^^^

You can manually save checkpoints and restore your model from the checkpointed state.

.. code-block:: python

    model = MyLightningModule(hparams)
    trainer.fit(model)
    trainer.save_checkpoint("example.ckpt")
    new_model = MyLightningModule.load_from_checkpoint(checkpoint_path="example.ckpt")
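
If you only need the weights rather than the full training state, note that a ``LightningModule`` is a regular ``torch.nn.Module``, so plain PyTorch serialization works as well. A minimal sketch (``weights_only.pt`` is just an illustrative filename):

.. code-block:: python

    # save and reload only the model weights with plain PyTorch
    torch.save(model.state_dict(), "weights_only.pt")

    new_model = MyLightningModule(hparams)
    new_model.load_state_dict(torch.load("weights_only.pt"))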

Checkpoint Loading
------------------

To load a model along with its weights, biases and `module_arguments`, use the following method:

.. code-block:: python

    model = MyLightningModule.load_from_checkpoint(PATH)

    print(model.learning_rate)
    # prints the learning_rate you used in this checkpoint

    model.eval()
    y_hat = model(x)
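
``load_from_checkpoint`` also accepts a ``map_location`` argument (forwarded to ``torch.load``), which is handy when a checkpoint saved on a GPU machine has to be loaded on a CPU-only one. A short sketch:

.. code-block:: python

    # load a GPU-trained checkpoint onto the CPU
    model = MyLightningModule.load_from_checkpoint(PATH, map_location='cpu')
    model.eval()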

But if you don't want to use the values saved in the checkpoint, pass in your own here:

.. testcode::

    class LitModel(LightningModule):

        def __init__(self, in_dim, out_dim):
            super().__init__()
            self.save_hyperparameters()
            self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)

You can restore the model like this:

.. code-block:: python

    # if you train and save the model like this, it will use these values when loading
    # the weights. But you can overwrite this
    LitModel(in_dim=32, out_dim=10)

    # uses in_dim=32, out_dim=10
    model = LitModel.load_from_checkpoint(PATH)

    # uses in_dim=128, out_dim=10
    model = LitModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10)

Restoring Training State
------------------------

If you don't just want to load weights, but instead restore the full training state,
do the following:

.. code-block:: python

    model = LitModel()
    trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')

    # automatically restores model, epoch, step, LR schedulers, apex, etc...
    trainer.fit(model)