lightning/tests/checkpointing/test_trainer_checkpoint.py

# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from copy import deepcopy

import torch

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from tests.helpers import BoringModel


def test_finetuning_with_resume_from_checkpoint(tmpdir):
    """
    Check that ``best_model_path`` keeps pointing at the expected checkpoint when training is resumed repeatedly.

    A single epoch is trained with an explicit ``ModelCheckpoint``. Training is then resumed three times, each
    time from the most recently recorded best checkpoint and with a larger ``max_epochs``; ``trainer.test()`` is
    called after every resumed fit. Finally the recorded checkpoint paths are checked against the expected epochs.
    """

    class ExtendedBoringModel(BoringModel):
        def configure_optimizers(self):
            sgd = torch.optim.SGD(self.layer.parameters(), lr=0.001)
            step_lr = torch.optim.lr_scheduler.StepLR(sgd, step_size=1)
            return [sgd], [step_lr]

        def validation_step(self, batch, batch_idx):
            # log the metric that the first ModelCheckpoint monitors
            val_loss = self.loss(batch, self.layer(batch))
            self.log("val_loss", val_loss, on_epoch=True, prog_bar=True)

    top_k_ckpt = ModelCheckpoint(monitor="val_loss", dirpath=tmpdir, filename="{epoch:02d}", save_top_k=-1)

    model = ExtendedBoringModel()
    model.validation_epoch_end = None

    # initial run: one epoch, explicit checkpoint callback, no logger so only the checkpoint lands in tmpdir
    first_fit_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=12,
        limit_val_batches=6,
        limit_test_batches=12,
        callbacks=[top_k_ckpt],
        logger=False,
    )
    trainer = Trainer(**first_fit_kwargs)
    trainer.fit(model)
    assert os.listdir(tmpdir) == ["epoch=00.ckpt"]

    # settings shared by every resumed run below
    resume_kwargs = dict(
        default_root_dir=tmpdir,
        limit_train_batches=12,
        limit_val_batches=12,
        limit_test_batches=12,
        progress_bar_refresh_rate=0,
    )

    best_model_paths = [top_k_ckpt.best_model_path]
    # metrics are collected after each resumed run; this test does not assert on them
    results = []

    for max_epochs in (3, 4, 5):
        # resume from the best checkpoint recorded by the previous run
        trainer = pl.Trainer(max_epochs=max_epochs, resume_from_checkpoint=best_model_paths[-1], **resume_kwargs)
        trainer.fit(model)
        trainer.test()
        results.append(deepcopy(trainer.callback_metrics))
        best_model_paths.append(trainer.checkpoint_callback.best_model_path)

    # the first entry comes from the explicit callback, the remaining ones from the resumed trainers
    initial_path, *resumed_paths = best_model_paths
    assert initial_path.endswith("epoch=00.ckpt")
    for position, resumed_path in enumerate(resumed_paths, start=1):
        assert f"epoch={position + 1}" in resumed_path


def test_accumulated_gradient_batches_with_resume_from_checkpoint(tmpdir):
    """
    This test validates that accumulated gradient is properly recomputed and reset on the trainer.

    A model is fitted for one epoch with an ``accumulate_grad_batches`` schedule and ``save_last=True``, then a
    second trainer resumes from the saved last checkpoint with ``max_epochs=2``. The test makes no explicit
    assertions: it passes when both ``fit`` calls complete without raising.
    """

    ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True)
    model = BoringModel()
    trainer_kwargs = dict(
        # keep trainer output inside the pytest temporary directory instead of the trainer's default location
        default_root_dir=tmpdir,
        max_epochs=1,
        accumulate_grad_batches={0: 2},
        callbacks=ckpt,
        limit_train_batches=1,
        limit_val_batches=0,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    # second run: same settings, one more epoch, restored from the last checkpoint written by the first run
    trainer_kwargs["max_epochs"] = 2
    trainer_kwargs["resume_from_checkpoint"] = ckpt.last_model_path
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00			`# Copyright The PyTorch Lightning team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import os`
Fix pre-commit isort failure on tests/checkpointing/.py (#5427) Remove tests.checkpointing from skipped module in pyproject.toml * Fix pre-commit isort failure on tests/checkpointing/*.py 2021-01-12 08:31:51 +00:00			`from copy import deepcopy`
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00
			`import torch`

			`import pytorch_lightning as pl`
3/n inter batch parallelism (#9052) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholi <carlossmocholi@gmail.com> 2021-08-24 18:45:54 +00:00			`from pytorch_lightning import Trainer`
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00			`from pytorch_lightning.callbacks import ModelCheckpoint`
fix miss-leading imports in tests (#5873) * fix imorts * . 2021-02-09 10:10:52 +00:00			`from tests.helpers import BoringModel`
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00

			`def test_finetuning_with_resume_from_checkpoint(tmpdir):`
			`"""`
			`This test validates that generated ModelCheckpoint is pointing to the right best_model_path during test`
			`"""`

Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`checkpoint_callback = ModelCheckpoint(monitor="val_loss", dirpath=tmpdir, filename="{epoch:02d}", save_top_k=-1)`
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00
			`class ExtendedBoringModel(BoringModel):`
			`def configure_optimizers(self):`
			`optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001)`
			`lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)`
			`return [optimizer], [lr_scheduler]`

			`def validation_step(self, batch, batch_idx):`
			`output = self.layer(batch)`
			`loss = self.loss(batch, output)`
			`self.log("val_loss", loss, on_epoch=True, prog_bar=True)`

			`model = ExtendedBoringModel()`
			`model.validation_epoch_end = None`
			`trainer = Trainer(`
			`default_root_dir=tmpdir,`
			`max_epochs=1,`
			`limit_train_batches=12,`
			`limit_val_batches=6,`
			`limit_test_batches=12,`
			`callbacks=[checkpoint_callback],`
			`logger=False,`
			`)`
			`trainer.fit(model)`
Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`assert os.listdir(tmpdir) == ["epoch=00.ckpt"]`
[bug-fix] Trainer.test points to latest best_model_path (#5161) * resolve bug * update code * add set -e * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update test * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> * Update tests/checkpointing/test_trainer_checkpoint.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * update on comments * resolve test * convert to set * update * add error triggering * update * update on comments * update * resolve import * update * update * Update pytorch_lightning/plugins/rpc_plugin.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> (cherry picked from commit d5b367871fa3924090ec74bf903bd172bd3e2343) 2021-01-05 10:01:59 +00:00
			`best_model_paths = [checkpoint_callback.best_model_path]`
			`results = []`

			`for idx in range(3, 6):`
			`# load from checkpoint`
			`trainer = pl.Trainer(`
			`default_root_dir=tmpdir,`
			`max_epochs=idx,`
			`limit_train_batches=12,`
			`limit_val_batches=12,`
			`limit_test_batches=12,`
			`resume_from_checkpoint=best_model_paths[-1],`
			`progress_bar_refresh_rate=0,`
			`)`
			`trainer.fit(model)`
			`trainer.test()`
			`results.append(deepcopy(trainer.callback_metrics))`
			`best_model_paths.append(trainer.checkpoint_callback.best_model_path)`

			`for idx, best_model_path in enumerate(best_model_paths):`
			`if idx == 0:`
			`assert best_model_path.endswith(f"epoch=0{idx}.ckpt")`
			`else:`
			`assert f"epoch={idx + 1}" in best_model_path`
[bugfix] Re-compute accumulated_grad_batches (#8493) * resolve resolution * update changelog * typo * optimize test * update on comments * resolve comments * update 2021-07-21 10:46:25 +00:00

			`def test_accumulated_gradient_batches_with_resume_from_checkpoint(tmpdir):`
			`"""`
			`This test validates that accumulated gradient is properly recomputed and reset on the trainer.`
			`"""`

fix restoring finetune callbacks after accelerator setup on training resume (#8501) Co-authored-by: tchaton <thomas@grid.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-23 17:49:32 +00:00			`ckpt = ModelCheckpoint(dirpath=tmpdir, save_last=True)`
[bugfix] Re-compute accumulated_grad_batches (#8493) * resolve resolution * update changelog * typo * optimize test * update on comments * resolve comments * update 2021-07-21 10:46:25 +00:00			`model = BoringModel()`
			`trainer_kwargs = dict(`
fix restoring finetune callbacks after accelerator setup on training resume (#8501) Co-authored-by: tchaton <thomas@grid.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-23 17:49:32 +00:00			`max_epochs=1, accumulate_grad_batches={0: 2}, callbacks=ckpt, limit_train_batches=1, limit_val_batches=0`
[bugfix] Re-compute accumulated_grad_batches (#8493) * resolve resolution * update changelog * typo * optimize test * update on comments * resolve comments * update 2021-07-21 10:46:25 +00:00			`)`
			`trainer = Trainer(**trainer_kwargs)`
			`trainer.fit(model)`

Replace `yapf` with `black` (#7783) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2021-07-26 11:37:35 +00:00			`trainer_kwargs["max_epochs"] = 2`
			`trainer_kwargs["resume_from_checkpoint"] = ckpt.last_model_path`
[bugfix] Re-compute accumulated_grad_batches (#8493) * resolve resolution * update changelog * typo * optimize test * update on comments * resolve comments * update 2021-07-21 10:46:25 +00:00			`trainer = Trainer(**trainer_kwargs)`
			`trainer.fit(model)`