[bug-fix] Trainer.test points to latest best_model_path (#5161)

* resolve bug

* update code

* add set -e

* Update pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>

* update test

* Update tests/checkpointing/test_trainer_checkpoint.py

Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>

* Update tests/checkpointing/test_trainer_checkpoint.py

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>

* update on comments

* resolve test

* convert to set

* update

* add error triggering

* update

* update on comments

* update

* resolve import

* update

* update

* Update pytorch_lightning/plugins/rpc_plugin.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* update

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-62-109.ec2.internal>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

(cherry picked from commit d5b367871f)
chaton authored 2021-01-05 11:01:59 +01:00, committed by Jirka Borovec
parent 704e00ee7f
commit 56437e98a6
10 changed files with 111 additions and 12 deletions
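
In short: when a `Trainer` is constructed with `resume_from_checkpoint` and then used only for testing, the restore step is now skipped, so `trainer.test()` picks up the `best_model_path` produced by the current run instead of stale state from the resumed checkpoint. A hedged sketch of the intended flow, mirroring the new test added below (paths and epoch counts illustrative):

    # Sketch only; see tests/checkpointing/test_trainer_checkpoint.py below.
    trainer = pl.Trainer(max_epochs=idx, resume_from_checkpoint=best_model_paths[-1])
    trainer.fit(model)   # the checkpoint is restored here, then training continues
    trainer.test()       # evaluates this run's checkpoint_callback.best_model_path
    best_model_paths.append(trainer.checkpoint_callback.best_model_path)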

.drone.yml

@@ -30,6 +30,7 @@ steps:
     MKL_THREADING_LAYER: GNU
   commands:
+    - set -e
     - python --version
     - pip --version
     - nvidia-smi

CHANGELOG.md

@@ -76,12 +76,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Fixed

+- Skip restore from `resume_from_checkpoint` while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161))
 - Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333))
 - Disabled checkpointing, earlystopping and logger with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277))

 ## [1.1.2] - 2020-12-23

 ### Added

pytorch_lightning/callbacks/model_checkpoint.py

@@ -199,6 +199,7 @@ class ModelCheckpoint(Callback):
             "best_model_score": self.best_model_score,
             "best_model_path": self.best_model_path,
             "current_score": self.current_score,
+            "dirpath": self.dirpath
         }

     def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]):

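Persisting `dirpath` alongside `best_model_path` lets a resumed run check whether the restored `ModelCheckpoint` was writing into the same directory it is using now. A minimal sketch of where the value lands, assuming the 1.1.x checkpoint layout in which callback states are keyed by callback class (the path is illustrative):

    import torch
    from pytorch_lightning.callbacks import ModelCheckpoint

    ckpt = torch.load("epoch=00.ckpt")             # hypothetical checkpoint file
    cb_state = ckpt["callbacks"][ModelCheckpoint]  # on_save_checkpoint() output
    print(cb_state["best_model_path"], cb_state["dirpath"])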
pytorch_lightning/plugins/rpc_plugin.py

@@ -18,10 +18,13 @@ import torch

 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
-from pytorch_lightning.utilities import _RPC_AVAILABLE
+from pytorch_lightning.utilities import _RPC_AVAILABLE, _module_available

+DEFAULT_RPC_TIMEOUT_SEC = 60.
 if _RPC_AVAILABLE:
     from torch.distributed import rpc
+    if _module_available("torch.distributed.rpc.constants") and hasattr(torch.distributed.rpc.constants, "DEFAULT_RPC_TIMEOUT_SEC"):
+        from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC


 class RPCPlugin(DDPPlugin):
@@ -33,7 +36,8 @@ class RPCPlugin(DDPPlugin):
     that need to be addressed when using RPC communication when building custom RPC Plugins.
     """

-    def __init__(self, **kwargs):
+    def __init__(self, rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, **kwargs):
+        self.rpc_timeout_sec = rpc_timeout_sec
         self.rpc_initialized = False
         super().__init__(**kwargs)
@@ -42,6 +46,7 @@ class RPCPlugin(DDPPlugin):
                   world_size: int) -> None:
         os.environ['MASTER_PORT'] = os.getenv('RPC_MASTER_PORT', '15000')
         rpc.init_rpc(f"worker{global_rank}", rank=global_rank, world_size=world_size)
+        rpc._set_rpc_timeout(self.rpc_timeout_sec)
         self.rpc_initialized = True

     def rpc_save_model(self,

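The new `rpc_timeout_sec` argument lets callers raise torch's RPC timeout at construction time instead of patching it after `init_rpc`; the `DDPSequentialPlugin` test further down passes `rpc_timeout_sec=5 * 60` through the same path. A minimal usage sketch (the five-minute value is illustrative):

    from pytorch_lightning.plugins.rpc_plugin import RPCPlugin

    # Allow slow workers five minutes before RPC calls time out.
    plugin = RPCPlugin(rpc_timeout_sec=5 * 60)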
pytorch_lightning/trainer/connectors/checkpoint_connector.py

@@ -20,6 +20,7 @@ from typing import Optional, Union
 import torch

 import pytorch_lightning
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType, _OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
@@ -62,7 +63,7 @@ class CheckpointConnector:
             rank_zero_info(f'restored hpc model from: {checkpoint_path}')

         # 2. Attempt to restore states from `resume_from_checkpoint` file
-        elif self.trainer.resume_from_checkpoint is not None:
+        elif self.trainer.resume_from_checkpoint is not None and not self.trainer.testing:
             self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu)

         # wait for all to catch up

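The added `and not self.trainer.testing` guard is the heart of the fix: `restore_weights` leaves training state untouched while testing, and `resume_from_checkpoint` is consumed on the next `.fit()` call instead. The updated assertions in `test_checkpoint_repeated_strategy_extended` below spell out the resulting behavior:

    # Taken from the test changes further down in this diff.
    trainer.test(model)
    assert trainer.global_step == 0    # restore was skipped while testing
    assert trainer.current_epoch == 0
    trainer.fit(model)                 # resume_from_checkpoint applies here
    assert trainer.global_step == epochs * limit_train_batches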
pytorch_lightning/trainer/training_loop.py

@@ -184,7 +184,7 @@ class TrainLoop:
         # if cluster resets state, the model will update with the saved weights
         self.trainer.model = model

-        # restore training and model before hpc is called
+        # restore training state and model weights before hpc is called
         self.trainer.checkpoint_connector.restore_weights(model)

         # on pretrain routine end

tests/checkpointing/test_model_checkpoint.py

@@ -11,20 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from argparse import Namespace
 import os
-from pathlib import Path
 import pickle
 import platform
 import re
+from argparse import Namespace
+from pathlib import Path
 from unittest import mock
 from unittest.mock import Mock

 import cloudpickle
-from omegaconf import Container, OmegaConf
 import pytest
 import torch
 import yaml
+from omegaconf import Container, OmegaConf

 import pytorch_lightning as pl
-import tests.base.develop_utils as tutils
@@ -34,6 +34,7 @@ from pytorch_lightning.loggers import TensorBoardLogger
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import BoringModel
+import tests.base.develop_utils as tutils


 class LogInTwoMethods(BoringModel):
@@ -745,9 +746,9 @@ def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir):
     model = ExtendedBoringModel()
     trainer.test(model)
     assert not trainer.checkpoint_connector.has_trained
-    assert trainer.global_step == epochs * limit_train_batches
-    assert trainer.current_epoch == epochs
-
+    # resume_from_checkpoint is resumed when calling `.fit`
+    assert trainer.global_step == 0
+    assert trainer.current_epoch == 0
     trainer.fit(model)
     assert not trainer.checkpoint_connector.has_trained
     assert trainer.global_step == epochs * limit_train_batches

tests/checkpointing/test_trainer_checkpoint.py (new file)

@@ -0,0 +1,87 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from copy import deepcopy
+import os
+
+import torch
+
+import pytorch_lightning as pl
+from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning.utilities.cloud_io import load as pl_load
+from tests.base import BoringModel
+
+
+def test_finetuning_with_resume_from_checkpoint(tmpdir):
+    """
+    This test validates that generated ModelCheckpoint is pointing to the right best_model_path during test
+    """
+    seed_everything(3)
+
+    checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=tmpdir, filename="{epoch:02d}", save_top_k=-1)
+
+    class ExtendedBoringModel(BoringModel):
+
+        def configure_optimizers(self):
+            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001)
+            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
+            return [optimizer], [lr_scheduler]
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log("val_loss", loss, on_epoch=True, prog_bar=True)
+
+    model = ExtendedBoringModel()
+    model.validation_epoch_end = None
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=12,
+        limit_val_batches=6,
+        limit_test_batches=12,
+        callbacks=[checkpoint_callback],
+        logger=False,
+    )
+    trainer.fit(model)
+    assert os.listdir(tmpdir) == ['epoch=00.ckpt']
+
+    best_model_paths = [checkpoint_callback.best_model_path]
+    results = []
+
+    for idx in range(3, 6):
+        # load from checkpoint
+        trainer = pl.Trainer(
+            default_root_dir=tmpdir,
+            max_epochs=idx,
+            limit_train_batches=12,
+            limit_val_batches=12,
+            limit_test_batches=12,
+            resume_from_checkpoint=best_model_paths[-1],
+            progress_bar_refresh_rate=0,
+        )
+        trainer.fit(model)
+        trainer.test()
+        results.append(deepcopy(trainer.callback_metrics))
+        best_model_paths.append(trainer.checkpoint_callback.best_model_path)
+
+    for idx in range(len(results) - 1):
+        assert results[idx]["val_loss"] > results[idx + 1]["val_loss"]
+
+    for idx, best_model_path in enumerate(best_model_paths):
+        if idx == 0:
+            assert best_model_path.endswith(f"epoch=0{idx}.ckpt")
+        else:
+            assert f"epoch={idx + 1}" in best_model_path

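For reference, the progression those last asserts encode, given `max_epochs` of 3, 4, 5 and a `val_loss` that keeps decreasing under the fixed seed:

    # best_model_paths, one entry per run:
    #   idx 0: initial fit, 1 epoch              -> ".../epoch=00.ckpt"
    #   idx 1: resume from epoch 0, max_epochs=3 -> best model from epoch 2
    #   idx 2: resume from epoch 2, max_epochs=4 -> best model from epoch 3
    #   idx 3: resume from epoch 3, max_epochs=5 -> best model from epoch 4
    # hence f"epoch={idx + 1}" appears in best_model_path for every idx >= 1.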
tests/plugins/test_ddp_sequential_plugin.py

@@ -47,7 +47,8 @@ def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None):
         limit_test_batches=2,
         gpus=2,
         distributed_backend="ddp",
-        plugins=[DDPSequentialPlugin(balance=[2, 1])],
+        plugins=[DDPSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)],
+        enable_pl_optimizer=True,
     )

     trainer.fit(model)

tests/special_tests.sh

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Running special tests
+set -e
 export PL_RUNNING_SPECIAL_TESTS=1
 DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no"
 python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp