Make model test more robust (#18043)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Adrian Wälchli 2023-07-10 22:36:18 +02:00 committed by GitHub
parent 69d7cfe5d8
commit a97c559d92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 23 deletions

View File

@ -59,14 +59,18 @@ def run_model_test(
logger = get_default_logger(save_dir, version=version)
trainer_options.update(logger=logger)
trainer = Trainer(**trainer_options)
initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
with torch.no_grad():
initial_values = torch.cat([x.view(-1) for x in model.parameters()])
trainer.fit(model, datamodule=data)
post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
with torch.no_grad():
post_train_values = torch.cat([x.view(-1) for x in model.parameters()])
assert trainer.state.finished, f"Training failed with {trainer.state}"
# Check that the model is actually changed post-training
change_ratio = torch.norm(initial_values - post_train_values)
assert change_ratio >= min_change_ratio, f"the model is changed of {change_ratio} and shall be >={min_change_ratio}"
# Check that the model has changed post-training
change_ratio = torch.norm(initial_values - post_train_values) / torch.norm(initial_values)
assert change_ratio >= min_change_ratio, (
f"The change in the model's parameter norm is {change_ratio:.1f}"
f" relative to the initial norm, but expected a change by >={min_change_ratio}"
)
# test model loading
_ = load_model_from_checkpoint(trainer.checkpoint_callback.best_model_path, type(model))

View File

@ -18,7 +18,7 @@ import torch
import tests_pytorch.helpers.pipelines as tpipes
import tests_pytorch.helpers.utils as tutils
from lightning.pytorch import Trainer
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.callbacks import Callback, EarlyStopping, ModelCheckpoint
from lightning.pytorch.demos.boring_classes import BoringModel
from tests_pytorch.helpers.datamodules import ClassifDataModule
@ -29,6 +29,8 @@ from tests_pytorch.helpers.simple_models import ClassificationModel
@mock.patch("lightning.fabric.plugins.environments.slurm.SLURMEnvironment.detect", return_value=True)
def test_cpu_slurm_save_load(_, tmpdir):
"""Verify model save/load/checkpoint on CPU."""
seed_everything(42)
model = BoringModel()
# logger file to get meta
@ -101,6 +103,8 @@ def test_cpu_slurm_save_load(_, tmpdir):
def test_early_stopping_cpu_model(tmpdir):
seed_everything(42)
class ModelTrainVal(BoringModel):
def validation_step(self, *args, **kwargs):
output = super().validation_step(*args, **kwargs)
@ -129,6 +133,8 @@ def test_early_stopping_cpu_model(tmpdir):
@RunIf(skip_windows=True, sklearn=True)
def test_multi_cpu_model_ddp(tmpdir):
"""Make sure DDP works."""
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"enable_progress_bar": False,
@ -150,6 +156,7 @@ def test_lbfgs_cpu_model(tmpdir):
Testing LBFGS optimizer
"""
seed_everything(42)
class ModelSpecifiedOptimizer(BoringModel):
def __init__(self, optimizer_name, learning_rate):
@ -172,6 +179,8 @@ def test_lbfgs_cpu_model(tmpdir):
def test_default_logger_callbacks_cpu_model(tmpdir):
"""Test each of the trainer options."""
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"max_epochs": 1,
@ -192,6 +201,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
def test_running_test_after_fitting(tmpdir):
"""Verify test() on fitted model."""
seed_everything(42)
class ModelTrainValTest(BoringModel):
def validation_step(self, *args, **kwargs):
@ -238,6 +248,7 @@ def test_running_test_no_val(tmpdir):
It performs train and test only
"""
seed_everything(42)
class ModelTrainTest(BoringModel):
def test_step(self, *args, **kwargs):
@ -276,20 +287,9 @@ def test_running_test_no_val(tmpdir):
tutils.assert_ok_model_acc(trainer, key="test_loss")
def test_simple_cpu(tmpdir):
"""Verify continue training session on CPU."""
model = BoringModel()
# fit model
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.1, limit_train_batches=20)
trainer.fit(model)
# training complete
assert trainer.state.finished, "amp + ddp model failed to complete"
def test_cpu_model(tmpdir):
"""Make sure model trains on CPU."""
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"enable_progress_bar": False,
@ -304,6 +304,7 @@ def test_cpu_model(tmpdir):
def test_all_features_cpu_model(tmpdir):
"""Test each of the trainer options."""
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"gradient_clip_val": 1.0,
@ -316,5 +317,4 @@ def test_all_features_cpu_model(tmpdir):
}
model = BoringModel()
tpipes.run_model_test(trainer_options, model, min_acc=0.01)
tpipes.run_model_test(trainer_options, model)

View File

@ -22,7 +22,7 @@ import torch
import tests_pytorch.helpers.pipelines as tpipes
from lightning.fabric.plugins.environments import TorchElasticEnvironment
from lightning.fabric.utilities.device_parser import _parse_gpu_ids
from lightning.pytorch import Trainer
from lightning.pytorch import seed_everything, Trainer
from lightning.pytorch.accelerators import CPUAccelerator, CUDAAccelerator
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.utilities.exceptions import MisconfigurationException
@ -36,6 +36,7 @@ PRETEND_N_OF_GPUS = 16
@RunIf(min_cuda_gpus=2, sklearn=True)
def test_multi_gpu_none_backend(tmpdir):
"""Make sure when using multiple GPUs the user can't use `accelerator = None`."""
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"enable_progress_bar": False,
@ -55,6 +56,7 @@ def test_multi_gpu_none_backend(tmpdir):
@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize("devices", [1, [0], [1]])
def test_single_gpu_model(tmpdir, devices):
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"enable_progress_bar": False,

View File

@ -17,7 +17,7 @@ from torch.multiprocessing import ProcessRaisedException
import tests_pytorch.helpers.pipelines as tpipes
from lightning.pytorch.callbacks import EarlyStopping
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.trainer import seed_everything, Trainer
from tests_pytorch.helpers.datamodules import ClassifDataModule
from tests_pytorch.helpers.runif import RunIf
from tests_pytorch.helpers.simple_models import ClassificationModel
@ -26,6 +26,8 @@ from tests_pytorch.strategies.test_ddp_strategy import UnusedParametersModel
@RunIf(min_cuda_gpus=2, sklearn=True)
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"callbacks": [EarlyStopping(monitor="train_acc")],
@ -44,6 +46,8 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
@RunIf(min_cuda_gpus=2)
def test_multi_gpu_model_ddp_spawn(tmpdir):
seed_everything(42)
trainer_options = {
"default_root_dir": tmpdir,
"max_epochs": 1,