Make model test more robust (#18043)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2023-07-10 22:36:18 +02:00 · 2023-07-10 22:36:18 +02:00 · a97c559d92
parent 69d7cfe5d8
commit a97c559d92
4 changed files with 33 additions and 23 deletions
--- a/tests/tests_pytorch/helpers/pipelines.py
+++ b/tests/tests_pytorch/helpers/pipelines.py
@ -59,14 +59,18 @@ def run_model_test(
    logger = get_default_logger(save_dir, version=version)
    trainer_options.update(logger=logger)
    trainer = Trainer(**trainer_options)
-    initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
+    with torch.no_grad():
+        initial_values = torch.cat([x.view(-1) for x in model.parameters()])
    trainer.fit(model, datamodule=data)
-    post_train_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()])
+    with torch.no_grad():
+        post_train_values = torch.cat([x.view(-1) for x in model.parameters()])

-    assert trainer.state.finished, f"Training failed with {trainer.state}"
-    # Check that the model is actually changed post-training
-    change_ratio = torch.norm(initial_values - post_train_values)
-    assert change_ratio >= min_change_ratio, f"the model is changed of {change_ratio} and shall be >={min_change_ratio}"
+    # Check that the model has changed post-training
+    change_ratio = torch.norm(initial_values - post_train_values) / torch.norm(initial_values)
+    assert change_ratio >= min_change_ratio, (
+        f"The change in the model's parameter norm is {change_ratio:.1f}"
+        f" relative to the initial norm, but expected a change by >={min_change_ratio}"
+    )

    # test model loading
    _ = load_model_from_checkpoint(trainer.checkpoint_callback.best_model_path, type(model))
--- a/tests/tests_pytorch/models/test_cpu.py
+++ b/tests/tests_pytorch/models/test_cpu.py
@ -18,7 +18,7 @@ import torch

 import tests_pytorch.helpers.pipelines as tpipes
 import tests_pytorch.helpers.utils as tutils
-from lightning.pytorch import Trainer
+from lightning.pytorch import seed_everything, Trainer
 from lightning.pytorch.callbacks import Callback, EarlyStopping, ModelCheckpoint
 from lightning.pytorch.demos.boring_classes import BoringModel
 from tests_pytorch.helpers.datamodules import ClassifDataModule
@ -29,6 +29,8 @@ from tests_pytorch.helpers.simple_models import ClassificationModel
@mock.patch("lightning.fabric.plugins.environments.slurm.SLURMEnvironment.detect", return_value=True)
 def test_cpu_slurm_save_load(_, tmpdir):
    """Verify model save/load/checkpoint on CPU."""
+    seed_everything(42)
+
    model = BoringModel()

    # logger file to get meta
@ -101,6 +103,8 @@ def test_cpu_slurm_save_load(_, tmpdir):


 def test_early_stopping_cpu_model(tmpdir):
+    seed_everything(42)
+
    class ModelTrainVal(BoringModel):
        def validation_step(self, *args, **kwargs):
            output = super().validation_step(*args, **kwargs)
@ -129,6 +133,8 @@ def test_early_stopping_cpu_model(tmpdir):
@RunIf(skip_windows=True, sklearn=True)
 def test_multi_cpu_model_ddp(tmpdir):
    """Make sure DDP works."""
+    seed_everything(42)
+
    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
@ -150,6 +156,7 @@ def test_lbfgs_cpu_model(tmpdir):

    Testing LBFGS optimizer
    """
+    seed_everything(42)

    class ModelSpecifiedOptimizer(BoringModel):
        def __init__(self, optimizer_name, learning_rate):
@ -172,6 +179,8 @@ def test_lbfgs_cpu_model(tmpdir):

 def test_default_logger_callbacks_cpu_model(tmpdir):
    """Test each of the trainer options."""
+    seed_everything(42)
+
    trainer_options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
@ -192,6 +201,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir):

 def test_running_test_after_fitting(tmpdir):
    """Verify test() on fitted model."""
+    seed_everything(42)

    class ModelTrainValTest(BoringModel):
        def validation_step(self, *args, **kwargs):
@ -238,6 +248,7 @@ def test_running_test_no_val(tmpdir):

    It performs train and test only
    """
+    seed_everything(42)

    class ModelTrainTest(BoringModel):
        def test_step(self, *args, **kwargs):
@ -276,20 +287,9 @@ def test_running_test_no_val(tmpdir):
    tutils.assert_ok_model_acc(trainer, key="test_loss")


-def test_simple_cpu(tmpdir):
-    """Verify continue training session on CPU."""
-    model = BoringModel()
-
-    # fit model
-    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.1, limit_train_batches=20)
-    trainer.fit(model)
-
-    # traning complete
-    assert trainer.state.finished, "amp + ddp model failed to complete"
-
-
 def test_cpu_model(tmpdir):
    """Make sure model trains on CPU."""
+    seed_everything(42)
    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
@ -304,6 +304,7 @@ def test_cpu_model(tmpdir):

 def test_all_features_cpu_model(tmpdir):
    """Test each of the trainer options."""
+    seed_everything(42)
    trainer_options = {
        "default_root_dir": tmpdir,
        "gradient_clip_val": 1.0,
@ -316,5 +317,4 @@ def test_all_features_cpu_model(tmpdir):
    }

    model = BoringModel()
-
-    tpipes.run_model_test(trainer_options, model, min_acc=0.01)
+    tpipes.run_model_test(trainer_options, model)
--- a/tests/tests_pytorch/models/test_gpu.py
+++ b/tests/tests_pytorch/models/test_gpu.py
@ -22,7 +22,7 @@ import torch
 import tests_pytorch.helpers.pipelines as tpipes
 from lightning.fabric.plugins.environments import TorchElasticEnvironment
 from lightning.fabric.utilities.device_parser import _parse_gpu_ids
-from lightning.pytorch import Trainer
+from lightning.pytorch import seed_everything, Trainer
 from lightning.pytorch.accelerators import CPUAccelerator, CUDAAccelerator
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
@ -36,6 +36,7 @@ PRETEND_N_OF_GPUS = 16
@RunIf(min_cuda_gpus=2, sklearn=True)
 def test_multi_gpu_none_backend(tmpdir):
    """Make sure when using multiple GPUs the user can't use `accelerator = None`."""
+    seed_everything(42)
    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
@ -55,6 +56,7 @@ def test_multi_gpu_none_backend(tmpdir):
@RunIf(min_cuda_gpus=2)
@pytest.mark.parametrize("devices", [1, [0], [1]])
 def test_single_gpu_model(tmpdir, devices):
+    seed_everything(42)
    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
--- a/tests/tests_pytorch/strategies/test_ddp_spawn.py
+++ b/tests/tests_pytorch/strategies/test_ddp_spawn.py
@ -17,7 +17,7 @@ from torch.multiprocessing import ProcessRaisedException
 import tests_pytorch.helpers.pipelines as tpipes
 from lightning.pytorch.callbacks import EarlyStopping
 from lightning.pytorch.demos.boring_classes import BoringModel
-from lightning.pytorch.trainer import Trainer
+from lightning.pytorch.trainer import seed_everything, Trainer
 from tests_pytorch.helpers.datamodules import ClassifDataModule
 from tests_pytorch.helpers.runif import RunIf
 from tests_pytorch.helpers.simple_models import ClassificationModel
@ -26,6 +26,8 @@ from tests_pytorch.strategies.test_ddp_strategy import UnusedParametersModel

@RunIf(min_cuda_gpus=2, sklearn=True)
 def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
+    seed_everything(42)
+
    trainer_options = {
        "default_root_dir": tmpdir,
        "callbacks": [EarlyStopping(monitor="train_acc")],
@ -44,6 +46,8 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):

@RunIf(min_cuda_gpus=2)
 def test_multi_gpu_model_ddp_spawn(tmpdir):
+    seed_everything(42)
+
    trainer_options = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,