# lightning/tests/trainer/optimization/test_optimizers.py
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock

import pytest
import torch
from torch import optim

from pytorch_lightning import Callback, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers.boring_model import BoringModel
from tests.helpers.runif import RunIf


def test_optimizer_with_scheduling(tmpdir):
"""Verify that learning rate scheduling is working."""
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.1, limit_train_batches=0.2, val_check_interval=0.5
)
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"
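    # BoringModel's default `configure_optimizers` uses SGD with lr=0.1 and a StepLR with step_size=1,
    # so StepLR's default gamma of 0.1 scales the lr exactly once over the single epoch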
init_lr = 0.1
adjusted_lr = [pg["lr"] for pg in trainer.optimizers[0].param_groups]
assert len(trainer.lr_schedulers) == 1
assert all(a == adjusted_lr[0] for a in adjusted_lr)
assert init_lr * 0.1 == adjusted_lr[0]


def test_multi_optimizer_with_scheduling(tmpdir):
    """Verify that learning rate scheduling works with multiple optimizers."""
class TestModel(BoringModel):
init_lr = 5e-4
def training_step(self, batch, batch_idx, optimizer_idx):
return super().training_step(batch, batch_idx)
def configure_optimizers(self):
optimizer1 = optim.Adam(self.parameters(), lr=self.init_lr)
optimizer2 = optim.Adam(self.parameters(), lr=self.init_lr)
lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, step_size=1)
lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, step_size=1)
return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2]
model = TestModel()
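    # drop the epoch-end hook: with multiple optimizers, BoringModel's implementation
    # does not expect the per-optimizer structure of `outputs`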
model.training_epoch_end = None
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.1, limit_train_batches=0.2)
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"
adjusted_lr1 = [pg["lr"] for pg in trainer.optimizers[0].param_groups]
adjusted_lr2 = [pg["lr"] for pg in trainer.optimizers[1].param_groups]
assert len(trainer.lr_schedulers) == 2
assert all(a == adjusted_lr1[0] for a in adjusted_lr1)
assert all(a == adjusted_lr2[0] for a in adjusted_lr2)
assert model.init_lr * 0.1 == adjusted_lr1[0]
assert model.init_lr * 0.1 == adjusted_lr2[0]


def test_reducelronplateau_with_no_monitor_raises(tmpdir):
"""Test exception when a ReduceLROnPlateau is used with no monitor."""
model = BoringModel()
optimizer = optim.Adam(model.parameters())
model.configure_optimizers = lambda: ([optimizer], [optim.lr_scheduler.ReduceLROnPlateau(optimizer)])
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(
MisconfigurationException, match="`configure_optimizers` must include a monitor when a `ReduceLROnPlateau`"
):
trainer.fit(model)


def test_reducelronplateau_with_no_monitor_in_lr_scheduler_dict_raises(tmpdir):
"""Test exception when lr_scheduler dict has a ReduceLROnPlateau with no monitor."""
model = BoringModel()
optimizer = optim.Adam(model.parameters())
model.configure_optimizers = lambda: {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": optim.lr_scheduler.ReduceLROnPlateau(optimizer)},
}
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(MisconfigurationException, match="must include a monitor when a `ReduceLROnPlateau`"):
trainer.fit(model)


def test_reducelronplateau_scheduling(tmpdir):
class TestModel(BoringModel):
def training_step(self, batch, batch_idx):
self.log("foo", batch_idx)
return super().training_step(batch, batch_idx)
def configure_optimizers(self):
optimizer = optim.Adam(self.parameters())
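            # a bare scheduler with a sibling "monitor" key is accepted shorthand for the full lr_scheduler dict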
return {
"optimizer": optimizer,
"lr_scheduler": optim.lr_scheduler.ReduceLROnPlateau(optimizer),
"monitor": "foo",
}
model = TestModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"
lr_scheduler = trainer.lr_schedulers[0]
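    # the trainer normalizes the shorthand into a full scheduler configuration dict with defaults filled in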
assert lr_scheduler == dict(
scheduler=lr_scheduler["scheduler"],
monitor="foo",
interval="epoch",
frequency=1,
reduce_on_plateau=True,
strict=True,
opt_idx=None,
name=None,
)


def test_optimizer_return_options(tmpdir):
trainer = Trainer(default_root_dir=tmpdir)
model = BoringModel()
    # two optimizers and two schedulers shared across the cases below
opt_a = optim.Adam(model.parameters(), lr=0.002)
opt_b = optim.SGD(model.parameters(), lr=0.002)
scheduler_a = optim.lr_scheduler.StepLR(opt_a, 10)
scheduler_b = optim.lr_scheduler.StepLR(opt_b, 10)
# single optimizer
model.configure_optimizers = lambda: opt_a
opt, lr_sched, freq = trainer.init_optimizers(model)
assert len(opt) == 1 and len(lr_sched) == len(freq) == 0
# opt tuple
model.configure_optimizers = lambda: (opt_a, opt_b)
opt, lr_sched, freq = trainer.init_optimizers(model)
assert opt == [opt_a, opt_b]
assert len(lr_sched) == len(freq) == 0
# opt list
model.configure_optimizers = lambda: [opt_a, opt_b]
opt, lr_sched, freq = trainer.init_optimizers(model)
assert opt == [opt_a, opt_b]
assert len(lr_sched) == len(freq) == 0
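    # the fully-populated scheduler configuration the trainer is expected to produce for `scheduler_a`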
ref_lr_sched = dict(
scheduler=scheduler_a,
interval="epoch",
frequency=1,
reduce_on_plateau=False,
monitor=None,
strict=True,
name=None,
opt_idx=None,
)
# opt tuple of 2 lists
model.configure_optimizers = lambda: ([opt_a], [scheduler_a])
opt, lr_sched, freq = trainer.init_optimizers(model)
assert len(opt) == len(lr_sched) == 1
assert len(freq) == 0
assert opt[0] == opt_a
assert lr_sched[0] == ref_lr_sched
# opt tuple of 1 list
model.configure_optimizers = lambda: ([opt_a], scheduler_a)
opt, lr_sched, freq = trainer.init_optimizers(model)
assert len(opt) == len(lr_sched) == 1
assert len(freq) == 0
assert opt[0] == opt_a
assert lr_sched[0] == ref_lr_sched
# opt single dictionary
model.configure_optimizers = lambda: {"optimizer": opt_a, "lr_scheduler": scheduler_a}
opt, lr_sched, freq = trainer.init_optimizers(model)
assert len(opt) == len(lr_sched) == 1
assert len(freq) == 0
assert opt[0] == opt_a
assert lr_sched[0] == ref_lr_sched
# opt multiple dictionaries with frequencies
model.configure_optimizers = lambda: (
{"optimizer": opt_a, "lr_scheduler": scheduler_a, "frequency": 1},
{"optimizer": opt_b, "lr_scheduler": scheduler_b, "frequency": 5},
)
opt, lr_sched, freq = trainer.init_optimizers(model)
assert len(opt) == len(lr_sched) == len(freq) == 2
assert opt[0] == opt_a
ref_lr_sched["opt_idx"] = 0
assert lr_sched[0] == ref_lr_sched
ref_lr_sched["scheduler"] = scheduler_b
ref_lr_sched["opt_idx"] = 1
assert lr_sched[1] == ref_lr_sched
assert freq == [1, 5]


def test_none_optimizer(tmpdir):
model = BoringModel()
model.configure_optimizers = lambda: None
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.1, limit_train_batches=0.2)
with pytest.warns(UserWarning, match="will run with no optimizer"):
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"


def test_configure_optimizer_from_dict(tmpdir):
    """Test that `configure_optimizers` can return a dictionary containing only the `optimizer` key."""
class TestModel(BoringModel):
def configure_optimizers(self):
config = {"optimizer": optim.SGD(params=self.parameters(), lr=1e-03)}
return config
model = TestModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"


@pytest.mark.parametrize(
"schedulers, kwargs, intervals, frequencies, expected_steps, max_epochs",
[
(
(optim.lr_scheduler.OneCycleLR, optim.lr_scheduler.OneCycleLR),
(dict(max_lr=0.01, total_steps=3), dict(max_lr=0.01, total_steps=2)),
("step", "step"),
(3, 2),
(4, 3),
1,
),
(
(optim.lr_scheduler.OneCycleLR, optim.lr_scheduler.OneCycleLR),
(dict(max_lr=0.01, total_steps=5), dict(max_lr=0.01, total_steps=5)),
("step", "step"),
(None, None),
(6, 6),
1,
),
(
(optim.lr_scheduler.StepLR, optim.lr_scheduler.CosineAnnealingLR),
(dict(step_size=5), dict(T_max=2)),
("epoch", "epoch"),
(5, 10),
(2, 3),
3,
),
],
)
def test_step_scheduling_for_multiple_optimizers_with_frequency(
tmpdir, schedulers, kwargs, intervals, frequencies, expected_steps, max_epochs
):
"""Test that step LR schedulers for multiple optimizers follow the optimizer frequencies when corresponding
frequency is set."""
class DummyModel(BoringModel):
def training_step(self, batch, batch_idx, optimizer_idx):
return super().training_step(batch, batch_idx)
def training_epoch_end(self, outputs) -> None:
pass
def configure_optimizers(self):
optimizer1 = optim.Adam(self.parameters(), lr=0.01)
optimizer2 = optim.Adam(self.parameters(), lr=0.01)
lr_scheduler_config_1 = {"scheduler": schedulers[0](optimizer1, **kwargs[0]), "interval": intervals[0]}
lr_scheduler_config_2 = {"scheduler": schedulers[1](optimizer2, **kwargs[1]), "interval": intervals[1]}
return [
{"optimizer": optimizer1, "frequency": frequencies[0], "lr_scheduler": lr_scheduler_config_1},
{"optimizer": optimizer2, "frequency": frequencies[1], "lr_scheduler": lr_scheduler_config_2},
]
model = DummyModel()
trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=1, limit_train_batches=5, max_epochs=max_epochs)
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"
assert trainer.lr_schedulers[0]["opt_idx"] == 0
assert trainer.lr_schedulers[1]["opt_idx"] == 1
    # `expected_steps` is 1 greater than the number of scheduler steps taken during training
    # because `scheduler.step()` is called once when the scheduler is constructed
assert trainer.lr_schedulers[0]["scheduler"]._step_count == expected_steps[0]
assert trainer.lr_schedulers[1]["scheduler"]._step_count == expected_steps[1]
@pytest.mark.parametrize("fn", ("validate", "test"))
def test_init_optimizers_during_evaluation(tmpdir, fn):
"""Test that optimizers is an empty list during evaluation."""
class TestModel(BoringModel):
def configure_optimizers(self):
optimizer1 = optim.Adam(self.parameters(), lr=0.1)
optimizer2 = optim.Adam(self.parameters(), lr=0.1)
lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, step_size=1)
lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, step_size=1)
return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2]
trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=10, limit_test_batches=10)
validate_or_test = getattr(trainer, fn)
validate_or_test(TestModel(), ckpt_path=None)
assert len(trainer.lr_schedulers) == 0
assert len(trainer.optimizers) == 0
assert len(trainer.optimizer_frequencies) == 0


def test_multiple_optimizers_callbacks(tmpdir):
"""Tests that multiple optimizers can be used with callbacks."""
class CB(Callback):
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
pass
def on_train_epoch_start(self, trainer, pl_module):
pass
class TestModel(BoringModel):
def __init__(self):
super().__init__()
self.layer_1 = torch.nn.Linear(32, 2)
self.layer_2 = torch.nn.Linear(32, 2)
def training_step(self, batch, batch_idx, optimizer_idx):
if optimizer_idx == 0:
a = batch[0]
acc = self.layer_1(a)
else:
a = batch[0]
acc = self.layer_2(a)
acc = self.loss(acc, acc)
return acc
def configure_optimizers(self):
a = optim.RMSprop(self.layer_1.parameters(), 1e-2)
b = optim.RMSprop(self.layer_2.parameters(), 1e-2)
return a, b
model = TestModel()
model.training_epoch_end = None
trainer = Trainer(
callbacks=[CB()],
default_root_dir=tmpdir,
limit_train_batches=1,
limit_val_batches=2,
max_epochs=1,
weights_summary=None,
)
trainer.fit(model)
@pytest.mark.parametrize("complete_epoch", [True, False])
@mock.patch("torch.optim.lr_scheduler.ReduceLROnPlateau.step")
def test_lr_scheduler_strict(step_mock, tmpdir, complete_epoch):
"""Test "strict" support in lr_scheduler dict."""
model = BoringModel()
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)
max_epochs = 1 if complete_epoch else None
max_steps = None if complete_epoch else 1
trainer = Trainer(default_root_dir=tmpdir, max_epochs=max_epochs, max_steps=max_steps)
model.configure_optimizers = lambda: {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": scheduler, "monitor": "giraffe", "strict": True},
}
if complete_epoch:
with pytest.raises(
MisconfigurationException,
match=r"ReduceLROnPlateau conditioned on metric .* which is not available\. Available metrics are:",
):
trainer.fit(model)
else:
trainer.fit(model)
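    # "giraffe" is never logged: the strict run either raised above or stopped after one step,
    # before the epoch-interval scheduler had a chance to run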
step_mock.assert_not_called()

    model.configure_optimizers = lambda: {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": scheduler, "monitor": "giraffe", "strict": False},
}
if complete_epoch:
with pytest.warns(
RuntimeWarning, match=r"ReduceLROnPlateau conditioned on metric .* which is not available but strict"
):
trainer.fit(model)
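    # with strict=False the missing metric only warns, and the scheduler step is still skipped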
step_mock.assert_not_called()


def test_unknown_configure_optimizers_raises(tmpdir):
"""Test exception with an unsupported configure_optimizers return."""
model = BoringModel()
model.configure_optimizers = lambda: 1
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(MisconfigurationException, match="Unknown configuration for model optimizers"):
trainer.fit(model)


def test_lr_scheduler_with_unknown_interval_raises(tmpdir):
"""Test exception when lr_scheduler dict has unknown interval param value."""
model = BoringModel()
optimizer = optim.Adam(model.parameters())
model.configure_optimizers = lambda: {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": optim.lr_scheduler.StepLR(optimizer, 1), "interval": "incorrect_unknown_value"},
}
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(MisconfigurationException, match=r'The "interval" key in lr scheduler dict must be'):
trainer.fit(model)


def test_lr_scheduler_with_extra_keys_warns(tmpdir):
"""Test warning when lr_scheduler dict has extra keys."""
model = BoringModel()
optimizer = optim.Adam(model.parameters())
model.configure_optimizers = lambda: {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": optim.lr_scheduler.StepLR(optimizer, 1), "foo": 1, "bar": 2},
}
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.warns(RuntimeWarning, match=r"Found unsupported keys in the lr scheduler dict: \[.+\]"):
trainer.fit(model)


def test_lr_scheduler_with_no_actual_scheduler_raises(tmpdir):
"""Test exception when lr_scheduler dict has no scheduler."""
model = BoringModel()
model.configure_optimizers = lambda: {"optimizer": optim.Adam(model.parameters()), "lr_scheduler": {}}
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(MisconfigurationException, match='The lr scheduler dict must have the key "scheduler"'):
trainer.fit(model)


def test_invalid_optimizer_in_scheduler(tmpdir):
    """Test exception when the optimizer attached to an lr scheduler wasn't returned."""
class InvalidOptimizerModel(BoringModel):
def configure_optimizers(self):
opt1 = optim.SGD(self.layer.parameters(), lr=0.1)
opt2 = optim.SGD(self.layer.parameters(), lr=0.1)
lr_scheduler = optim.lr_scheduler.StepLR(opt2, step_size=1)
return [opt1], [lr_scheduler]
model = InvalidOptimizerModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
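    # the misspelling "attatched" intentionally matches the error message raised by Lightning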
with pytest.raises(MisconfigurationException, match="attatched with an optimizer that wasn't returned"):
trainer.fit(model)


def test_invalid_optimizer_dict_raises(tmpdir):
    """Test exception when the optimizer configuration mixes dictionaries and plain optimizers."""
class DummyModel(BoringModel):
def configure_optimizers(self):
return [{"optimizer": optim.Adam(self.parameters())}, optim.Adam(self.parameters())]
model = DummyModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.raises(MisconfigurationException, match="Unknown configuration for model optimizers"):
trainer.fit(model)


def test_warn_invalid_scheduler_key_in_manual_optimization(tmpdir):
"""Test warning when invalid scheduler keys are provided in manual optimization."""
class TestModel(BoringModel):
def __init__(self):
super().__init__()
self.automatic_optimization = False
def configure_optimizers(self):
opt = optim.SGD(self.layer.parameters(), lr=0.1)
sch = optim.lr_scheduler.StepLR(opt, step_size=1)
return [opt], [{"scheduler": sch, "interval": "epoch"}]
model = TestModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
with pytest.warns(RuntimeWarning, match="the keys will be ignored"):
trainer.fit(model)


@RunIf(min_gpus=2, special=True)
def test_optimizer_state_on_device(tmpdir):
    """Test that optimizers that create their state at instantiation still end up with the state on the GPU."""
class TestModel(BoringModel):
def configure_optimizers(self):
# Adagrad creates state tensors immediately, model is not yet on GPU.
return optim.Adagrad(self.parameters())
def on_train_start(self, *args, **kwargs):
opt = self.optimizers()
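            # inspect the state of the first parameter; Adagrad keeps a "sum" buffer per parameter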
_, state = next(iter(opt.state.items()))
assert state["sum"].device == torch.device("cuda", self.local_rank) == self.device
model = TestModel()
trainer = Trainer(default_root_dir=tmpdir, gpus=2, accelerator="ddp", fast_dev_run=True)
trainer.fit(model)
@pytest.mark.parametrize("check_val_every_n_epoch", [1, 2])
@mock.patch("torch.optim.lr_scheduler.StepLR.step")
def test_lr_scheduler_epoch_step_frequency(mocked_sched, check_val_every_n_epoch, tmpdir):
epochs = 4
    expected_steps = epochs + 1  # every LR scheduler calls `step()` once in its constructor
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=2,
limit_val_batches=2,
check_val_every_n_epoch=check_val_every_n_epoch,
max_epochs=epochs,
)
trainer.fit(model)
assert mocked_sched.call_count == expected_steps
@pytest.mark.parametrize("every_n_train_steps, epoch_interval", [(None, True), (2, False), (2, True)])
def test_lr_scheduler_state_updated_before_saving(tmpdir, every_n_train_steps, epoch_interval):
batches = 2
max_epochs = 1
lr, gamma = 1, 10
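    # gamma=10 makes the lr after n scheduler steps uniquely identifiable as lr * gamma ** n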
trainer = Trainer(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
logger=False,
max_epochs=max_epochs,
limit_train_batches=batches,
limit_val_batches=1,
callbacks=[ModelCheckpoint(dirpath=tmpdir, every_n_train_steps=every_n_train_steps)],
)

    class TestModel(BoringModel):
def configure_optimizers(self):
optimizer = torch.optim.SGD(self.parameters(), lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
lr_scheduler_config = {"scheduler": lr_scheduler}
if not epoch_interval:
lr_scheduler_config["interval"] = "step"
return [optimizer], [lr_scheduler_config]
def on_save_checkpoint(self, checkpoint):
lr_scheduler_config = checkpoint["lr_schedulers"][0]
            # `batches` batches ran, but the global step for the last one hasn't been incremented yet
            assert self.trainer.global_step + 1 == batches
            # with the "epoch" interval the scheduler stepped once per epoch; with "step", once per batch
compare_to = max_epochs if epoch_interval else batches
assert lr_scheduler_config["_step_count"] - 1 == compare_to # step count starts at 1
assert lr_scheduler_config["_last_lr"] == [lr * gamma ** compare_to]
self.on_save_checkpoint_called = True
model = TestModel()
trainer.fit(model)
assert model.on_save_checkpoint_called
@pytest.mark.parametrize("save_on_train_epoch_end", (False, True))
def test_plateau_scheduler_lr_step_interval_updated_after_saving(tmpdir, save_on_train_epoch_end):
batches = 4
trainer = Trainer(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
logger=False,
max_epochs=1,
limit_train_batches=batches,
limit_val_batches=1,
callbacks=[ModelCheckpoint(dirpath=tmpdir, save_on_train_epoch_end=save_on_train_epoch_end)],
)

    class TestModel(BoringModel):
def training_step(self, batch, batch_idx, optimizer_idx):
self.log("foo", batch_idx)
return super().training_step(batch, batch_idx)
def configure_optimizers(self):
optimizer_1 = torch.optim.Adam(self.parameters())
optimizer_2 = torch.optim.Adam(self.parameters())
lr_scheduler1 = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_1)
lr_scheduler_config_1 = {"scheduler": lr_scheduler1, "interval": "step", "monitor": "foo"}
lr_scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer_2, step_size=1)
lr_scheduler_config_2 = {"scheduler": lr_scheduler2, "interval": "step"}
return [optimizer_1, optimizer_2], [lr_scheduler_config_1, lr_scheduler_config_2]
def on_save_checkpoint(self, checkpoint):
lr_scheduler_config_1 = checkpoint["lr_schedulers"][0]
last_epoch = lr_scheduler_config_1["last_epoch"]
assert last_epoch == batches - (not save_on_train_epoch_end) # last epoch starts at 0
lr_scheduler_config_2 = checkpoint["lr_schedulers"][1]
assert lr_scheduler_config_2["_step_count"] - 1 == batches # step count starts at 1
self.on_save_checkpoint_called = True
model = TestModel()
model.training_epoch_end = None
trainer.fit(model)
assert model.on_save_checkpoint_called