2020-10-13 11:18:07 +00:00
|
|
|
# Copyright The PyTorch Lightning team.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
2021-07-30 13:03:15 +00:00
|
|
|
import gc
|
2021-04-06 11:41:07 +00:00
|
|
|
import logging
|
2020-02-18 16:23:22 +00:00
|
|
|
import math
|
2019-10-23 10:10:13 +00:00
|
|
|
import os
|
2020-06-09 20:51:30 +00:00
|
|
|
import pickle
|
2020-06-15 10:35:26 +00:00
|
|
|
import sys
|
2021-01-23 23:52:04 +00:00
|
|
|
from argparse import Namespace
|
|
|
|
from copy import deepcopy
|
|
|
|
from pathlib import Path
|
2021-09-02 16:35:22 +00:00
|
|
|
from unittest import mock
|
2020-12-01 00:09:46 +00:00
|
|
|
from unittest.mock import ANY, call, patch
|
2020-03-12 16:41:37 +00:00
|
|
|
|
2020-06-09 20:51:30 +00:00
|
|
|
import cloudpickle
|
2019-10-23 10:10:13 +00:00
|
|
|
import pytest
|
|
|
|
import torch
|
2021-01-23 23:52:04 +00:00
|
|
|
from omegaconf import OmegaConf
|
2021-07-21 09:37:05 +00:00
|
|
|
from torch.nn.parallel.distributed import DistributedDataParallel
|
2021-03-01 13:36:46 +00:00
|
|
|
from torch.optim import SGD
|
2021-09-14 14:40:19 +00:00
|
|
|
from torch.utils.data import DataLoader, IterableDataset
|
2019-10-23 10:10:13 +00:00
|
|
|
|
2021-02-08 10:52:02 +00:00
|
|
|
import tests.helpers.utils as tutils
|
2021-01-27 16:38:14 +00:00
|
|
|
from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer
|
2021-09-04 23:33:43 +00:00
|
|
|
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer
|
2021-04-27 20:23:55 +00:00
|
|
|
from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter
|
2020-10-21 18:34:29 +00:00
|
|
|
from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
|
2020-05-13 13:05:15 +00:00
|
|
|
from pytorch_lightning.loggers import TensorBoardLogger
|
2021-04-27 12:46:45 +00:00
|
|
|
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler
|
|
|
|
from pytorch_lightning.plugins import DDPSpawnPlugin
|
2021-05-04 10:50:56 +00:00
|
|
|
from pytorch_lightning.trainer.states import TrainerFn
|
2021-05-12 20:10:15 +00:00
|
|
|
from pytorch_lightning.utilities import DeviceType, DistributedType
|
2020-06-16 10:34:55 +00:00
|
|
|
from pytorch_lightning.utilities.cloud_io import load as pl_load
|
2021-06-28 19:26:03 +00:00
|
|
|
from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException
|
2021-04-27 12:46:45 +00:00
|
|
|
from pytorch_lightning.utilities.seed import seed_everything
|
2021-02-09 10:10:52 +00:00
|
|
|
from tests.base import EvalModelTemplate
|
|
|
|
from tests.helpers import BoringModel, RandomDataset
|
2021-09-14 14:40:19 +00:00
|
|
|
from tests.helpers.boring_model import RandomIterableDataset, RandomIterableDatasetWithLen
|
2021-03-02 09:36:01 +00:00
|
|
|
from tests.helpers.runif import RunIf
|
2020-04-27 11:41:30 +00:00
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Train for one epoch, save a checkpoint and reload it together with the logged ``hparams.yaml``.

    The checkpoint is loaded either from its local path or, when ``url_ckpt`` is set, through the HTTP address
    given by the ``tmpdir_server`` fixture.
    """
    # torch hub caches under $TORCH_HOME; keep that cache inside the test's tmpdir
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    # the logger writes the hparams file that is read back further down
    logger = tutils.get_default_logger(tmpdir)
    checkpoint_cb = ModelCheckpoint(dirpath=tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=logger, callbacks=[checkpoint_cb])

    model = EvalModelTemplate()
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # write an explicit checkpoint next to the callback's own files
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # the saved checkpoint must carry the hyper-parameters
    saved = torch.load(new_weights_path)
    assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in saved, "hyper_parameters missing from checkpoints"

    # locate the hparams file and pick the checkpoint location (URL or local path)
    hparams_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir), "hparams.yaml")
    if url_ckpt:
        ckpt_path = f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
    else:
        ckpt_path = new_weights_path

    reloaded = EvalModelTemplate.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path)
    reloaded.eval()
|
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_no_val_end_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Tests use case where trainer saves the model, and user loads it from tags independently.

    The checkpoint is reloaded either from its local path or, when ``url_ckpt`` is set, through the HTTP address
    given by the ``tmpdir_server`` fixture.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    # `setenv` wants a `str` value (pytest warns about anything else), so convert the path fixture explicitly
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    model = EvalModelTemplate()

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)])
    trainer.fit(model)

    # training complete
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # load new model, using the hparams file written by the logger
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, "hparams.yaml")
    ckpt_path = (
        f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
        if url_ckpt
        else new_weights_path
    )
    model_2 = EvalModelTemplate.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path)
    model_2.eval()
|
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_strict_model_load(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Verify that a checkpoint holding an extra layer fails a strict load and succeeds with ``strict=False``.

    The checkpoint is loaded either from its local path or, when ``url_ckpt`` is set, through the HTTP address
    given by the ``tmpdir_server`` fixture.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    # `setenv` wants a `str` value (pytest warns about anything else), so convert the path fixture explicitly
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    model = EvalModelTemplate()
    # Extra layer: its weights end up in the checkpoint but are not declared by a fresh `EvalModelTemplate`
    model.c_d3 = torch.nn.Linear(model.hidden_dim, model.hidden_dim)

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)])
    trainer.fit(model)

    # training complete
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, "hparams.yaml")
    ckpt_path = (
        f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
        if url_ckpt
        else new_weights_path
    )

    # strict loading must reject the surplus `c_d3.*` entries; torch reports them as unexpected keys
    with pytest.raises(RuntimeError, match="Unexpected key"):
        EvalModelTemplate.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path)

    # with `strict=False` the model should load; an exception here fails the test and keeps its traceback
    EvalModelTemplate.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=False)
|
|
|
|
|
|
|
|
|
2021-03-01 13:36:46 +00:00
|
|
|
@pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3))
def test_trainer_accumulate_grad_batches_zero_grad(tmpdir, accumulate_grad_batches):
    """Check how often ``SGD.zero_grad`` is called when gradients are accumulated over several batches."""
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        limit_train_batches=20,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
        accumulate_grad_batches=accumulate_grad_batches,
    )

    # replace `zero_grad` with a mock for the duration of the fit so its calls can be counted
    with patch("torch.optim.SGD.zero_grad") as sgd_zero_grad:
        trainer = Trainer(**trainer_kwargs)
        trainer.fit(BoringModel())

    # one call per accumulation window; the last window may be shorter, hence the ceil
    expected_calls = math.ceil(trainer.limit_train_batches / accumulate_grad_batches)
    assert sgd_zero_grad.call_count == expected_calls
|
2019-10-23 10:10:13 +00:00
|
|
|
|
|
|
|
|
2020-08-15 19:06:37 +00:00
|
|
|
@pytest.mark.parametrize(
    ["accumulate_grad_batches", "limit_train_batches"],
    [
        ({1: 2, 3: 4}, 1.0),
        ({1: 2, 3: 4}, 0.5),  # not to be divisible by accumulate_grad_batches on purpose
        (3, 1.0),
        (3, 0.8),  # not to be divisible by accumulate_grad_batches on purpose
        (4, 1.0),
        (4, 0.7),  # not to be divisible by accumulate_grad_batches on purpose
    ],
)
def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_batches, limit_train_batches):
    """Verify optimizer.step() applied to last batch while grad accumulation.

    The model snapshots its weights at the start of every training batch and compares the snapshots around
    ``backward``, ``optimizer_step`` and at the end of the batch.
    """

    class TestModel(BoringModel):
        def state_dict(self, *args, **kwargs):
            # deep-copy so a snapshot is not affected by later in-place weight updates
            snapshot = super().state_dict(*args, **kwargs)
            return deepcopy(snapshot)

        def check(self, d1, d2, equal=True):
            """Return whether all tensors match (``equal=True``) or none of them match (``equal=False``)."""
            matches = [torch.equal(d1[name], d2[name]) for name in d1.keys() | d2.keys()]
            if equal:
                return all(matches)
            return not any(matches)

        def backward(self, *args, **kwargs) -> None:
            before = self.state_dict()
            # nothing has touched the weights since the batch started
            assert self.check(self.start_state_dict, before)

            result = super().backward(*args, **kwargs)

            # state dict is equal, just the gradients changed
            assert self.check(before, self.state_dict())
            return result

        def optimizer_step(self, *args, **kwargs):
            before = self.state_dict()
            assert self.check(self.start_state_dict, before)

            # this calls `backward` and `on_after_backward` inside the closure
            result = super().optimizer_step(*args, **kwargs)

            # the state dict changed
            assert self.check(before, self.state_dict(), equal=False)

            self.opt_step_called = True
            return result

        def on_train_batch_start(self, *_):
            # reference snapshot and flag for the hooks that run during this batch
            self.start_state_dict = self.state_dict()
            self.opt_step_called = False

        def on_train_batch_end(self, outputs, batch, batch_idx, *_):
            after = self.state_dict()
            is_last_batch = self.trainer.num_training_batches == batch_idx + 1

            # weights must have moved iff an optimizer step ran (always expected on the last batch)
            expect_unchanged = not (is_last_batch or self.opt_step_called)
            assert self.check(self.start_state_dict, after, equal=expect_unchanged)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        accumulate_grad_batches=accumulate_grad_batches,
        limit_train_batches=limit_train_batches,
        limit_val_batches=0,
        progress_bar_refresh_rate=0,
    )
    trainer.fit(TestModel())
|
|
|
|
|
|
|
|
|
2019-12-03 13:01:04 +00:00
|
|
|
def test_loading_meta_tags(tmpdir):
    """Check that hparams written to a legacy ``meta_tags.csv`` are read back unchanged."""
    tutils.reset_seed()

    default_hparams = EvalModelTemplate.get_default_hparams()

    # log two sets of hparams and flush them to disk
    logger = tutils.get_default_logger(tmpdir)
    logger.log_hyperparams(Namespace(some_str="a_str", an_int=1, a_float=2.0))
    logger.log_hyperparams(default_hparams)
    logger.save()

    # read back what the logger stored in its yaml file
    path_expt_dir = tutils.get_data_path(logger, path_dir=tmpdir)
    yaml_path = os.path.join(path_expt_dir, TensorBoardLogger.NAME_HPARAMS_FILE)
    hparams = load_hparams_from_yaml(yaml_path)

    # round-trip the same values through the legacy csv format
    tags_path = os.path.join(path_expt_dir, "meta_tags.csv")
    save_hparams_to_tags_csv(tags_path, hparams)
    tags = load_hparams_from_tags_csv(tags_path)

    assert hparams == tags
|
|
|
|
|
|
|
|
|
|
|
|
def test_loading_yaml(tmpdir):
    """Check that hparams logged through the default logger can be read back from ``hparams.yaml``."""
    tutils.reset_seed()

    default_hparams = EvalModelTemplate.get_default_hparams()

    # log two sets of hparams and flush them to disk
    logger = tutils.get_default_logger(tmpdir)
    for params in (Namespace(some_str="a_str", an_int=1, a_float=2.0), default_hparams):
        logger.log_hyperparams(params)
    logger.save()

    # load the yaml file from the logger's data directory
    yaml_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir), "hparams.yaml")
    tags = load_hparams_from_yaml(yaml_path)

    # separate asserts so a failure names the offending key
    assert tags["batch_size"] == 32
    assert tags["hidden_dim"] == 1000
|
2019-10-23 10:10:13 +00:00
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize(
    "save_top_k,save_last,expected_files",
    [
        pytest.param(-1, False, [f"epoch={i}.ckpt" for i in range(5)], id="CASE K=-1 (all)"),
        pytest.param(1, False, {"epoch=4.ckpt"}, id="CASE K=1 (2.5, epoch 4)"),
        pytest.param(2, False, [f"epoch={i}.ckpt" for i in (2, 4)], id="CASE K=2 (2.5 epoch 4, 2.8 epoch 2)"),
        pytest.param(4, False, [f"epoch={i}.ckpt" for i in range(1, 5)], id="CASE K=4 (save all 4 base)"),
        pytest.param(3, False, [f"epoch={i}.ckpt" for i in range(2, 5)], id="CASE K=3 (save the 2nd, 3rd, 4th model)"),
        pytest.param(1, True, {"epoch=4.ckpt", "last.ckpt"}, id="CASE K=1 (save the 4th model and the last model)"),
    ],
)
def test_model_checkpoint_options(tmpdir, save_top_k, save_last, expected_files):
    """Drive ``ModelCheckpoint`` by hand with fixed losses and check which checkpoint files remain."""

    def mock_save_function(filepath, *args):
        # stand-in for `Trainer.save_checkpoint`: only make sure the file exists
        with open(filepath, "a"):
            pass

    # simulated losses, one per epoch
    losses = [10, 9, 2.8, 5, 2.5]

    checkpoint_callback = ModelCheckpoint(
        dirpath=tmpdir,
        filename="{epoch}",
        monitor="checkpoint_on",
        save_top_k=save_top_k,
        save_last=save_last,
        verbose=True,
    )

    trainer = Trainer()
    trainer.state.fn = TrainerFn.FITTING
    trainer.save_checkpoint = mock_save_function

    # emulate callback's calls during the training
    for epoch, loss in enumerate(losses):
        trainer.fit_loop.current_epoch = epoch
        trainer.fit_loop.global_step = epoch
        trainer.callback_metrics.update(checkpoint_on=loss)
        checkpoint_callback.on_validation_end(trainer, trainer.lightning_module)

    file_lists = set(os.listdir(tmpdir))

    n_expected = len(expected_files)
    assert (
        len(file_lists) == n_expected
    ), f"Should save {len(expected_files)} models when save_top_k={save_top_k} but found={file_lists}"

    # verify correct naming
    missing = [fname for fname in expected_files if fname not in file_lists]
    assert not missing
|
2019-11-19 23:43:35 +00:00
|
|
|
|
|
|
|
|
2020-05-17 13:24:17 +00:00
|
|
|
def test_model_checkpoint_only_weights(tmpdir):
    """Check weights-only checkpoints, both from ``ModelCheckpoint`` and from ``Trainer.save_checkpoint``.

    Such a checkpoint must hold no optimizer or lr-scheduler state, must still load as a model, and must be
    refused when used to restore the training state.
    """

    def assert_no_trainer_data(path):
        checkpoint = torch.load(path)
        for key in ("optimizer_states", "lr_schedulers"):
            assert key not in checkpoint, "checkpoint should contain only model weights"

    weights_only_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_weights_only=True)
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, callbacks=[weights_only_cb])

    trainer.fit(EvalModelTemplate())
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # first checkpoint tracked by the callback
    checkpoint_path = next(iter(trainer.checkpoint_callback.best_k_models))
    assert_no_trainer_data(checkpoint_path)

    # assert loading model works when checkpoint has only weights
    assert EvalModelTemplate.load_from_checkpoint(checkpoint_path=checkpoint_path)

    # directly save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path, weights_only=True)
    assert_no_trainer_data(new_weights_path)

    # assert restoring train state fails
    with pytest.raises(KeyError, match="checkpoint contains only the model"):
        trainer.checkpoint_connector.restore(new_weights_path)
|
2020-05-17 13:24:17 +00:00
|
|
|
|
|
|
|
|
2019-10-23 10:10:13 +00:00
|
|
|
def test_model_freeze_unfreeze():
    """Smoke test: ``freeze()`` followed by ``unfreeze()`` runs without raising."""
    template = EvalModelTemplate()

    template.freeze()
    template.unfreeze()
|
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Verify resuming from checkpoint runs the right number of epochs.

    A model is trained for two epochs with a checkpoint kept for every save; training is then resumed from each
    checkpoint (local path or URL) and the number of batches seen after resuming is checked.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
    # `setenv` wants a `str` value (pytest warns about anything else), so convert the path fixture explicitly
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    class TestModel(BoringModel):
        # Model that tracks epochs and batches seen
        num_epochs_end_seen = 0
        num_batches_seen = 0
        num_on_load_checkpoint_called = 0

        def on_epoch_end(self):
            self.num_epochs_end_seen += 1

        def on_train_batch_start(self, *_):
            self.num_batches_seen += 1

        def on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

    model = TestModel()
    trainer = Trainer(
        max_epochs=2,
        limit_train_batches=0.65,
        limit_val_batches=1,
        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=-1)],
        default_root_dir=tmpdir,
        val_check_interval=1.0,
        progress_bar_refresh_rate=0,
        logger=False,
        weights_summary=None,
    )
    trainer.fit(model)

    # `on_epoch_end` will be called once for val_sanity, twice for train, twice for val
    assert model.num_epochs_end_seen == 1 + 2 + 2
    assert model.num_batches_seen == trainer.num_training_batches * 2
    # a fresh fit must not load any checkpoint
    assert model.num_on_load_checkpoint_called == 0

    # Other checkpoints can be uncommented if/when resuming mid-epoch is supported
    checkpoints = Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt")
    if url_ckpt:
        # transform local paths into url checkpoints
        ip, port = tmpdir_server
        checkpoints = [f"http://{ip}:{port}/" + ckpt.name for ckpt in checkpoints]

    for ckpt in checkpoints:
        next_model = TestModel()
        # read the stored global step so the remaining number of batches can be derived
        state = pl_load(ckpt)

        # Resume training
        new_trainer = Trainer(default_root_dir=tmpdir, resume_from_checkpoint=ckpt, max_epochs=2)
        new_trainer.fit(next_model)
        # steps already done plus batches seen after resuming must add up to the full run
        assert state["global_step"] + next_model.num_batches_seen == trainer.num_training_batches * trainer.max_epochs
        assert next_model.num_on_load_checkpoint_called == 1
|
2020-02-22 01:27:19 +00:00
|
|
|
|
|
|
|
|
2020-02-18 16:23:22 +00:00
|
|
|
def test_trainer_max_steps_and_epochs(tmpdir):
    """Verify that training stops at whichever of ``max_steps`` / ``max_epochs`` is reached first."""
    model = BoringModel()
    num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)

    # settings common to every run below
    shared_kwargs = {
        "limit_train_batches": 0.5,
        "default_root_dir": tmpdir,
        "logger": False,
        "weights_summary": None,
        "progress_bar_refresh_rate": 0,
    }

    def run_fit(max_epochs, max_steps):
        # fit the same model with a fresh trainer and make sure the run completed
        fitted = Trainer(max_epochs=max_epochs, max_steps=max_steps, **shared_kwargs)
        fitted.fit(model)
        assert fitted.state.finished, f"Training failed with {fitted.state}"
        return fitted

    # define less train steps than epochs
    trainer = run_fit(max_epochs=3, max_steps=num_train_samples + 10)
    assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"

    # define less train epochs than steps
    trainer = run_fit(max_epochs=2, max_steps=3 * 2 * num_train_samples)
    assert trainer.global_step == num_train_samples * trainer.max_epochs
    assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs"

    # if max_steps is positive and max_epochs is negative, use max_steps
    trainer = run_fit(max_epochs=-1, max_steps=3)
    assert trainer.global_step == 3
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "max_epochs,max_steps,incorrect_variable,incorrect_value",
    [
        (-100, None, "max_epochs", -100),
        (1, -2, "max_steps", -2),
    ],
)
def test_trainer_max_steps_and_epochs_validation(max_epochs, max_steps, incorrect_variable, incorrect_value):
    """Check that ``Trainer`` rejects ``max_epochs`` / ``max_steps`` values below -1 with a clear message."""
    expected_msg = f"`{incorrect_variable}` must be a positive integer or -1. You passed in {incorrect_value}"

    with pytest.raises(MisconfigurationException, match=expected_msg):
        Trainer(max_epochs=max_epochs, max_steps=max_steps)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "max_epochs,max_steps,is_done,correct_trainer_epochs",
    [
        (None, None, False, 1000),
        (-1, None, False, -1),
        (None, -1, False, None),
        (5, -1, False, 5),
        (-1, 10, False, -1),
        (None, 0, True, None),
        (0, None, True, 0),
        (-1, 0, True, -1),
        (0, -1, True, 0),
    ],
)
def test_trainer_max_steps_and_epochs_fit_loop_done(max_epochs, max_steps, is_done, correct_trainer_epochs):
    """Check the resolved ``max_epochs`` / ``max_steps`` and the fit loop's ``done`` flag right after construction."""
    trainer = Trainer(max_epochs=max_epochs, max_steps=max_steps)

    assert trainer.max_epochs == correct_trainer_epochs
    assert trainer.max_steps == max_steps
    assert trainer.fit_loop.done is is_done

    # Make sure there is no timer
    has_timer = any(isinstance(callback, Timer) for callback in trainer.callbacks)
    assert not has_timer
|
|
|
|
|
2020-02-18 16:23:22 +00:00
|
|
|
|
|
|
|
def test_trainer_min_steps_and_epochs(tmpdir):
    """Verify that training honours ``min_epochs`` and ``min_steps`` when early stopping is configured."""
    model = EvalModelTemplate()
    num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)

    # `min_steps` is filled in per scenario; everything else (the callback list included) is reused as-is
    trainer_kwargs = dict(
        limit_train_batches=0.5,
        default_root_dir=tmpdir,
        # define callback for stopping the model
        callbacks=[EarlyStopping(monitor="early_stop_on", min_delta=1.0)],
        val_check_interval=2,
        min_epochs=1,
        max_epochs=7,
        logger=False,
        weights_summary=None,
        progress_bar_refresh_rate=0,
    )

    more_than_one_epoch = math.floor(num_train_samples * 1.5)
    scenarios = (
        # define less min steps than 1 epoch
        (num_train_samples // 2, num_train_samples, "Model did not train for at least min_epochs"),
        # define less epochs than min_steps
        (more_than_one_epoch, more_than_one_epoch, "Model did not train for at least min_steps"),
    )

    for min_steps, required_steps, failure_msg in scenarios:
        trainer_kwargs["min_steps"] = min_steps
        trainer = Trainer(**trainer_kwargs)
        trainer.fit(model)

        assert trainer.state.finished, f"Training failed with {trainer.state}"
        assert trainer.current_epoch > 0
        assert trainer.global_step >= required_steps, failure_msg
|
2020-02-18 16:23:22 +00:00
|
|
|
|
2020-02-19 11:00:08 +00:00
|
|
|
|
2021-04-06 11:41:07 +00:00
|
|
|
def test_trainer_min_steps_and_min_epochs_not_reached(tmpdir, caplog):
    """Check that ``min_epochs`` keeps training going although ``EarlyStopping`` asks to stop every epoch."""
    min_epochs = 5
    batches_per_epoch = 2

    class TestModel(BoringModel):
        # counts how often ``training_step`` ran over the whole fit
        training_step_invoked = 0

        def training_step(self, batch, batch_idx):
            output = super().training_step(batch, batch_idx)
            # force minimal loss to trigger early stopping
            zero_loss = output["loss"] * 0.0
            output["loss"] = zero_loss
            self.log("loss", zero_loss)
            self.training_step_invoked += 1
            # the stop request must have been cleared again before another step is taken
            assert not self.trainer.should_stop
            return output

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        min_epochs=min_epochs,
        limit_val_batches=0,
        limit_train_batches=batches_per_epoch,
        callbacks=[EarlyStopping(monitor="loss", patience=0, check_on_train_epoch_end=True)],
    )

    with caplog.at_level(logging.INFO, logger="pytorch_lightning.trainer.trainer"):
        trainer.fit(model)

    message = f"minimum epochs ({min_epochs}) or minimum steps (None) has not been met. Training will continue"
    matching_records = [record for record in caplog.records if message in record.message]
    assert len(matching_records) == min_epochs - 2
    assert model.training_step_invoked == min_epochs * batches_per_epoch
|
|
|
|
|
|
|
|
|
2020-10-22 12:58:59 +00:00
|
|
|
def test_trainer_max_steps_accumulate_batches(tmpdir):
    """Check that training halts exactly at ``max_steps`` when gradients are accumulated over batches."""
    train_fraction = 0.5
    model = BoringModel()
    # number of batches per epoch once `limit_train_batches` is applied
    num_train_samples = math.floor(len(model.train_dataloader()) * train_fraction)

    # define less train steps than epochs
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "limit_train_batches": train_fraction,
        "max_steps": num_train_samples + 10,
        "accumulate_grad_batches": 10,
        "logger": False,
        "weights_summary": None,
        "progress_bar_refresh_rate": 0,
    }
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"
|
|
|
|
|
|
|
|
|
2020-02-25 20:05:41 +00:00
|
|
|
def test_benchmark_option(tmpdir):
    """Verify that ``Trainer(benchmark=True)`` switches on ``torch.backends.cudnn.benchmark``.

    The cudnn flag is process-wide state, so its value from before the test is restored afterwards
    to avoid influencing tests that run later in the same session.
    """
    model = EvalModelTemplate()
    model.val_dataloader = model.val_dataloader__multiple

    benchmark_before = torch.backends.cudnn.benchmark
    try:
        # verify torch.backends.cudnn.benchmark is not turned on
        assert not benchmark_before

        # fit model
        trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, benchmark=True)
        trainer.fit(model)

        # verify training completed
        assert trainer.state.finished, f"Training failed with {trainer.state}"

        # verify torch.backends.cudnn.benchmark is not turned off
        assert torch.backends.cudnn.benchmark
    finally:
        # undo the global side effect regardless of the test outcome
        torch.backends.cudnn.benchmark = benchmark_before
|
|
|
|
|
|
|
|
|
2021-03-11 02:46:37 +00:00
|
|
|
@pytest.mark.parametrize("ckpt_path", (None, "best", "specific"))
@pytest.mark.parametrize("save_top_k", (-1, 0, 1, 2))
@pytest.mark.parametrize("fn", ("validate", "test", "predict"))
def test_tested_checkpoint_path(tmpdir, ckpt_path, save_top_k, fn):
    """Check which checkpoint path ``validate``/``test``/``predict`` record after ``fit``.

    Covers ``ckpt_path`` being ``None``, ``"best"`` or an explicit file, for several ``save_top_k`` settings,
    calling the trainer entry point both with and without the model.
    """

    class TestModel(BoringModel):
        def validation_step(self, batch, batch_idx):
            self.log("foo", -batch_idx)
            return super().validation_step(batch, batch_idx)

        def test_step(self, *args):
            return self.validation_step(*args)

        def predict_step(self, batch, *_):
            return self(batch)

    model = TestModel()
    model.test_epoch_end = None
    checkpoint_callback = ModelCheckpoint(monitor="foo", save_top_k=save_top_k)
    trainer = Trainer(
        max_epochs=2,
        limit_val_batches=1,
        limit_test_batches=1,
        limit_predict_batches=1,
        progress_bar_refresh_rate=0,
        default_root_dir=tmpdir,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model)

    trainer_fn = getattr(trainer, fn)
    path_attr = f"{fn}{'d' if fn == 'validate' else 'ed'}_ckpt_path"

    def recorded_path():
        # path the trainer stored for the most recent `fn` call
        return getattr(trainer, path_attr)

    assert recorded_path() is None

    # call once without and once with the model passed positionally
    without_and_with_model = ((), (model,))

    if ckpt_path == "best":
        # ckpt_path is 'best', meaning we load the best weights
        if save_top_k == 0:
            for positional in without_and_with_model:
                with pytest.raises(MisconfigurationException, match=".*is not configured to save the best.*"):
                    trainer_fn(*positional, ckpt_path=ckpt_path)
        else:
            for positional in without_and_with_model:
                trainer_fn(*positional, ckpt_path=ckpt_path)
                assert recorded_path() == trainer.checkpoint_callback.best_model_path
    elif ckpt_path is None:
        # ckpt_path is None, meaning we don't load any checkpoints and use the provided model
        trainer_fn(model, ckpt_path=ckpt_path)
        assert recorded_path() is None

        if save_top_k > 0:
            # ckpt_path is None with no model provided means load the best weights
            with pytest.warns(UserWarning, match="The best model of the previous `fit` call will be used"):
                trainer_fn(ckpt_path=ckpt_path)
                assert recorded_path() == trainer.checkpoint_callback.best_model_path
    else:
        # specific checkpoint, pick one from saved ones
        if save_top_k == 0:
            with pytest.raises(FileNotFoundError):
                trainer_fn(ckpt_path="random.ckpt")
        else:
            checkpoint_dir = Path(tmpdir) / f"lightning_logs/version_{trainer.logger.version}/checkpoints"
            saved_checkpoints = list(checkpoint_dir.iterdir())
            ckpt_path = str(saved_checkpoints[0].absolute())
            for positional in without_and_with_model:
                trainer_fn(*positional, ckpt_path=ckpt_path)
                assert recorded_path() == ckpt_path
|
|
|
|
|
2020-06-15 12:02:37 +00:00
|
|
|
|
2020-11-03 06:40:35 +00:00
|
|
|
def test_disabled_training(tmpdir):
    """Verify that `limit_train_batches=0` disables the training loop unless `fast_dev_run=True`."""

    class CurrentModel(BoringModel):
        # flags flipped by the hooks below so the test can see whether they ran
        training_step_invoked = False
        training_epoch_end_invoked = False

        def training_step(self, *args, **kwargs):
            self.training_step_invoked = True
            return super().training_step(*args, **kwargs)

        def training_epoch_end(self, *args, **kwargs):
            self.training_epoch_end_invoked = True
            return super().training_epoch_end(*args, **kwargs)

    trainer_options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 2,
        "limit_train_batches": 0.0,
        "limit_val_batches": 0.2,
        "fast_dev_run": False,
    }

    model = CurrentModel()
    weights_before = deepcopy(model.state_dict())

    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    weights_after = model.state_dict()
    for name, tensor_before in weights_before.items():
        assert torch.all(torch.eq(tensor_before, weights_after[name]))

    # check that limit_train_batches=0 turns off training
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 0
    assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`"
    assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`"

    # check that limit_train_batches has no influence when fast_dev_run is turned on
    model = CurrentModel()
    trainer_options["fast_dev_run"] = True
    weights_before = deepcopy(model.state_dict())

    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    weights_after = model.state_dict()
    for name, tensor_before in weights_before.items():
        assert not torch.all(torch.eq(tensor_before, weights_after[name]))

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 0
    assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`"
    assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`"
|
|
|
|
|
|
|
|
|
2020-06-29 01:36:46 +00:00
|
|
|
def test_disabled_validation(tmpdir):
    """Verify that `limit_val_batches=0` disables the validation loop unless `fast_dev_run=True`."""

    class CurrentModel(EvalModelTemplate):
        # flags flipped by the hooks below so the test can see whether they ran
        validation_step_invoked = False
        validation_epoch_end_invoked = False

        def validation_step(self, *args, **kwargs):
            self.validation_step_invoked = True
            return super().validation_step(*args, **kwargs)

        def validation_epoch_end(self, *args, **kwargs):
            self.validation_epoch_end_invoked = True
            return super().validation_epoch_end(*args, **kwargs)

    hparams = EvalModelTemplate.get_default_hparams()
    trainer_options = {
        "default_root_dir": tmpdir,
        "progress_bar_refresh_rate": 0,
        "max_epochs": 2,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.0,
        "fast_dev_run": False,
    }

    model = CurrentModel(**hparams)
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # check that limit_val_batches=0 turns off validation
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 1
    assert not model.validation_step_invoked, "`validation_step` should not run when `limit_val_batches=0`"
    assert not model.validation_epoch_end_invoked, "`validation_epoch_end` should not run when `limit_val_batches=0`"

    # check that limit_val_batches has no influence when fast_dev_run is turned on
    model = CurrentModel(**hparams)
    trainer_options["fast_dev_run"] = True
    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 0
    assert model.validation_step_invoked, "did not run `validation_step` with `fast_dev_run=True`"
    assert model.validation_epoch_end_invoked, "did not run `validation_epoch_end` with `fast_dev_run=True`"
|
2020-03-31 12:58:46 +00:00
|
|
|
|
|
|
|
|
2021-09-02 16:35:22 +00:00
|
|
|
@mock.patch("torch.Tensor.backward")
def test_nan_loss_detection(backward_mock, tmpdir):
    """Expect ``fit`` to raise ``ValueError`` for a non-finite training loss when ``terminate_on_nan=True``."""

    class CurrentModel(BoringModel):
        # index of the batch whose loss gets turned into infinity
        test_batch_inf = 3

        def training_step(self, batch, batch_idx):
            output = super().training_step(batch, batch_idx)
            if batch_idx != self.test_batch_inf:
                return output
            if isinstance(output, dict):
                output["loss"] *= torch.tensor(math.inf)  # make loss infinite
            else:
                output /= 0
            return output

    model = CurrentModel()
    inf_batch = model.test_batch_inf

    # fit model
    trainer = Trainer(default_root_dir=tmpdir, max_steps=(inf_batch + 1), terminate_on_nan=True)

    with pytest.raises(ValueError, match=r".*The loss returned in `training_step` is.*"):
        trainer.fit(model)
        # NOTE(review): as laid out here these two assertions follow the raising call inside the
        # `pytest.raises` block, so they are skipped once `fit` raises - confirm whether they were
        # meant to be placed after the block.
        assert trainer.global_step == inf_batch
        assert backward_mock.call_count == inf_batch

    # every parameter must still hold finite values
    assert all(torch.isfinite(param).all() for param in model.parameters())
|
|
|
|
|
|
|
|
|
2021-09-02 16:35:22 +00:00
|
|
|
@mock.patch("torch.Tensor.backward")
def test_nan_params_detection(backward_mock, tmpdir):
    """Expect ``fit`` to raise ``ValueError`` when a parameter becomes NaN and ``terminate_on_nan=True``."""

    class CurrentModel(BoringModel):
        # global step at which the bias is overwritten with NaN
        test_batch_nan = 3

        def on_after_backward(self):
            if self.global_step != self.test_batch_nan:
                return
            # simulate parameter that became nan
            torch.nn.init.constant_(self.layer.bias, math.nan)

    model = CurrentModel()
    nan_batch = model.test_batch_nan
    trainer = Trainer(default_root_dir=tmpdir, max_steps=(nan_batch + 1), terminate_on_nan=True)

    with pytest.raises(ValueError, match=r".*Detected nan and/or inf values in `layer.bias`.*"):
        trainer.fit(model)
        # NOTE(review): as laid out here these two assertions follow the raising call inside the
        # `pytest.raises` block, so they are skipped once `fit` raises - confirm whether they were
        # meant to be placed after the block.
        assert trainer.global_step == nan_batch
        assert backward_mock.call_count == nan_batch + 1

    # after aborting the training loop, model still has nan-valued params
    flat_params = torch.cat([param.view(-1) for param in model.parameters()])
    assert not torch.isfinite(flat_params).all()
|
2020-04-05 15:12:41 +00:00
|
|
|
|
|
|
|
|
2021-09-01 08:49:00 +00:00
|
|
|
def test_on_exception_hook(tmpdir):
    """Test the on_exception callback hook and the trainer interrupted flag."""

    class InterruptCallback(Callback):
        """Raises from inside the training and the test run."""

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
            raise KeyboardInterrupt

        def on_test_start(self, trainer, pl_module):
            raise MisconfigurationException

    class HandleInterruptCallback(Callback):
        """Records what the exception-related hooks receive."""

        def __init__(self):
            super().__init__()
            self.exception = None
            self.exc_info = None

        def on_exception(self, trainer, pl_module, exception):
            self.exception = exception

        def on_keyboard_interrupt(self, trainer, pl_module):
            self.exc_info = sys.exc_info()

    model = BoringModel()
    raiser = InterruptCallback()
    recorder = HandleInterruptCallback()

    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[raiser, recorder],
        max_epochs=1,
        limit_train_batches=0.2,
        limit_val_batches=0.1,
        progress_bar_refresh_rate=0,
        logger=False,
    )

    # nothing has been recorded before any run
    assert not trainer.interrupted
    assert recorder.exception is None
    assert recorder.exc_info is None

    # the KeyboardInterrupt raised while fitting is not propagated to the caller here
    trainer.fit(model)
    assert trainer.interrupted
    assert isinstance(recorder.exception, KeyboardInterrupt)
    assert isinstance(recorder.exc_info[1], KeyboardInterrupt)

    # other exceptions reach the caller and are passed to `on_exception` as well
    with pytest.raises(MisconfigurationException):
        trainer.test(model)
    assert trainer.interrupted
    assert isinstance(recorder.exception, MisconfigurationException)
|
2020-04-10 01:08:28 +00:00
|
|
|
|
|
|
|
|
2021-08-13 11:10:20 +00:00
|
|
|
@pytest.mark.parametrize(
    "precision",
    [32, pytest.param(16, marks=RunIf(min_gpus=1, amp_native=True))],
)
def test_gradient_clipping_by_norm(tmpdir, precision):
    """Test that the total gradient norm is 1.0 right after backward when clipping by norm to 1.0."""
    tutils.reset_seed()

    model = EvalModelTemplate()  # TODO: when precision=16, BoringModel produces NaN, but EvalModelTemplate not
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=1,
        max_epochs=1,
        gpus=int(torch.cuda.is_available()),
        precision=precision,
        gradient_clip_algorithm="norm",
        gradient_clip_val=1.0,
    )

    optimizer_loop = trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop
    old_backward = optimizer_loop._backward

    def backward(*args, **kwargs):
        # test that gradient is clipped correctly
        ret_val = old_backward(*args, **kwargs)
        per_param_norms = [torch.norm(p.grad.detach(), 2) for p in model.parameters()]
        grad_norm = torch.norm(torch.stack(per_param_norms), 2)
        assert (grad_norm - 1.0).abs() < 0.01, f"Gradient norm != 1.0: {grad_norm}"
        return ret_val

    # swap in the checking wrapper for the loop's backward call
    optimizer_loop._backward = backward
    trainer.fit(model)
|
|
|
|
|
2020-10-03 16:33:29 +00:00
|
|
|
|
2021-08-13 11:10:20 +00:00
|
|
|
@pytest.mark.parametrize(
    "precision",
    [32, pytest.param(16, marks=RunIf(min_gpus=1, amp_native=True))],
)
def test_gradient_clipping_by_value(tmpdir, precision):
    """Test that the largest absolute gradient entry equals the clip value right after backward."""
    tutils.reset_seed()

    model = BoringModel()

    grad_clip_val = 1e-10
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=1,
        max_epochs=1,
        gpus=int(torch.cuda.is_available()),
        precision=precision,
        gradient_clip_algorithm="value",
        gradient_clip_val=grad_clip_val,
    )

    optimizer_loop = trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop
    old_backward = optimizer_loop._backward

    def backward(*args, **kwargs):
        # test that gradient is clipped correctly
        ret_val = old_backward(*args, **kwargs)
        per_param_max = [p.grad.detach().abs().max() for p in model.parameters()]
        grad_max = torch.stack(per_param_max).max()
        assert (
            abs(grad_max.item() - grad_clip_val) < 1e-11
        ), f"Gradient max value {grad_max} != grad_clip_val {grad_clip_val} ."
        return ret_val

    # swap in the checking wrapper for the loop's backward call
    optimizer_loop._backward = backward
    trainer.fit(model)
|
|
|
|
|
|
|
|
|
2020-04-10 15:45:29 +00:00
|
|
|
def test_gpu_choice(tmpdir):
    """Check ``auto_select_gpus`` with the available GPU count and with one GPU too many.

    Requesting every visible GPU must succeed; requesting one more must raise a ``RuntimeError``
    mentioning that no GPUs are available. Without CUDA the test is reported as skipped rather
    than silently passing.
    """
    # Only run if CUDA is available
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available")

    trainer_options = dict(default_root_dir=tmpdir)
    num_gpus = torch.cuda.device_count()
    Trainer(**trainer_options, gpus=num_gpus, auto_select_gpus=True)

    with pytest.raises(RuntimeError, match=r".*No GPUs available.*"):
        Trainer(**trainer_options, gpus=num_gpus + 1, auto_select_gpus=True)
|
2020-04-16 03:17:31 +00:00
|
|
|
|
|
|
|
|
2021-07-26 11:37:35 +00:00
|
|
|
@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.5, 5])
def test_num_sanity_val_steps(tmpdir, limit_val_batches):
    """Test that the number of sanity check batches is clipped to `limit_val_batches`."""
    num_sanity_val_steps = 4

    model = EvalModelTemplate()
    model.validation_step = model.validation_step__multiple_dataloaders
    model.validation_epoch_end = model.validation_epoch_end__multiple_dataloaders

    trainer = Trainer(
        default_root_dir=tmpdir,
        num_sanity_val_steps=num_sanity_val_steps,
        limit_val_batches=limit_val_batches,
        max_steps=1,
    )
    assert trainer.num_sanity_val_steps == num_sanity_val_steps

    # wrap the evaluation step so its calls are counted while still being executed
    val_epoch_loop = trainer.fit_loop.epoch_loop.val_loop.epoch_loop
    with patch.object(val_epoch_loop, "evaluation_step", wraps=val_epoch_loop.evaluation_step) as mocked:
        trainer.fit(model, val_dataloaders=model.val_dataloader__multiple_mixed_length())

        expected_calls = sum(min(num_sanity_val_steps, num_batches) for num_batches in trainer.num_val_batches)
        assert mocked.call_count == expected_calls
|
|
|
|
|
2020-08-21 18:11:31 +00:00
|
|
|
|
2021-05-12 20:10:15 +00:00
|
|
|
@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.3])
def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
    """Test that `num_sanity_val_steps=-1` runs through all validation data once.

    The number of batches is still bounded by the `limit_val_batches` Trainer argument.
    """
    model = EvalModelTemplate()
    model.validation_step = model.validation_step__multiple_dataloaders
    model.validation_epoch_end = model.validation_epoch_end__multiple_dataloaders

    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "num_sanity_val_steps": -1,
        "limit_val_batches": limit_val_batches,
        "max_steps": 1,
    }
    trainer = Trainer(**trainer_kwargs)
    assert trainer.num_sanity_val_steps == float("inf")

    # wrap the evaluation step so its calls are counted while still being executed
    val_epoch_loop = trainer.fit_loop.epoch_loop.val_loop.epoch_loop
    with patch.object(val_epoch_loop, "evaluation_step", wraps=val_epoch_loop.evaluation_step) as mocked:
        trainer.fit(model, val_dataloaders=model.val_dataloader__multiple())

        assert mocked.call_count == sum(trainer.num_val_batches)
|
|
|
|
|
2020-07-23 11:07:03 +00:00
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize(
|
|
|
|
"trainer_kwargs,expected",
|
|
|
|
[
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator=None, gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="dp", gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp", gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp", num_processes=2, gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp", num_nodes=2, gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp_cpu", num_processes=2, gpus=None),
|
2021-05-14 19:53:26 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp2", gpus=None),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=None, _device_type=DeviceType.CPU, num_gpus=0, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator=None, gpus=1),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=None, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="dp", gpus=1),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp", gpus=1),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp_cpu", num_processes=2, gpus=1),
|
2021-05-14 19:53:26 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp2", gpus=1),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=1, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator=None, gpus=2),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP_SPAWN, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="dp", gpus=2),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp", gpus=2),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.GPU, num_gpus=2, num_processes=2),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2020-12-13 15:04:16 +00:00
|
|
|
(
|
2020-12-09 08:18:23 +00:00
|
|
|
dict(accelerator="ddp2", gpus=2),
|
2021-05-12 20:10:15 +00:00
|
|
|
dict(_distrib_type=DistributedType.DDP2, _device_type=DeviceType.GPU, num_gpus=2, num_processes=1),
|
2020-10-21 18:34:29 +00:00
|
|
|
),
|
2021-08-02 11:12:28 +00:00
|
|
|
(
|
|
|
|
dict(accelerator="ddp2", num_processes=2, gpus=None),
|
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
|
|
|
|
),
|
|
|
|
(
|
|
|
|
dict(accelerator="dp", num_processes=2, gpus=None),
|
|
|
|
dict(_distrib_type=DistributedType.DDP, _device_type=DeviceType.CPU, num_gpus=0, num_processes=2),
|
|
|
|
),
|
2020-10-21 18:34:29 +00:00
|
|
|
],
|
|
|
|
)
|
2020-12-13 15:04:16 +00:00
|
|
|
def test_trainer_config(trainer_kwargs, expected, monkeypatch):
    """Check that ``Trainer(**trainer_kwargs)`` exposes the attribute values listed in ``expected``.

    When ``trainer_kwargs["gpus"]`` is not ``None``, ``torch.cuda.is_available`` and
    ``torch.cuda.device_count`` are patched so that the requested number of devices appears to exist.

    Args:
        trainer_kwargs: keyword arguments forwarded to ``Trainer``; must contain a ``"gpus"`` entry.
        expected: mapping of exactly four trainer attribute names to the value each must compare equal to.
        monkeypatch: pytest fixture used to patch the ``torch.cuda`` functions.
    """
    if trainer_kwargs["gpus"] is not None:
        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
        monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["gpus"])

    trainer = Trainer(**trainer_kwargs)

    assert len(expected) == 4
    for k, v in expected.items():
        actual = getattr(trainer, k)
        # Report the actual value as well, so a failure can be diagnosed without re-running the test.
        assert actual == v, f"Failed {k}: expected {v}, got {actual}"
|
2020-05-17 13:14:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_trainer_subclassing():
    """Check that ``Trainer`` subclasses can extend the constructor with their own arguments.

    Two ways of adding arguments are exercised: naming them explicitly in the subclass signature, and popping
    them from ``**kwargs`` before delegating to ``Trainer.__init__``. In both cases the custom values and the
    regular ``fast_dev_run`` flag must be available on the instance after a successful ``fit``. An unknown
    keyword argument must still raise a ``TypeError``.
    """
    model = EvalModelTemplate()

    # First way of pulling out args from signature is to list them
    class TrainerWithListedArgs(Trainer):
        def __init__(self, custom_arg, *args, custom_kwarg="test", **kwargs):
            super().__init__(*args, **kwargs)
            self.custom_arg = custom_arg
            self.custom_kwarg = custom_kwarg

    trainer = TrainerWithListedArgs(123, custom_kwarg="custom", fast_dev_run=True)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.custom_arg == 123
    assert trainer.custom_kwarg == "custom"
    assert trainer.fast_dev_run

    # Second way is to pop from the dict
    # It's a special case because Trainer does not have any positional args
    # (a distinct class name is used so the first subclass is not silently shadowed)
    class TrainerWithPoppedArgs(Trainer):
        def __init__(self, **kwargs):
            self.custom_arg = kwargs.pop("custom_arg", 0)
            self.custom_kwarg = kwargs.pop("custom_kwarg", "test")
            super().__init__(**kwargs)

    trainer = TrainerWithPoppedArgs(custom_kwarg="custom", fast_dev_run=True)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    # `custom_arg` was not passed, so the default given to `kwargs.pop` must be in effect
    assert trainer.custom_arg == 0
    assert trainer.custom_kwarg == "custom"
    assert trainer.fast_dev_run

    # when we pass in an unknown arg, the base class should complain
    with pytest.raises(TypeError, match=r"__init__\(\) got an unexpected keyword argument 'abcdefg'"):
        TrainerWithPoppedArgs(abcdefg="unknown_arg")
|
2020-06-09 20:51:30 +00:00
|
|
|
|
|
|
|
|
2020-10-21 18:34:29 +00:00
|
|
|
@pytest.mark.parametrize(
    "trainer_params",
    [OmegaConf.create(dict(max_epochs=1, gpus=gpus)) for gpus in (1, [0])],
)
@RunIf(min_gpus=1)
def test_trainer_omegaconf(trainer_params):
    """Check that a ``Trainer`` can be constructed from an OmegaConf config unpacked as keyword arguments."""
    Trainer(**trainer_params)
|
|
|
|
|
|
|
|
|
2020-06-09 20:51:30 +00:00
|
|
|
def test_trainer_pickle(tmpdir):
    """Check that a newly constructed ``Trainer`` can be dumped with ``pickle`` and then ``cloudpickle``."""
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)
    for serializer in (pickle, cloudpickle):
        serializer.dumps(trainer)
|
2020-07-25 03:57:31 +00:00
|
|
|
|
|
|
|
|
2021-03-11 02:46:37 +00:00
|
|
|
@pytest.mark.parametrize("stage", ("fit", "validate", "test"))
def test_trainer_setup_call(tmpdir, stage):
    """Test that the ``setup`` hooks of the model and of a callback receive the stage being run."""

    class StageRecordingModel(BoringModel):
        def setup(self, stage):
            # remember the stage so it can be compared after the run
            self.stage = stage

    class StageRecordingCallback(Callback):
        def setup(self, trainer, model, stage):
            assert model is not None
            self.stage = stage

    model = StageRecordingModel()
    callback = StageRecordingCallback()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        checkpoint_callback=False,
        callbacks=[callback],
    )

    # "fit" and "validate" map to their entry points; anything else runs `test`
    entry_points = {"fit": trainer.fit, "validate": trainer.validate}
    run_stage = entry_points.get(stage, trainer.test)
    run_stage(model)

    assert callback.stage == stage
    assert model.stage == stage
|
2020-09-30 10:26:27 +00:00
|
|
|
|
|
|
|
|
2021-07-26 11:37:35 +00:00
|
|
|
@pytest.mark.parametrize("train_batches, max_steps, log_interval", [(10, 10, 1), (3, 10, 1), (3, 10, 5)])
@patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
    """Check the steps at which the patched TensorBoard ``log_metrics`` is called during ``fit``.

    The model logs ``"foo"`` in every training step. After fitting, the mock must contain calls with
    ``step`` equal to ``log_interval - 1``, then every ``log_interval`` steps after that, below ``max_steps``.
    """

    class FooLoggingModel(BoringModel):
        def training_step(self, *args, **kwargs):
            self.log("foo", -1)
            return super().training_step(*args, **kwargs)

    model = FooLoggingModel()
    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        log_every_n_steps=log_interval,
        flush_logs_every_n_steps=log_interval,
        limit_train_batches=train_batches,
        limit_val_batches=0,
        max_steps=max_steps,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    logged_steps = range(log_interval - 1, max_steps, log_interval)
    expected_calls = [call(metrics=ANY, step=step) for step in logged_steps]
    log_metrics_mock.assert_has_calls(expected_calls)
|
2020-10-27 10:57:16 +00:00
|
|
|
|
|
|
|
|
2021-01-27 16:38:14 +00:00
|
|
|
class TestLightningDataModule(LightningDataModule):
    """Data module that serves the same pre-built dataloaders for both testing and prediction."""

    def __init__(self, dataloaders):
        """Store ``dataloaders`` so the dataloader hooks below can return them unchanged."""
        super().__init__()
        self._dataloaders = dataloaders

    def test_dataloader(self):
        """Return the dataloaders passed to the constructor."""
        return self._dataloaders

    def predict_dataloader(self):
        """Return the dataloaders passed to the constructor."""
        return self._dataloaders
|
|
|
|
|
2021-01-27 16:38:14 +00:00
|
|
|
|
2021-04-27 20:23:55 +00:00
|
|
|
class CustomPredictionWriter(BasePredictionWriter):
    """Prediction writer that checks the inputs of its write hooks and records which of them ran."""

    # Class-level defaults; a hook sets the matching flag on the instance once it has run.
    write_on_batch_end_called = False
    write_on_epoch_end_called = False

    def __init__(self, output_dir: str, *args, **kwargs):
        """Keep ``output_dir`` on the instance and forward everything else to the base writer."""
        super().__init__(*args, **kwargs)
        self.output_dir = output_dir

    def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *args, **kwargs):
        """Assert a ``[1, 2]`` prediction with a single batch-index entry, then flag the hook as called."""
        assert prediction.shape == torch.Size([1, 2])
        assert len(batch_indices) == 1
        self.write_on_batch_end_called = True

    def write_on_epoch_end(self, trainer, pl_module, predictions, batch_indices):
        """Assert two entries in both arguments, each first entry of the expected length, then flag the hook."""
        # the first entry must hold one item when the run is distributed, otherwise two
        if trainer.accelerator_connector.is_distributed:
            per_loader = 1
        else:
            per_loader = 2
        assert len(predictions) == 2
        assert len(predictions[0]) == per_loader
        assert len(batch_indices) == 2
        assert len(batch_indices[0]) == per_loader
        self.write_on_epoch_end_called = True

    def on_predict_epoch_end(self, trainer, pl_module, outputs):
        """Check the sampler types of the two predict dataloaders when distributed, then defer to the base hook."""
        if trainer.accelerator_connector.is_distributed:
            for loader_idx in range(2):
                batch_sampler = trainer.predict_dataloaders[loader_idx].batch_sampler
                assert isinstance(batch_sampler.sampler, UnrepeatedDistributedSampler)
                assert isinstance(batch_sampler, IndexBatchSamplerWrapper)
        super().on_predict_epoch_end(trainer, pl_module, outputs)
|
|
|
|
|
2021-01-27 16:38:14 +00:00
|
|
|
|
2021-04-27 20:23:55 +00:00
|
|
|
def predict(
    tmpdir, accelerator, gpus, num_processes, model=None, plugins=None, datamodule=True, pbrr=None, use_callbacks=True
):
    """Run ``Trainer.predict`` on two small dataloaders and check the prediction writers and the results.

    Args:
        tmpdir: used as the trainer's ``default_root_dir`` and as the writers' output directory.
        accelerator: forwarded to ``Trainer``; ``"ddp_spawn"`` additionally checks that
            ``return_predictions=True`` raises a ``MisconfigurationException``.
        gpus: forwarded to ``Trainer``.
        num_processes: forwarded to ``Trainer``.
        model: module to predict with; a ``BoringModel`` is created when falsy.
        plugins: forwarded to ``Trainer``.
        datamodule: when truthy, the dataloaders are supplied through a ``TestLightningDataModule``;
            otherwise they are passed directly via ``dataloaders=``.
        pbrr: forwarded to ``Trainer`` as ``progress_bar_refresh_rate``.
        use_callbacks: when truthy, a batch-interval and an epoch-interval ``CustomPredictionWriter``
            are attached and their call flags are verified.
    """
    dataloaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)) for _ in range(2)]

    if not model:
        model = BoringModel()
    dm = TestLightningDataModule(dataloaders)

    batch_writer = CustomPredictionWriter(tmpdir, write_interval="batch")
    epoch_writer = CustomPredictionWriter(tmpdir, write_interval="epoch")
    callbacks = [batch_writer, epoch_writer] if use_callbacks else []

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        log_every_n_steps=1,
        weights_summary=None,
        accelerator=accelerator,
        gpus=gpus,
        num_processes=num_processes,
        plugins=plugins,
        progress_bar_refresh_rate=pbrr,
        callbacks=callbacks,
    )

    if accelerator == "ddp_spawn":
        with pytest.raises(MisconfigurationException):
            trainer.predict(model, datamodule=dm, return_predictions=True)

    predict_kwargs = dict(datamodule=dm) if datamodule else dict(dataloaders=dataloaders)
    results = trainer.predict(model, **predict_kwargs)

    # the remaining checks are skipped when the trainer ended up with a spawn plugin
    if isinstance(trainer.training_type_plugin, DDPSpawnPlugin):
        return

    if use_callbacks:
        # each writer must only have had the hook matching its `write_interval` called
        assert batch_writer.write_on_batch_end_called
        assert not batch_writer.write_on_epoch_end_called

        assert not epoch_writer.write_on_batch_end_called
        assert epoch_writer.write_on_epoch_end_called

    if accelerator == "ddp":
        num_samples = 1
    else:
        num_samples = 2
    assert len(results) == 2
    assert len(results[0]) == num_samples
    assert results[0][0].shape == torch.Size([1, 2])
|
2021-01-27 16:38:14 +00:00
|
|
|
|
|
|
|
|
2021-03-06 17:15:21 +00:00
|
|
|
def test_trainer_predict_no_return(tmpdir):
    """Test that ``trainer.predict`` emits a ``UserWarning`` when ``predict_step`` returns ``None``."""

    class EveryOtherBatchModel(BoringModel):
        def predict_step(self, batch, batch_idx, dataloader_idx=None):
            # odd batch indices produce no prediction
            if batch_idx % 2 == 1:
                return None
            return super().predict_step(batch, batch_idx, dataloader_idx)

    model = EveryOtherBatchModel()
    with pytest.warns(UserWarning, match="predict returned None"):
        predict(tmpdir, None, None, 1, model=model, use_callbacks=False)
|
2021-03-06 17:15:21 +00:00
|
|
|
|
|
|
|
|
2021-03-23 22:07:48 +00:00
|
|
|
def test_trainer_predict_grad(tmpdir):
    """Check that an op on the batch inside ``predict_step`` has no ``grad_fn``.

    Afterwards the same op is applied to a ``requires_grad`` tensor outside of prediction, where it must
    have a ``grad_fn`` again.
    """

    class GradCheckingModel(BoringModel):
        def predict_step(self, batch, batch_idx, dataloader_idx=None):
            expanded = batch.expand_as(batch)
            assert expanded.grad_fn is None
            return super().predict_step(batch, batch_idx, dataloader_idx)

    predict(tmpdir, None, None, 1, model=GradCheckingModel(), use_callbacks=False)

    probe = torch.zeros(1, requires_grad=True)
    assert probe.expand_as(probe).grad_fn is not None
|
|
|
|
|
|
|
|
|
2021-07-26 11:37:35 +00:00
|
|
|
@pytest.mark.parametrize("progress_bar_refresh_rate", [0, 5, None])
@pytest.mark.parametrize("datamodule", [False, True])
def test_trainer_predict_cpu(tmpdir, datamodule, progress_bar_refresh_rate):
    """Run the shared ``predict`` checks with one process, with and without a datamodule."""
    predict(
        tmpdir,
        accelerator=None,
        gpus=None,
        num_processes=1,
        datamodule=datamodule,
        pbrr=progress_bar_refresh_rate,
    )
|
2021-01-27 16:38:14 +00:00
|
|
|
|
|
|
|
|
2021-03-02 18:57:13 +00:00
|
|
|
@RunIf(min_gpus=2, special=True)
@pytest.mark.parametrize("num_gpus", [1, 2])
def test_trainer_predict_dp(tmpdir, num_gpus):
    """Run the shared ``predict`` checks with the ``"dp"`` accelerator on ``num_gpus`` GPUs."""
    predict(tmpdir, accelerator="dp", gpus=num_gpus, num_processes=None)
|
|
|
|
|
|
|
|
|
2021-03-30 17:39:02 +00:00
|
|
|
@RunIf(min_gpus=2, special=True, fairscale=True)
|
PoC: Accelerator refactor (#5743)
* restoring the result from subprocess
* fix queue.get() order for results
* add missing "block_backward_sync" context manager
* add missing "block_backward_sync" context manager
* fix sync_batchnorm
* fix supported gpu-ids for tuple
* fix clip gradients and inf recursion
* accelerator selection: added cluster_environment plugin
* fix torchelastic test
* fix reduce early stopping decision for DDP
* fix tests: callbacks, conversion to lightning optimizer
* fix lightning optimizer does not pickle
* fix setting benchmark and deterministic option
* fix slurm amp test
* fix prepare_data test and determine node_rank
* fix retrieving last path when testing
* remove obsolete plugin argument
* fix test: test_trainer_config
* fix torchscript tests
* fix trainer.model access
* move properties
* fix test_transfer_batch_hook
* fix auto_select_gpus
* fix omegaconf test
* fix test that needs to simulate slurm ddp
* add horovod plugin
* fix test with named arguments
* clean up whitespace
* fix datamodules test
* remove old accelerators
* fix naming
* move old plugins
* move to plugins
* create precision subpackage
* create training_type subpackage
* fix all new import errors
* fix wrong arguments order passed to test
* fix LR finder
* Added sharded training type and amp plugin
* Move clip grad to precision plugin
* Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically
* Fix import issue, attempting to fix tests
* Fix initial test
* Reflect hook logic from master, should wrap model after move to device
* Optional state consolidation, since master has optimizers not wrapped
* change attribute for instance test
* reset optimizers
optimizers are not used in main process, so state would be wrong.
* legacy
* imports in accel
* legacy2
* trainer imports
* fix import errors after rebase
* move hook to new setup location
* provide unwrapping logic
* fix trainer callback system
* added ddp2 implementation
* fix imports .legacy
* move plugins
* restore legacy
* drop test.py from root
* add tpu accelerator and plugins
* fixes
* fix lightning optimizer merge
* reset bugreportmodel
* unwrapping
* step routing forward
* model access
* unwrap
* opt
* integrate distrib_type
* sync changes
* sync
* fixes
* add forgotten generators
* add missing logic
* update
* import
* missed imports
* import fixes
* isort
* mv f
* changelog
* format
* move helper to parallel plugin
* d
* add world size
* clean up
* duplicate
* activate ddp_sharded and tpu
* set nvidia flags
* remove unused colab var
* use_tpu <-> on_tpu attrs
* make some ddp_cpu and clusterplugin tests pass
* Ref/accelerator connector (#5742)
* final cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* connector cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* trainer cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* accelerator cleanup + missing logic in accelerator connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add missing changes to callbacks
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* reflect accelerator changes to lightning module
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* clean cluster envs
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* cleanup plugins
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add broadcasting
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* yapf
* remove plugin connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* plugins
* manual optimization
* update optimizer routing
* add rank to torchelastic
* fix memory mixed precision
* setstate on trainer for pickling in ddp spawn
* add predict method
* add back commented accelerator code
* adapt test for sync_batch_norm to new plugin
* fix deprecated tests
* fix ddp cpu choice when no num_processes are given
* yapf format
* skip a memory test that cannot pass anymore
* fix pickle error in spawn plugin
* x
* avoid
* x
* fix cyclic import in docs build
* add support for sharded
* update typing
* add sharded and sharded_spawn to distributed types
* make unwrap model default
* refactor LightningShardedDataParallel similar to LightningDistributedDataParallel
* update sharded spawn to reflect changes
* update sharded to reflect changes
* Merge 1.1.5 changes
* fix merge
* fix merge
* yapf isort
* fix merge
* yapf isort
* fix indentation in test
* copy over reinit scheduler implementation from dev1.2
* fix apex tracking calls with dev_debugger
* reduce diff to dev1.2, clean up
* fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu
* sort plugin tests legacy/new
* fix error handling for amp on cpu
* fix merge
fix merge
fix merge
* [Feat] Resolve manual_backward (#5837)
* resolve manual_backward
* resolve flake8
* update
* resolve for ddp_spawn
* resolve flake8
* resolve flake8
* resolve flake8
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* fix tests/accelerator tests on cpu
* [BugFix] Resolve manual optimization (#5852)
* resolve manual_optimization
* update
* update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856)
* resovle a bug
* Accelerator refactor sharded rpc (#5854)
* rpc branch
* merge
* update handling of rpc
* make devices etc. Optional in RPC
* set devices etc. later if necessary
* remove devices from sequential
* make devices optional in rpc
* fix import
* uncomment everything
* fix cluster selection
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* resolve bug
* fix assert in rpc test
* resolve a test
* fix docs compilation
* accelerator refactor - fix for sharded parity test (#5866)
* fix memory issue with ddp_spawn
* x
x
x
x
x
x
x
x
x
* x
* Remove DDP2 as this does not apply
* Add missing pre optimizer hook to ensure lambda closure is called
* fix apex docstring
* [accelerator][BugFix] Resolve some test for 1 gpu (#5863)
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* update
* resolve flake8
* update
* update
* update
* update
* update
* all_gather
* update
* make plugins work, add misconfig for RPC
* update
* update
* remove breaking test
* resolve some tests
* resolve flake8
* revert to ddp_spawn
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de>
* yapf isort
* resolve flake8
* fix apex doctests
* fix apex doctests 2
* resolve docs
* update drone
* clean env
* update
* update
* update
* update
* merge
* Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881)
* Fix RPC related tests, clean out old API, update for new accelerator API
* Move tests out of legacy folder, update paths and names
* Update test_remove_1-4.py
* Expose properties for tpu cores/gpus/num_gpus
* Add root GPU property
* Move properties to properties.py
* move tests that were previously in drone
* Fix root GPU property (#5908)
* Move root GPU to property, remove horovod set as this is handled in horovod plugin, ensure we mock correctly to set GPU accelerator
* Add missing tests back
* fix best model path transfer when no checkpoint callback available
* Fix setup hook order [wip] (#5858)
* Call trainer setup hook before accelerator setup
* Add test case
* add new test
* typo
* fix callback order in test
Co-authored-by: tchaton <thomas@grid.ai>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* rename ddp sequential -> rpc sequential for special test
* revert
* fix stupid merge problem
* Use property in connector for sampler (#5913)
* merge the import conflicts
* fix spawning of processes in slurm
* [wip] Fix some bugs for TPU [skip ci] (#5878)
* fixed for single tpu
* fixed spawn
* fixed spawn
* update
* update
* wip
* resolve bugs
* resolve bug
* update on comment
* removed decorator
* resolve comments
* set to 4
* update
* update
* need cleaning
* update
* update
* update
* resolve flake8
* resolve bugs
* exclude broadcast
* resolve bugs
* change test
* update
* update
* skip if meet fails
* properly raise trace
* update
* add catch
* wrap test
* resolve typo
* update
* typo
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
* resolve some tests
* update
* fix imports
* update
* resolve flake8
* update azure pipeline
* skip a sharded test on cpu that requires a gpu
* resolve tpus
* resolve bug
* resolve flake8
* update
* updat utils
* revert permission change on files
* suggestions from carlos
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting changes
* remove incomplete comment
* Update pytorch_lightning/accelerators/__init__.py
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting change
* add types
* warn 1.7 ddp manual backward only if ddp kwarg unset
* yapf + isort
* pep8 unused imports
* fix cyclic import in docs
* Apply suggestions from code review
* typer in accelerator.py
* typo
* Apply suggestions from code review
* formatting
* update on comments
* update typo
* Update pytorch_lightning/trainer/properties.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* update
* suggestion from code review
* suggestion from code review
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: SeanNaren <sean@grid.ai>
Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-12 20:48:56 +00:00
|
|
|
def test_trainer_predict_ddp(tmpdir):
    """Run the shared ``predict`` checks with the ``"ddp"`` accelerator on two GPUs."""
    predict(tmpdir, accelerator="ddp", gpus=2, num_processes=None)
|
2021-01-27 16:38:14 +00:00
|
|
|
|
|
|
|
|
2021-03-02 18:57:13 +00:00
|
|
|
@RunIf(min_gpus=2, skip_windows=True, special=True)
def test_trainer_predict_ddp_spawn(tmpdir):
    """Run the shared ``predict`` checks with the ``"ddp_spawn"`` accelerator on two GPUs."""
    predict(tmpdir, accelerator="ddp_spawn", gpus=2, num_processes=None)
|
|
|
|
|
|
|
|
|
2021-03-02 18:57:13 +00:00
|
|
|
# NOTE(review): gated on `min_gpus=2` although only one GPU is requested below; confirm this is intentional
@RunIf(min_gpus=2, special=True)
def test_trainer_predict_1_gpu(tmpdir):
    """Run the shared ``predict`` checks on a single GPU without an explicit accelerator."""
    predict(tmpdir, accelerator=None, gpus=1, num_processes=None)
|
|
|
|
|
|
|
|
|
2021-04-27 12:46:45 +00:00
|
|
|
@RunIf(skip_windows=True)
def test_trainer_predict_ddp_cpu(tmpdir):
    """Run the shared ``predict`` checks with the ``"ddp_cpu"`` accelerator and two processes."""
    predict(tmpdir, accelerator="ddp_cpu", gpus=0, num_processes=2)
|
|
|
|
|
|
|
|
|
2021-09-14 14:40:19 +00:00
|
|
|
@pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset])
def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir):
    """Check the ``batch_indices`` a prediction writer receives for map-style and iterable datasets.

    For an ``IterableDataset`` the writer must receive falsy ``batch_indices``; for any other dataset they must
    be truthy. In both cases ``trainer.predict`` must return eight predictions.
    """
    ds = dataset_cls(32, 8)
    loader = DataLoader(ds)
    is_iterable_dataset = isinstance(ds, IterableDataset)

    class BatchIndicesCheckingWriter(BasePredictionWriter):
        def __init__(self, output_dir: str, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.output_dir = output_dir

        def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *args, **kwargs):
            if is_iterable_dataset:
                assert not batch_indices
            else:
                assert batch_indices

    cb = BatchIndicesCheckingWriter(tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, callbacks=cb)
    predictions = trainer.predict(BoringModel(), dataloaders=loader)
    assert len(predictions) == 8
|
|
|
|
|
|
|
|
|
2021-07-26 11:37:35 +00:00
|
|
|
@patch("torch.cuda.device_count", return_value=2)
@patch("torch.cuda.is_available", return_value=True)
def test_spawn_predict_return_predictions(*_):
    """Test that `return_predictions=True` raises a MisconfigurationException with spawn training type plugins."""
    model = BoringModel()

    spawn_configs = (
        dict(accelerator="ddp_spawn", gpus=2),
        dict(accelerator="ddp_cpu", num_processes=2),
    )
    for trainer_kwargs in spawn_configs:
        trainer = Trainer(**trainer_kwargs, fast_dev_run=True)
        # both configurations are expected to select the spawn plugin
        assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
        with pytest.raises(MisconfigurationException, match="`return_predictions` should be set to `False`"):
            trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("return_predictions", [None, False, True])
@pytest.mark.parametrize("precision", [32, 64])
def test_predict_return_predictions_cpu(return_predictions, precision, tmpdir):
    """Test the predictions returned by ``trainer.predict`` for each ``return_predictions`` value.

    When ``return_predictions`` is ``True`` or ``None``, a single ``[1, 2]`` prediction whose dtype matches
    the requested precision is expected. Nothing is checked for ``False``.
    """
    seed_everything(42)
    model = BoringModel()

    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, precision=precision)
    preds = trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=return_predictions)

    if not return_predictions and return_predictions is not None:
        return

    if precision == 64:
        expected_dtype = torch.float64
    else:
        expected_dtype = torch.float32
    assert len(preds) == 1
    first = preds[0]
    assert first.shape == torch.Size([1, 2])
    assert first.dtype == expected_dtype
|
|
|
|
|
|
|
|
|
2021-02-03 07:24:46 +00:00
|
|
|
@pytest.mark.parametrize(
    ["limit_train_batches", "global_step", "num_training_batches", "current_epoch", "should_train"],
    [(0.2, 0, 0, 0, False), (0.5, 10, 2, 4, True)],
)
def test_disabled_training_for_insufficient_limit_train_batches(
    tmpdir, limit_train_batches, global_step, num_training_batches, current_epoch, should_train
):
    """Verify that the training loop is disabled when a float ``limit_train_batches`` leaves no batches.

    That is the case when ``limit_train_batches`` is a float in ``[0.0, 1.0]`` and
    ``int(self.num_training_batches * self.limit_train_batches) == 0``. The test fits a model on a
    100-sample dataset with batch size 25 and compares the trainer's counters, and whether the training
    hooks ran, against the parametrized expectations.
    """

    class HookTrackingModel(BoringModel):
        # flipped to True on the instance by the corresponding hook
        training_step_invoked = False
        training_epoch_end_invoked = False

        def training_step(self, *args, **kwargs):
            self.training_step_invoked = True
            return super().training_step(*args, **kwargs)

        def training_epoch_end(self, *args, **kwargs):
            self.training_epoch_end_invoked = True
            return super().training_epoch_end(*args, **kwargs)

    dataset_len = 100
    batch_size = 25

    train_loader = DataLoader(RandomDataset(32, length=dataset_len), batch_size=batch_size)

    model = HookTrackingModel()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=5, limit_train_batches=limit_train_batches)
    trainer.fit(model, train_loader)

    params_string = (
        f"`limit_train_batches={limit_train_batches}`, `dataset_len={dataset_len}`\n"
        f"& `batch_size={batch_size}` as\n"
        f"`num_training_batches={num_training_batches}`"
    )
    expectation = "should run" if should_train else "should not run"
    error_string = f"{expectation} with {params_string}"

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == global_step
    assert trainer.num_training_batches == num_training_batches
    assert trainer.current_epoch == current_epoch
    assert model.training_step_invoked == should_train, f"`training_step` {error_string}"
    assert model.training_epoch_end_invoked == should_train, f"`training_epoch_end` {error_string}"
|
PoC: Accelerator refactor (#5743)
* restoring the result from subprocess
* fix queue.get() order for results
* add missing "block_backward_sync" context manager
* add missing "block_backward_sync" context manager
* fix sync_batchnorm
* fix supported gpu-ids for tuple
* fix clip gradients and inf recursion
* accelerator selection: added cluster_environment plugin
* fix torchelastic test
* fix reduce early stopping decision for DDP
* fix tests: callbacks, conversion to lightning optimizer
* fix lightning optimizer does not pickle
* fix setting benchmark and deterministic option
* fix slurm amp test
* fix prepare_data test and determine node_rank
* fix retrieving last path when testing
* remove obsolete plugin argument
* fix test: test_trainer_config
* fix torchscript tests
* fix trainer.model access
* move properties
* fix test_transfer_batch_hook
* fix auto_select_gpus
* fix omegaconf test
* fix test that needs to simulate slurm ddp
* add horovod plugin
* fix test with named arguments
* clean up whitespace
* fix datamodules test
* remove old accelerators
* fix naming
* move old plugins
* move to plugins
* create precision subpackage
* create training_type subpackage
* fix all new import errors
* fix wrong arguments order passed to test
* fix LR finder
* Added sharded training type and amp plugin
* Move clip grad to precision plugin
* Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically
* Fix import issue, attempting to fix tests
* Fix initial test
* Reflect hook logic from master, should wrap model after move to device
* Optional state consolidation, since master has optimizers not wrapped
* change attribute for instance test
* reset optimizers
optimizers are not used in main process, so state would be wrong.
* legacy
* imports in accel
* legacy2
* trainer imports
* fix import errors after rebase
* move hook to new setup location
* provide unwrapping logic
* fix trainer callback system
* added ddp2 implementation
* fix imports .legacy
* move plugins
* restore legacy
* drop test.py from root
* add tpu accelerator and plugins
* fixes
* fix lightning optimizer merge
* reset bugreportmodel
* unwrapping
* step routing forward
* model access
* unwrap
* opt
* integrate distrib_type
* sync changes
* sync
* fixes
* add forgotten generators
* add missing logic
* update
* import
* missed imports
* import fixes
* isort
* mv f
* changelog
* format
* move helper to parallel plugin
* d
* add world size
* clean up
* duplicate
* activate ddp_sharded and tpu
* set nvidia flags
* remove unused colab var
* use_tpu <-> on_tpu attrs
* make some ddp_cpu and clusterplugin tests pass
* Ref/accelerator connector (#5742)
* final cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* connector cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* trainer cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* accelerator cleanup + missing logic in accelerator connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add missing changes to callbacks
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* reflect accelerator changes to lightning module
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* clean cluster envs
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* cleanup plugins
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add broadcasting
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* yapf
* remove plugin connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* plugins
* manual optimization
* update optimizer routing
* add rank to torchelastic
* fix memory mixed precision
* setstate on trainer for pickling in ddp spawn
* add predict method
* add back commented accelerator code
* adapt test for sync_batch_norm to new plugin
* fix deprecated tests
* fix ddp cpu choice when no num_processes are given
* yapf format
* skip a memory test that cannot pass anymore
* fix pickle error in spawn plugin
* x
* avoid
* x
* fix cyclic import in docs build
* add support for sharded
* update typing
* add sharded and sharded_spawn to distributed types
* make unwrap model default
* refactor LightningShardedDataParallel similar to LightningDistributedDataParallel
* update sharded spawn to reflect changes
* update sharded to reflect changes
* Merge 1.1.5 changes
* fix merge
* fix merge
* yapf isort
* fix merge
* yapf isort
* fix indentation in test
* copy over reinit scheduler implementation from dev1.2
* fix apex tracking calls with dev_debugger
* reduce diff to dev1.2, clean up
* fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu
* sort plugin tests legacy/new
* fix error handling for amp on cpu
* fix merge
fix merge
fix merge
* [Feat] Resolve manual_backward (#5837)
* resolve manual_backward
* resolve flake8
* update
* resolve for ddp_spawn
* resolve flake8
* resolve flake8
* resolve flake8
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* fix tests/accelerator tests on cpu
* [BugFix] Resolve manual optimization (#5852)
* resolve manual_optimization
* update
* update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856)
* resovle a bug
* Accelerator refactor sharded rpc (#5854)
* rpc branch
* merge
* update handling of rpc
* make devices etc. Optional in RPC
* set devices etc. later if necessary
* remove devices from sequential
* make devices optional in rpc
* fix import
* uncomment everything
* fix cluster selection
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* resolve bug
* fix assert in rpc test
* resolve a test
* fix docs compilation
* accelerator refactor - fix for sharded parity test (#5866)
* fix memory issue with ddp_spawn
* x
x
x
x
x
x
x
x
x
* x
* Remove DDP2 as this does not apply
* Add missing pre optimizer hook to ensure lambda closure is called
* fix apex docstring
* [accelerator][BugFix] Resolve some test for 1 gpu (#5863)
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* update
* resolve flake8
* update
* update
* update
* update
* update
* all_gather
* update
* make plugins work, add misconfig for RPC
* update
* update
* remove breaking test
* resolve some tests
* resolve flake8
* revert to ddp_spawn
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de>
* yapf isort
* resolve flake8
* fix apex doctests
* fix apex doctests 2
* resolve docs
* update drone
* clean env
* update
* update
* update
* update
* merge
* Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881)
* Fix RPC related tests, clean out old API, update for new accelerator API
* Move tests out of legacy folder, update paths and names
* Update test_remove_1-4.py
* Expose properties for tpu cores/gpus/num_gpus
* Add root GPU property
* Move properties to properties.py
* move tests that were previously in drone
* Fix root GPU property (#5908)
* Move root GPU to property, remove horovod set as this is handled in horovod plugin, ensure we mock correctly to set GPU accelerator
* Add missing tests back
* fix best model path transfer when no checkpoint callback available
* Fix setup hook order [wip] (#5858)
* Call trainer setup hook before accelerator setup
* Add test case
* add new test
* typo
* fix callback order in test
Co-authored-by: tchaton <thomas@grid.ai>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* rename ddp sequential -> rpc sequential for special test
* revert
* fix stupid merge problem
* Use property in connector for sampler (#5913)
* merge the import conflicts
* fix spawning of processes in slurm
* [wip] Fix some bugs for TPU [skip ci] (#5878)
* fixed for single tpu
* fixed spawn
* fixed spawn
* update
* update
* wip
* resolve bugs
* resolve bug
* update on comment
* removed decorator
* resolve comments
* set to 4
* update
* update
* need cleaning
* update
* update
* update
* resolve flake8
* resolve bugs
* exclude broadcast
* resolve bugs
* change test
* update
* update
* skip if meet fails
* properly raise trace
* update
* add catch
* wrap test
* resolve typo
* update
* typo
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
* resolve some tests
* update
* fix imports
* update
* resolve flake8
* update azure pipeline
* skip a sharded test on cpu that requires a gpu
* resolve tpus
* resolve bug
* resolve flake8
* update
* updat utils
* revert permission change on files
* suggestions from carlos
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting changes
* remove incomplete comment
* Update pytorch_lightning/accelerators/__init__.py
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting change
* add types
* warn 1.7 ddp manual backward only if ddp kwarg unset
* yapf + isort
* pep8 unused imports
* fix cyclic import in docs
* Apply suggestions from code review
* typer in accelerator.py
* typo
* Apply suggestions from code review
* formatting
* update on comments
* update typo
* Update pytorch_lightning/trainer/properties.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* update
* suggestion from code review
* suggestion from code review
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: SeanNaren <sean@grid.ai>
Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-12 20:48:56 +00:00
|
|
|
|
|
|
|
|
2021-02-13 07:36:22 +00:00
|
|
|
@pytest.mark.parametrize(["max_steps", "max_epochs", "global_step"], [(10, 5, 10), (20, None, 20)])
def test_repeated_fit_calls_with_max_epochs_and_steps(tmpdir, max_steps, max_epochs, global_step):
    """Check that ``max_steps`` and ``max_epochs`` still bound training when ``trainer.fit`` is called twice.

    After the first call ``trainer.global_step`` must equal the expected value, and a second call on the same
    trainer must leave it unchanged.
    """
    dataset_len = 200
    batch_size = 10
    dataset = RandomDataset(32, dataset_len)
    train_data = DataLoader(dataset, batch_size=batch_size)

    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, max_steps=max_steps, max_epochs=max_epochs)

    # First pass trains up to the limit; the second pass must not advance the step count.
    for _ in range(2):
        trainer.fit(model, train_data)
        assert trainer.global_step == global_step
|
|
|
|
|
|
|
|
|
PoC: Accelerator refactor (#5743)
* restoring the result from subprocess
* fix queue.get() order for results
* add missing "block_backward_sync" context manager
* add missing "block_backward_sync" context manager
* fix sync_batchnorm
* fix supported gpu-ids for tuple
* fix clip gradients and inf recursion
* accelerator selection: added cluster_environment plugin
* fix torchelastic test
* fix reduce early stopping decision for DDP
* fix tests: callbacks, conversion to lightning optimizer
* fix lightning optimizer does not pickle
* fix setting benchmark and deterministic option
* fix slurm amp test
* fix prepare_data test and determine node_rank
* fix retrieving last path when testing
* remove obsolete plugin argument
* fix test: test_trainer_config
* fix torchscript tests
* fix trainer.model access
* move properties
* fix test_transfer_batch_hook
* fix auto_select_gpus
* fix omegaconf test
* fix test that needs to simulate slurm ddp
* add horovod plugin
* fix test with named arguments
* clean up whitespace
* fix datamodules test
* remove old accelerators
* fix naming
* move old plugins
* move to plugins
* create precision subpackage
* create training_type subpackage
* fix all new import errors
* fix wrong arguments order passed to test
* fix LR finder
* Added sharded training type and amp plugin
* Move clip grad to precision plugin
* Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically
* Fix import issue, attempting to fix tests
* Fix initial test
* Reflect hook logic from master, should wrap model after move to device
* Optional state consolidation, since master has optimizers not wrapped
* change attribute for instance test
* reset optimizers
optimizers are not used in main process, so state would be wrong.
* legacy
* imports in accel
* legacy2
* trainer imports
* fix import errors after rebase
* move hook to new setup location
* provide unwrapping logic
* fix trainer callback system
* added ddp2 implementation
* fix imports .legacy
* move plugins
* restore legacy
* drop test.py from root
* add tpu accelerator and plugins
* fixes
* fix lightning optimizer merge
* reset bugreportmodel
* unwrapping
* step routing forward
* model access
* unwrap
* opt
* integrate distrib_type
* sync changes
* sync
* fixes
* add forgotten generators
* add missing logic
* update
* import
* missed imports
* import fixes
* isort
* mv f
* changelog
* format
* move helper to parallel plugin
* d
* add world size
* clean up
* duplicate
* activate ddp_sharded and tpu
* set nvidia flags
* remove unused colab var
* use_tpu <-> on_tpu attrs
* make some ddp_cpu and clusterplugin tests pass
* Ref/accelerator connector (#5742)
* final cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* connector cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* trainer cleanup
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* accelerator cleanup + missing logic in accelerator connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add missing changes to callbacks
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* reflect accelerator changes to lightning module
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* clean cluster envs
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* cleanup plugins
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* add broadcasting
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* yapf
* remove plugin connector
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* plugins
* manual optimization
* update optimizer routing
* add rank to torchelastic
* fix memory mixed precision
* setstate on trainer for pickling in ddp spawn
* add predict method
* add back commented accelerator code
* adapt test for sync_batch_norm to new plugin
* fix deprecated tests
* fix ddp cpu choice when no num_processes are given
* yapf format
* skip a memory test that cannot pass anymore
* fix pickle error in spawn plugin
* x
* avoid
* x
* fix cyclic import in docs build
* add support for sharded
* update typing
* add sharded and sharded_spawn to distributed types
* make unwrap model default
* refactor LightningShardedDataParallel similar to LightningDistributedDataParallel
* update sharded spawn to reflect changes
* update sharded to reflect changes
* Merge 1.1.5 changes
* fix merge
* fix merge
* yapf isort
* fix merge
* yapf isort
* fix indentation in test
* copy over reinit scheduler implementation from dev1.2
* fix apex tracking calls with dev_debugger
* reduce diff to dev1.2, clean up
* fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu
* sort plugin tests legacy/new
* fix error handling for amp on cpu
* fix merge
fix merge
fix merge
* [Feat] Resolve manual_backward (#5837)
* resolve manual_backward
* resolve flake8
* update
* resolve for ddp_spawn
* resolve flake8
* resolve flake8
* resolve flake8
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* fix tests/accelerator tests on cpu
* [BugFix] Resolve manual optimization (#5852)
* resolve manual_optimization
* update
* update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856)
* resovle a bug
* Accelerator refactor sharded rpc (#5854)
* rpc branch
* merge
* update handling of rpc
* make devices etc. Optional in RPC
* set devices etc. later if necessary
* remove devices from sequential
* make devices optional in rpc
* fix import
* uncomment everything
* fix cluster selection
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
* resolve bug
* fix assert in rpc test
* resolve a test
* fix docs compilation
* accelerator refactor - fix for sharded parity test (#5866)
* fix memory issue with ddp_spawn
* x
x
x
x
x
x
x
x
x
* x
* Remove DDP2 as this does not apply
* Add missing pre optimizer hook to ensure lambda closure is called
* fix apex docstring
* [accelerator][BugFix] Resolve some test for 1 gpu (#5863)
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* update
* update
* revert init
* resolve a bug
* update
* resolve flake8
* update
* update
* update
* revert init
* update
* resolve flake8
* update
* update
* update
* update
* update
* all_gather
* update
* make plugins work, add misconfig for RPC
* update
* update
* remove breaking test
* resolve some tests
* resolve flake8
* revert to ddp_spawn
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de>
* yapf isort
* resolve flake8
* fix apex doctests
* fix apex doctests 2
* resolve docs
* update drone
* clean env
* update
* update
* update
* update
* merge
* Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881)
* Fix RPC related tests, clean out old API, update for new accelerator API
* Move tests out of legacy folder, update paths and names
* Update test_remove_1-4.py
* Expose properties for tpu cores/gpus/num_gpus
* Add root GPU property
* Move properties to properties.py
* move tests that were previously in drone
* Fix root GPU property (#5908)
* Move root GPU to property, remove horovod set as this is handled in horovod plugin, ensure we mock correctly to set GPU accelerator
* Add missing tests back
* fix best model path transfer when no checkpoint callback available
* Fix setup hook order [wip] (#5858)
* Call trainer setup hook before accelerator setup
* Add test case
* add new test
* typo
* fix callback order in test
Co-authored-by: tchaton <thomas@grid.ai>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* rename ddp sequential -> rpc sequential for special test
* revert
* fix stupid merge problem
* Use property in connector for sampler (#5913)
* merge the import conflicts
* fix spawning of processes in slurm
* [wip] Fix some bugs for TPU [skip ci] (#5878)
* fixed for single tpu
* fixed spawn
* fixed spawn
* update
* update
* wip
* resolve bugs
* resolve bug
* update on comment
* removed decorator
* resolve comments
* set to 4
* update
* update
* need cleaning
* update
* update
* update
* resolve flake8
* resolve bugs
* exclude broadcast
* resolve bugs
* change test
* update
* update
* skip if meet fails
* properly raise trace
* update
* add catch
* wrap test
* resolve typo
* update
* typo
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
* resolve some tests
* update
* fix imports
* update
* resolve flake8
* update azure pipeline
* skip a sharded test on cpu that requires a gpu
* resolve tpus
* resolve bug
* resolve flake8
* update
* updat utils
* revert permission change on files
* suggestions from carlos
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting changes
* remove incomplete comment
* Update pytorch_lightning/accelerators/__init__.py
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
* remove unrelated formatting change
* add types
* warn 1.7 ddp manual backward only if ddp kwarg unset
* yapf + isort
* pep8 unused imports
* fix cyclic import in docs
* Apply suggestions from code review
* typer in accelerator.py
* typo
* Apply suggestions from code review
* formatting
* update on comments
* update typo
* Update pytorch_lightning/trainer/properties.py
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
* update
* suggestion from code review
* suggestion from code review
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: SeanNaren <sean@grid.ai>
Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: root <root@ip-172-31-88-60.ec2.internal>
Co-authored-by: Lezwon Castelino <lezwon@gmail.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-12 20:48:56 +00:00
|
|
|
def test_trainer_access_in_configure_optimizers(tmpdir):
    """Check that ``self.trainer`` is already set when ``configure_optimizers`` is called during ``fit``."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            # Only the trainer reference is checked; the hook has no return statement.
            assert self.trainer is not None, "Expect to have access to the trainer within `configure_optimizers`"

    loader = DataLoader(RandomDataset(32, 64))

    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    trainer.fit(model, loader)
|
|
|
|
|
|
|
|
|
2021-03-02 08:03:32 +00:00
|
|
|
@RunIf(min_gpus=1)
def test_setup_hook_move_to_device_correctly(tmpdir):
    """Verify that if a user defines a layer in the setup hook function, this is moved to the correct device."""

    class TestModel(BoringModel):
        def setup(self, stage: str) -> None:
            # The layer is created in `setup` rather than `__init__`, which is the case under test.
            self.new_layer = torch.nn.Linear(2, 2)

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            # will crash if not moved to correct device
            output = self.new_layer(output)
            loss = self.loss(batch, output)
            return {"loss": loss}

    # fake data
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))

    # model
    model = TestModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=1)
    trainer.fit(model, train_data)
|
2021-03-01 13:36:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_train_loop_system(tmpdir):
    """
    Test the following methods are called in the order in automatic optimization.
    1. optimizer.step (skip when gradient accumulation)
    2. model.training_step
    3. optimizer.zero_grad (run when the first batch of gradient accumulation)
    4. model.backward

    Note that the order is NOT `training_step`->`zero_grad`->`backward`->`step`.
    This is because `optimizer.step(closure)` calls `closure()` which then calls
    the three remaining methods `training_step`, `zero_grad` and `backward` inside.
    """
    # Shared call log; the optimizer and model subclasses below append to it.
    called_methods = []

    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=5,
        limit_val_batches=1,
        limit_test_batches=1,
        progress_bar_refresh_rate=0,
    )

    class TestOptimizer(SGD):
        """SGD that records every `step` and `zero_grad` call."""

        def step(self, *args, **kwargs):
            called_methods.append("step")
            return super().step(*args, **kwargs)

        def zero_grad(self, *args, **kwargs):
            called_methods.append("zero_grad")
            return super().zero_grad(*args, **kwargs)

    class TestModel(BoringModel):
        """BoringModel that records every `training_step` and `backward` call."""

        def configure_optimizers(self):
            return TestOptimizer(self.parameters(), lr=0.1)

        def training_step(self, *args, **kwargs):
            called_methods.append("training_step")
            return super().training_step(*args, **kwargs)

        def backward(self, *args, **kwargs):
            called_methods.append("backward")
            return super().backward(*args, **kwargs)

    model = TestModel()
    trainer = Trainer(**trainer_options)

    # No methods are called yet.
    assert called_methods == []

    trainer.fit(model)
    assert called_methods == ["step", "training_step", "zero_grad", "backward"] * trainer.limit_train_batches

    # Second run: same model, fresh trainer that accumulates gradients over 3 batches.
    called_methods.clear()
    trainer = Trainer(**trainer_options, accumulate_grad_batches=3)

    # No methods are called yet.
    assert called_methods == []

    trainer.fit(model)
    assert called_methods == [
        # 0
        "training_step",
        "zero_grad",
        "backward",
        # 1
        "training_step",
        "backward",
        # 2
        "step",
        "training_step",
        "backward",
        # 3
        "training_step",
        "zero_grad",
        "backward",
        # 4
        "step",
        "training_step",
        "backward",
    ]
|
2021-03-08 01:58:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_init_optimizers_resets_lightning_optimizers(tmpdir):
    """Test that the Trainer resets the `lightning_optimizers` list every time new optimizers get initialized."""

    def compare_optimizers():
        # The first wrapped optimizer must be the very object stored in `trainer.optimizers`.
        assert trainer.lightning_optimizers[0].optimizer is trainer.optimizers[0]

    model = BoringModel()
    model.lr = 0.2
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, auto_lr_find=True)

    trainer.tune(model)
    compare_optimizers()

    trainer.fit(model)
    compare_optimizers()

    trainer.fit_loop.max_epochs = 2  # simulate multiple fit calls
    trainer.fit(model)
    compare_optimizers()
|
2021-03-10 06:38:53 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_check_val_every_n_epoch_exception(tmpdir):
    """Test that constructing a Trainer with `check_val_every_n_epoch=1.2` raises a `MisconfigurationException`."""
    with pytest.raises(MisconfigurationException, match="should be an integer."):
        Trainer(default_root_dir=tmpdir, max_epochs=1, check_val_every_n_epoch=1.2)
|
2021-03-23 15:13:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_trainer_attach_data_pipeline_to_model(tmpdir):
    """Test that a datamodule's `data_pipeline` attribute is available on the LightningModule when fit starts."""

    class DataPipeline:

        pass

    class TestDataModule(LightningDataModule):

        # Class-level attribute; the callback below expects to find it on the module.
        data_pipeline = DataPipeline()

        def train_dataloader(self):
            return DataLoader(RandomDataset(32, 64))

        def val_dataloader(self):
            return DataLoader(RandomDataset(32, 64))

        def test_dataloader(self):
            return DataLoader(RandomDataset(32, 64))

    class TestCallback(Callback):
        def on_fit_start(self, trainer, pl_module: LightningModule) -> None:
            """Called when fit begins."""
            assert isinstance(pl_module.data_pipeline, DataPipeline)

    model = BoringModel()
    dm = TestDataModule()

    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, callbacks=[TestCallback()])
    trainer.fit(model, datamodule=dm)
|
2021-03-29 13:29:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_exception_when_testing_or_validating_with_fast_dev_run(tmpdir):
    """Test that `validate()` and `test()` without a model raise after a `fast_dev_run=True` fit."""
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
    model = BoringModel()
    trainer.fit(model)

    with pytest.raises(MisconfigurationException, match=r"\.validate\(\)` with `fast_dev_run=True"):
        trainer.validate()
    with pytest.raises(MisconfigurationException, match=r"\.test\(\)` with `fast_dev_run=True"):
        trainer.test()
|
2021-04-08 18:04:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TrainerStagesModel(BoringModel):
    """BoringModel whose stage-start hooks assert the expected train/eval mode.

    Each hook checks both `self.trainer.model` and the module itself.
    """

    def on_train_start(self) -> None:
        assert self.trainer.model.training
        assert self.training

    def on_validation_start(self) -> None:
        assert not self.trainer.model.training
        assert not self.training

    def on_test_start(self) -> None:
        assert not self.trainer.model.training
        assert not self.training

    def on_predict_start(self) -> None:
        assert not self.trainer.model.training
        assert not self.training
|
|
|
|
|
|
|
|
|
2021-05-12 20:10:15 +00:00
|
|
|
@pytest.mark.parametrize(
    "accelerator,num_processes", [(None, 1), pytest.param("ddp_cpu", 2, marks=RunIf(skip_windows=True))]
)
def test_model_in_correct_mode_during_stages(tmpdir, accelerator, num_processes):
    """Run fit, validate, test and predict with `TrainerStagesModel`, whose hooks assert the train/eval mode."""
    model = TrainerStagesModel()
    trainer = Trainer(default_root_dir=tmpdir, accelerator=accelerator, num_processes=num_processes, fast_dev_run=True)
    trainer.fit(model)
    trainer.validate(model)
    trainer.test(model)
    trainer.predict(model, model.val_dataloader())
|
2021-04-26 09:23:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TestDummyModelForCheckpoint(BoringModel):
    """BoringModel whose validation step logs its loss under the key ``"x"``."""

    def validation_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        self.log("x", loss)

    def validation_epoch_end(self, outputs) -> None:
        # Overridden with an empty body.
        pass
|
|
|
|
|
|
|
|
|
|
|
|
@RunIf(skip_windows=True)
def test_fit_test_synchronization(tmpdir):
    """Test that the trainer synchronizes processes before returning control back to the caller."""
    tutils.set_random_master_port()
    model = TestDummyModelForCheckpoint()
    # Monitors "x", the key logged by the model's validation step.
    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="x", mode="min", save_top_k=1)
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=2, accelerator="ddp_cpu", num_processes=2, callbacks=[checkpoint]
    )
    trainer.fit(model)
    assert os.path.exists(checkpoint.best_model_path), f"Could not find checkpoint at rank {trainer.global_rank}"
    trainer.test()
|
2021-04-28 19:17:20 +00:00
|
|
|
|
|
|
|
|
2021-04-29 12:39:45 +00:00
|
|
|
class CustomCallbackOnLoadCheckpoint(Callback):
    """Callback whose `on_save_checkpoint` hook returns a fixed, non-empty dict."""

    def on_save_checkpoint(self, trainer, pl_module, checkpoint) -> dict:
        return {"a": None}
|
|
|
|
|
|
|
|
|
|
|
|
def test_on_load_checkpoint_missing_callbacks(tmpdir):
    """Test a warning appears when callbacks in the checkpoint don't match callbacks provided when resuming."""

    model = BoringModel()
    chk = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    # First run: train with both callbacks so the saved checkpoint is produced with them attached.
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3, callbacks=[chk, CustomCallbackOnLoadCheckpoint()])
    trainer.fit(model)

    # Second run: resume from the last checkpoint without passing any callbacks.
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=5, resume_from_checkpoint=chk.last_model_path, progress_bar_refresh_rate=1
    )
    with pytest.warns(UserWarning, match="CustomCallbackOnLoadCheckpoint"):
        trainer.fit(model)
|
|
|
|
|
|
|
|
|
2021-04-28 19:17:20 +00:00
|
|
|
def test_module_current_fx_attributes_reset(tmpdir):
    """Ensure that lightning module's attributes related to current fx are reset at the end of execution."""
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1, checkpoint_callback=False, logger=False)

    trainer.fit(model)
    assert model._current_fx_name is None
    assert model._current_dataloader_idx is None

    trainer.test(model)
    assert model._current_fx_name is None
    assert model._current_dataloader_idx is None
|
2021-05-21 16:03:16 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_exception_when_lightning_module_is_not_set_on_trainer():
    """Calling validate/test/predict on a fresh ``Trainer`` without a model must raise a misconfiguration error."""
    trainer = Trainer()

    # (entry point, regex the error message has to match), checked in this order
    expectations = (
        (trainer.validate, r"`model` must be provided.*validate"),
        (trainer.test, r"`model` must be provided.*test"),
        (trainer.predict, r"`model` must be provided.*predict"),
    )
    for entry_point, message_pattern in expectations:
        with pytest.raises(MisconfigurationException, match=message_pattern):
            entry_point()
|
2021-06-28 19:26:03 +00:00
|
|
|
|
|
|
|
|
2021-09-15 10:54:17 +00:00
|
|
|
class CustomException(Exception):
    """Plain ``Exception`` subclass with no added behavior, raised by tests in this module."""
|
|
|
|
|
|
|
|
|
2021-06-28 19:26:03 +00:00
|
|
|
@RunIf(min_gpus=2, special=True)
def test_ddp_terminate_when_deadlock_is_detected(tmpdir):
    """Check that DDP terminates the remaining processes when a single rank raises an exception."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            # every step except "batch 1 on the global-zero rank" behaves like the parent model
            if batch_idx != 1 or not self.trainer.is_global_zero:
                return super().training_step(batch, batch_idx)
            # rank 0: raises an exception
            # rank 1: continues training but will hang on the next barrier in the training loop
            raise CustomException

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=5,
        num_sanity_val_steps=0,
        gpus=2,
        accelerator="ddp",
    )

    # simulate random failure in training_step on rank 0
    with pytest.raises(DeadlockDetectedException, match="CustomException"):
        trainer.fit(model)
|
2021-07-21 09:37:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
@RunIf(min_gpus=1)
def test_multiple_trainer_constant_memory_allocated(tmpdir):
    """Check that running the trainer repeatedly never leaves more CUDA memory allocated than before the first run."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            output = super().training_step(batch, batch_idx)
            self.log("train_loss", output["loss"])
            return output

        def configure_optimizers(self):
            return torch.optim.Adam(self.layer.parameters(), lr=0.1)

    class Check(Callback):
        def on_epoch_start(self, trainer, *_):
            # while training is running the plugin is expected to hold the DDP-wrapped model
            assert isinstance(trainer.training_type_plugin.model, DistributedDataParallel)

    def allocated_bytes():
        # collect garbage first so that leftover allocations, including CUDA tensors, are released before measuring
        gc.collect()
        return torch.cuda.memory_allocated(0)

    baseline = allocated_bytes()

    model = TestModel()
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "fast_dev_run": True,
        "gpus": 1,
        "accelerator": "ddp",
        "progress_bar_refresh_rate": 0,
        "callbacks": Check(),
    }
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    # after fitting, the plugin holds the bare model again and optimizer state / logged metrics live on CPU
    cpu = torch.device("cpu")
    assert trainer.training_type_plugin.model is model
    assert next(iter(trainer.optimizers[0].state.values()))["exp_avg_sq"].device == cpu
    assert trainer.callback_metrics["train_loss"].device == cpu

    assert allocated_bytes() <= baseline

    # copying the trainer (result intentionally discarded) must not raise the allocation either
    deepcopy(trainer)

    assert allocated_bytes() <= baseline

    # a second trainer built from the same kwargs, fitting the same model
    trainer_2 = Trainer(**trainer_kwargs)
    trainer_2.fit(model)

    assert allocated_bytes() <= baseline
|
2021-08-18 02:04:40 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TrainerStagesErrorsModel(BoringModel):
    """``BoringModel`` whose four ``on_*_start`` hooks each raise an ``Exception`` with a stage-specific message."""

    def on_train_start(self) -> None:
        """Raise from the ``on_train_start`` hook."""
        failure = Exception("Error during train")
        raise failure

    def on_validation_start(self) -> None:
        """Raise from the ``on_validation_start`` hook."""
        failure = Exception("Error during validation")
        raise failure

    def on_test_start(self) -> None:
        """Raise from the ``on_test_start`` hook."""
        failure = Exception("Error during test")
        raise failure

    def on_predict_start(self) -> None:
        """Raise from the ``on_predict_start`` hook."""
        failure = Exception("Error during predict")
        raise failure
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "accelerator,num_processes",
    [
        (None, 1),
        pytest.param("ddp_cpu", 2, marks=RunIf(skip_windows=True)),
    ],
)
def test_error_handling_all_stages(tmpdir, accelerator, num_processes):
    """For fit/validate/test/predict, the hook error must propagate and ``Trainer._on_exception`` must be called."""
    model = TrainerStagesErrorsModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator=accelerator,
        num_processes=num_processes,
        fast_dev_run=True,
    )

    # (expected error message, entry point); lambdas defer the call, and the evaluation of the
    # predict dataloader, until the `raises`/`patch` contexts are active
    stages = (
        (r"Error during train", lambda: trainer.fit(model)),
        (r"Error during validation", lambda: trainer.validate(model)),
        (r"Error during test", lambda: trainer.test(model)),
        (r"Error during predict", lambda: trainer.predict(model, model.val_dataloader(), return_predictions=False)),
    )
    for expected_message, run_stage in stages:
        # a fresh patch per stage, so `assert_called` only sees calls made during this stage
        with pytest.raises(Exception, match=expected_message):
            with patch("pytorch_lightning.Trainer._on_exception") as exception_hook:
                run_stage()
        exception_hook.assert_called()
|