lightning/tests/tests_pytorch/trainer/test_trainer.py

2281 lines
81 KiB
Python
Raw Normal View History

2020-10-13 11:18:07 +00:00
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import logging
import math
import os
import pickle
2021-01-23 23:52:04 +00:00
from argparse import Namespace
from contextlib import nullcontext
2021-01-23 23:52:04 +00:00
from copy import deepcopy
from pathlib import Path
from re import escape
from unittest.mock import ANY, call, Mock, patch
2020-03-12 16:41:37 +00:00
import cloudpickle
import pytest
import torch
import torch.nn as nn
2022-03-27 21:31:20 +00:00
from torch.multiprocessing import ProcessRaisedException
from torch.nn.parallel.distributed import DistributedDataParallel
from torch.optim import SGD
from torch.utils.data import DataLoader, IterableDataset
import pytorch_lightning
import tests_pytorch.helpers.utils as tutils
from lightning_lite.utilities.cloud_io import _load as pl_load
from lightning_lite.utilities.seed import seed_everything
from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer
from pytorch_lightning.accelerators import CPUAccelerator, CUDAAccelerator
from pytorch_lightning.callbacks import EarlyStopping, GradientAccumulationScheduler, ModelCheckpoint, Timer
from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint
2021-04-27 20:23:55 +00:00
from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter
from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
from pytorch_lightning.demos.boring_classes import (
2022-12-07 07:12:06 +00:00
BoringDataModule,
BoringModel,
RandomDataset,
RandomIterableDataset,
RandomIterableDatasetWithLen,
)
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler
from pytorch_lightning.strategies import (
DataParallelStrategy,
DDPFullyShardedStrategy,
DDPShardedStrategy,
DDPSpawnShardedStrategy,
DDPSpawnStrategy,
DDPStrategy,
SingleDeviceStrategy,
)
from pytorch_lightning.trainer.states import RunningStage, TrainerFn
from pytorch_lightning.utilities.exceptions import DeadlockDetectedException, MisconfigurationException
from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE
from tests_pytorch.conftest import mock_cuda_count, mock_mps_count
from tests_pytorch.helpers.datamodules import ClassifDataModule
from tests_pytorch.helpers.runif import RunIf
from tests_pytorch.helpers.simple_models import ClassificationModel
2020-04-27 11:41:30 +00:00
if _OMEGACONF_AVAILABLE:
from omegaconf import OmegaConf
2020-04-27 11:41:30 +00:00
def test_trainer_error_when_input_not_lightning_module():
    """Each Trainer entry point raises a descriptive ``TypeError`` when handed a plain ``nn.Module``."""
    trainer = Trainer()

    for entry_point in ("fit", "validate", "test", "predict"):
        expected_msg = escape(f"`Trainer.{entry_point}()` requires a `LightningModule`, got: Linear")
        with pytest.raises(TypeError, match=expected_msg):
            getattr(trainer, entry_point)(nn.Linear(2, 2))

    # `tune` is checked on a trainer that has both tuning features switched on
    tuning_trainer = Trainer(auto_lr_find=True, auto_scale_batch_size=True)
    tune_msg = escape("`Trainer.tune()` requires a `LightningModule`, got: Linear")
    with pytest.raises(TypeError, match=tune_msg):
        tuning_trainer.tune(nn.Linear(2, 2))
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Save a checkpoint through the trainer, then reload the model from it (local path or URL) together with
    the logged ``hparams.yaml`` and check the hyperparameter survived."""
    # torch hub's cache location is driven by $TORCH_HOME; redirect it into tmpdir
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    class CustomModel(BoringModel):
        def __init__(self, lr=1e-2):
            super().__init__()
            self.save_hyperparameters()

    lr = 1e-3
    model = CustomModel(lr=lr)

    # the logger is what provides the meta files (hparams) read back below
    logger = tutils.get_default_logger(tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, max_steps=1, limit_val_batches=1, logger=logger)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # write a checkpoint by hand
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # the hyperparameters must be stored inside the checkpoint
    saved = torch.load(new_weights_path)
    assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in saved, "hyper_parameters missing from checkpoints"

    # reload the model, either from the local file or through the tmpdir HTTP server
    hparams_path = os.path.join(tutils.get_data_path(logger, path_dir=tmpdir), "hparams.yaml")
    if url_ckpt:
        ckpt_path = f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
    else:
        ckpt_path = new_weights_path
    reloaded = CustomModel.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path)
    assert reloaded.hparams.lr == lr
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_strict_model_load(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Loading a checkpoint that carries an extra layer fails in strict mode and succeeds with ``strict=False``.

    The checkpoint is loaded either from the local path or through the ``tmpdir_server`` URL, depending on
    ``url_ckpt``.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir.
    # Environment values must be `str`, so the `tmpdir` path object is converted explicitly.
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    model = BoringModel()
    # Extra layer that a freshly constructed `BoringModel` does not define
    model.c_d3 = torch.nn.Linear(10, 12)

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1, logger=logger)
    trainer.fit(model)

    # training complete
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # save model
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, "hparams.yaml")
    ckpt_path = (
        f"http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}"
        if url_ckpt
        else new_weights_path
    )

    # strict loading must reject the checkpoint: the extra layer's weights are unexpected keys,
    # which `torch.nn.Module.load_state_dict` reports as a `RuntimeError`
    with pytest.raises(RuntimeError, match="Unexpected key"):
        BoringModel.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path)

    # non-strict loading must succeed; an exception here propagates and fails the test with its real traceback
    BoringModel.load_from_checkpoint(checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=False)
def test_trainer_accumulate_grad_batches_incorrect_value(tmpdir):
    """Constructing a Trainer with a tuple for ``accumulate_grad_batches`` raises a misconfiguration error."""
    unsupported_value = (2, 5)
    expect_rejection = pytest.raises(MisconfigurationException, match=".*should be an int or a dict.*")
    with expect_rejection:
        Trainer(default_root_dir=tmpdir, accumulate_grad_batches=unsupported_value)
def test_trainer_accumulate_grad_batches_with_grad_acc_callback(tmpdir):
    """Passing both ``accumulate_grad_batches`` and a ``GradientAccumulationScheduler`` callback is an error."""
    expected_msg = ".*set both `accumulate_grad_batches` and passed an instance.*"
    with pytest.raises(MisconfigurationException, match=expected_msg):
        scheduler = GradientAccumulationScheduler({0: 2})
        Trainer(default_root_dir=tmpdir, accumulate_grad_batches=7, callbacks=[scheduler])
@pytest.mark.parametrize(
    ["accumulate_grad_batches", "limit_train_batches"],
    [
        ({1: 2, 3: 4}, 1.0),
        ({1: 2, 3: 4}, 0.5),  # not to be divisible by accumulate_grad_batches on purpose
        (3, 1.0),
        (3, 0.8),  # not to be divisible by accumulate_grad_batches on purpose
        (4, 1.0),
        (4, 0.7),  # not to be divisible by accumulate_grad_batches on purpose
    ],
)
def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_batches, limit_train_batches):
    """Check that the optimizer step is applied on the last batch while gradients are being accumulated."""

    class TestModel(BoringModel):
        def state_dict(self, *args, **kwargs):
            # snapshot: later weight updates must not leak into a previously taken state dict
            return deepcopy(super().state_dict(*args, **kwargs))

        def check(self, d1, d2, equal=True):
            """Return whether all entries match (``equal=True``) or none of them match (``equal=False``)."""
            matches = [torch.equal(d1[name], d2[name]) for name in d1.keys() | d2.keys()]
            if equal:
                return all(matches)
            return not any(matches)

        def backward(self, *args, **kwargs) -> None:
            before_backward = self.state_dict()
            assert self.check(self.start_state_dict, before_backward)

            result = super().backward(*args, **kwargs)

            # backward only produces gradients, the weights themselves stay the same
            assert self.check(before_backward, self.state_dict())
            return result

        def optimizer_step(self, *args, **kwargs):
            before_step = self.state_dict()
            assert self.check(self.start_state_dict, before_step)

            # `backward` and `on_after_backward` run inside the closure invoked here
            result = super().optimizer_step(*args, **kwargs)

            # stepping the optimizer must have modified the weights
            assert self.check(before_step, self.state_dict(), equal=False)
            self.opt_step_called = True
            return result

        def on_train_batch_start(self, *_):
            self.start_state_dict = self.state_dict()
            self.opt_step_called = False

        def on_train_batch_end(self, outputs, batch, batch_idx):
            end_state_dict = self.state_dict()
            is_last_batch = (batch_idx + 1) == self.trainer.num_training_batches
            # weights must differ exactly when a step happened or this was the final batch
            expect_update = is_last_batch or self.opt_step_called
            assert self.check(self.start_state_dict, end_state_dict, equal=not expect_update)

    trainer = Trainer(
        accumulate_grad_batches=accumulate_grad_batches,
        max_epochs=2,
        limit_train_batches=limit_train_batches,
        limit_val_batches=0,
        default_root_dir=tmpdir,
        enable_progress_bar=False,
    )
    trainer.fit(TestModel())
def test_loading_meta_tags(tmpdir):
    """Hyperparameters survive a round trip through the legacy ``meta_tags.csv`` format (backward compat)."""
    logger = tutils.get_default_logger(tmpdir)

    # log hyperparameters twice (namespace and dict), then flush the logger
    logger.log_hyperparams(Namespace(some_str="a_str", an_int=1, a_float=2.0))
    logger.log_hyperparams({"batch_size": 32, "learning_rate": 0.001 * 8, "optimizer_name": "adam"})
    logger.save()

    # read back what the logger stored as yaml
    experiment_dir = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams = load_hparams_from_yaml(os.path.join(experiment_dir, TensorBoardLogger.NAME_HPARAMS_FILE))

    # write the same content in the legacy csv format and load it again
    tags_path = os.path.join(experiment_dir, "meta_tags.csv")
    save_hparams_to_tags_csv(tags_path, hparams)
    tags = load_hparams_from_tags_csv(tags_path)

    assert hparams == tags
def test_loading_yaml(tmpdir):
    """Hyperparameters logged through the default logger can be read back from ``hparams.yaml``."""
    logger = tutils.get_default_logger(tmpdir)

    # log hyperparameters twice (namespace and dict), then flush the logger
    logger.log_hyperparams(Namespace(some_str="a_str", an_int=1, a_float=2.0))
    logger.log_hyperparams({"batch_size": 32, "learning_rate": 0.001 * 8, "optimizer_name": "adam"})
    logger.save()

    # load hparams from the experiment directory
    experiment_dir = tutils.get_data_path(logger, path_dir=tmpdir)
    tags = load_hparams_from_yaml(os.path.join(experiment_dir, "hparams.yaml"))

    assert tags["batch_size"] == 32
    assert tags["optimizer_name"] == "adam"
@pytest.mark.parametrize(
    "save_top_k,save_last,expected_files",
    [
        pytest.param(-1, False, [f"epoch={i}.ckpt" for i in range(5)], id="CASE K=-1 (all)"),
        pytest.param(1, False, {"epoch=4.ckpt"}, id="CASE K=1 (2.5, epoch 4)"),
        pytest.param(2, False, [f"epoch={i}.ckpt" for i in (2, 4)], id="CASE K=2 (2.5 epoch 4, 2.8 epoch 2)"),
        pytest.param(4, False, [f"epoch={i}.ckpt" for i in range(1, 5)], id="CASE K=4 (save all 4 base)"),
        pytest.param(3, False, [f"epoch={i}.ckpt" for i in range(2, 5)], id="CASE K=3 (save the 2nd, 3rd, 4th model)"),
        pytest.param(1, True, {"epoch=4.ckpt", "last.ckpt"}, id="CASE K=1 (save the 4th model and the last model)"),
    ],
)
def test_model_checkpoint_options(tmpdir, save_top_k, save_last, expected_files):
    """Drive ``ModelCheckpoint`` by hand with simulated losses and check which files end up on disk."""

    def mock_save_function(filepath, *args):
        # stand-in for `Trainer.save_checkpoint`: only make sure the file exists
        with open(filepath, "a"):
            pass

    checkpoint_callback = ModelCheckpoint(
        dirpath=tmpdir,
        filename="{epoch}",
        monitor="checkpoint_on",
        save_top_k=save_top_k,
        save_last=save_last,
        verbose=True,
        save_on_train_epoch_end=False,
    )
    trainer = Trainer()
    trainer.state.fn = TrainerFn.FITTING
    trainer.save_checkpoint = mock_save_function

    # simulated losses, one per emulated epoch
    simulated_losses = [10, 9, 2.8, 5, 2.5]

    # emulate the callback invocations that would happen during training
    for epoch_count, loss in enumerate(simulated_losses, 1):
        # sets `trainer.global_step`
        trainer.fit_loop.epoch_loop.batch_loop.optimizer_loop.optim_progress.optimizer.step.total.completed = epoch_count
        trainer.callback_metrics.update({"checkpoint_on": torch.tensor(loss)})
        checkpoint_callback.on_validation_end(trainer, trainer.lightning_module)
        # sets `trainer.current_epoch`
        trainer.fit_loop.epoch_progress.current.completed = epoch_count

    file_lists = set(os.listdir(tmpdir))

    assert len(file_lists) == len(
        expected_files
    ), f"Should save {len(expected_files)} models when save_top_k={save_top_k} but found={file_lists}"

    # every expected checkpoint name must be present
    for expected_name in expected_files:
        assert expected_name in file_lists
def test_model_checkpoint_only_weights(tmpdir):
    """With ``save_weights_only`` the checkpoint holds no optimizer/scheduler state, so it can be used to load
    the model but not to restore the training state."""
    trainer_only_keys = ("optimizer_states", "lr_schedulers")

    weights_only_callback = ModelCheckpoint(dirpath=tmpdir, save_weights_only=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=1,
        callbacks=[weights_only_callback],
    )

    trainer.fit(BoringModel())
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    # checkpoint written by the callback: no trainer data inside
    checkpoint_path = trainer.checkpoint_callback.best_model_path
    checkpoint = torch.load(checkpoint_path)
    for key in trainer_only_keys:
        assert key not in checkpoint, "checkpoint should contain only model weights"

    # a weights-only checkpoint is still enough to build the model
    assert BoringModel.load_from_checkpoint(checkpoint_path=checkpoint_path)

    # checkpoint written directly through the trainer: no trainer data inside either
    new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
    trainer.save_checkpoint(new_weights_path, weights_only=True)
    checkpoint = torch.load(new_weights_path)
    for key in trainer_only_keys:
        assert key not in checkpoint, "checkpoint should contain only model weights"

    # restoring the training state from it has to fail
    with pytest.raises(KeyError, match="checkpoint contains only the model"):
        trainer._checkpoint_connector.restore(new_weights_path)
def test_model_freeze_unfreeze():
    """``freeze`` leaves the model in eval mode with no trainable parameter; ``unfreeze`` reverts both."""
    model = BoringModel()

    model.freeze()
    assert not model.training
    assert not any(p.requires_grad for p in model.parameters())

    model.unfreeze()
    assert model.training
    assert all(p.requires_grad for p in model.parameters())
# TODO: move to `tests/tests_pytorch/models/test_restore.py`
@pytest.mark.parametrize("url_ckpt", [True, False])
def test_fit_ckpt_path_epoch_restored(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
    """Verify resuming from checkpoint runs the right number of epochs.

    A model is trained for ``max_epochs`` with one checkpoint kept per epoch; training is then resumed from
    every checkpoint (local path or URL, depending on ``url_ckpt``) and the step bookkeeping is compared.
    """
    # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir.
    # Environment values must be `str`, so the `tmpdir` path object is converted explicitly.
    monkeypatch.setenv("TORCH_HOME", str(tmpdir))

    class TestModel(BoringModel):
        # Model that tracks epochs and batches seen
        num_epochs_end_seen = 0
        num_batches_seen = 0
        num_on_load_checkpoint_called = 0

        def on_train_epoch_end(self):
            self.num_epochs_end_seen += 1

        def on_train_batch_start(self, *_):
            self.num_batches_seen += 1

        def on_load_checkpoint(self, _):
            self.num_on_load_checkpoint_called += 1

    model = TestModel()
    max_epochs = 2
    trainer = Trainer(
        max_epochs=max_epochs,
        limit_train_batches=0.65,
        limit_val_batches=1,
        # `save_top_k=-1` keeps every checkpoint, so there is one per epoch to resume from
        callbacks=ModelCheckpoint(dirpath=tmpdir, save_top_k=-1),
        default_root_dir=tmpdir,
        val_check_interval=1.0,
        enable_progress_bar=False,
        logger=False,
        enable_model_summary=False,
    )
    trainer.fit(model)

    assert model.num_epochs_end_seen == max_epochs
    assert model.num_batches_seen == trainer.num_training_batches * max_epochs == trainer.global_step
    assert model.num_on_load_checkpoint_called == 0

    checkpoints = set(Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt"))
    if url_ckpt:
        # transform local paths into url checkpoints
        ip, port = tmpdir_server
        checkpoints = [f"http://{ip}:{port}/" + ckpt.name for ckpt in checkpoints]

    assert len(checkpoints) == max_epochs

    for ckpt in checkpoints:
        model = TestModel()
        state = pl_load(ckpt)
        # Resume training with the same epoch budget as the initial run
        trainer = Trainer(default_root_dir=tmpdir, max_epochs=max_epochs, enable_progress_bar=False)
        trainer.fit(model, ckpt_path=ckpt)
        # steps already stored in the checkpoint plus the batches seen after resuming add up to the total
        assert state["global_step"] + model.num_batches_seen == trainer.global_step
        assert model.num_on_load_checkpoint_called == 1
def test_trainer_max_steps_and_epochs(tmpdir):
    """Training stops at whichever of ``max_steps`` / ``max_epochs`` is reached first."""
    model = BoringModel()
    num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)

    # settings shared by all three runs below
    common_kwargs = {
        "limit_train_batches": 0.5,
        "default_root_dir": tmpdir,
        "logger": False,
        "enable_model_summary": False,
        "enable_progress_bar": False,
    }

    # step limit is hit before the epoch limit
    trainer = Trainer(max_epochs=3, max_steps=num_train_samples + 10, **common_kwargs)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"

    # epoch limit is hit before the step limit
    trainer = Trainer(max_epochs=2, max_steps=3 * 2 * num_train_samples, **common_kwargs)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == num_train_samples * trainer.max_epochs
    assert trainer.current_epoch == trainer.max_epochs, "Model did not stop at max_epochs"

    # if max_steps is positive and max_epochs is negative, use max_steps
    trainer = Trainer(max_epochs=-1, max_steps=3, **common_kwargs)
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == 3
@pytest.mark.parametrize(
    "max_epochs,max_steps,incorrect_variable",
    [
        (-100, -1, "max_epochs"),
        (1, -2, "max_steps"),
    ],
)
def test_trainer_max_steps_and_epochs_validation(max_epochs, max_steps, incorrect_variable):
    """Values below -1 for ``max_epochs`` or ``max_steps`` are rejected, and the error names the offender."""
    expected_msg = f"`{incorrect_variable}` must be a non-negative integer or -1"
    with pytest.raises(MisconfigurationException, match=expected_msg):
        Trainer(max_epochs=max_epochs, max_steps=max_steps)
@pytest.mark.parametrize(
    "max_epochs,max_steps,is_done,correct_trainer_epochs",
    [
        (None, -1, False, None),
        (-1, -1, False, -1),
        (5, -1, False, 5),
        (-1, 10, False, -1),
        (None, 0, True, None),
        (0, -1, True, 0),
        (-1, 0, True, -1),
        # NOTE: a second, identical `(0, -1, True, 0)` entry used to follow here; it only re-ran the case above
    ],
)
def test_trainer_max_steps_and_epochs_fit_loop_done(max_epochs, max_steps, is_done, correct_trainer_epochs):
    """Check the limits the Trainer exposes for each ``max_epochs``/``max_steps`` combination, whether the fit
    loop already reports ``done``, and that no ``Timer`` callback gets added."""
    trainer = Trainer(max_epochs=max_epochs, max_steps=max_steps)

    assert trainer.max_epochs == correct_trainer_epochs
    assert trainer.max_steps == max_steps

    # `done` is only checked when the expected epoch limit is an integer (the `None` cases are skipped)
    if isinstance(correct_trainer_epochs, int):
        assert trainer.fit_loop.done is is_done

    # Make sure there is no timer
    timer_callbacks = [c for c in trainer.callbacks if isinstance(c, Timer)]
    assert len(timer_callbacks) == 0
def test_trainer_min_steps_and_epochs(tmpdir):
    """Verify model trains according to specified min steps.

    The model requests a stop right after the first step; the assertions check that training nevertheless
    ran for at least ``min_epochs`` / ``min_steps``.
    """
    num_train_samples = math.floor(len(BoringModel().train_dataloader()) * 0.5)

    class CustomModel(BoringModel):
        def training_step(self, *args, **kwargs):
            # try to force stop right after first step
            if self.global_step > 0:
                # the flag is `should_stop`; the previous `should_step` only created an unused attribute,
                # so no stop was ever requested
                self.trainer.should_stop = True

            return super().training_step(*args, **kwargs)

    model = CustomModel()

    trainer_kwargs = {
        "limit_train_batches": 0.5,
        "default_root_dir": tmpdir,
        "val_check_interval": 2,
        "min_epochs": 1,
        "max_epochs": 7,
        # define less min steps than 1 epoch
        "min_steps": num_train_samples // 2,
        "logger": False,
        "enable_model_summary": False,
        "enable_progress_bar": False,
    }
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch > 0
    assert trainer.global_step >= num_train_samples, "Model did not train for at least min_epochs"

    # define less epochs than min_steps
    trainer_kwargs["min_steps"] = math.floor(num_train_samples * 1.5)
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch > 0
    assert trainer.global_step >= math.floor(num_train_samples * 1.5), "Model did not train for at least min_steps"
def test_trainer_min_steps_and_min_epochs_not_reached(tmpdir, caplog):
    """``min_epochs``/``min_steps`` keep training alive even after ``EarlyStopping`` has requested a stop."""
    min_epochs = 5
    batches_per_epoch = 2

    class TestModel(BoringModel):
        training_step_invoked = 0

        def training_step(self, batch, batch_idx):
            output = super().training_step(batch, batch_idx)
            # force minimal loss to trigger early stopping
            output["loss"] = output["loss"] * 0.0
            self.log("loss", output["loss"])
            self.training_step_invoked += 1
            # the stop flag is expected to be unset during epochs 0 and 1 and set from epoch 2 onwards
            stop_expected = self.current_epoch >= 2
            assert bool(self.trainer.should_stop) == stop_expected
            return output

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        min_epochs=min_epochs,
        limit_val_batches=0,
        limit_train_batches=batches_per_epoch,
        callbacks=[EarlyStopping(monitor="loss", patience=0, check_on_train_epoch_end=True)],
    )
    with caplog.at_level(logging.INFO, logger="pytorch_lightning.trainer.trainer"):
        trainer.fit(model)

    # the "training will continue" notice must have been logged exactly once
    message = f"min_epochs={min_epochs}` or `min_steps=None` has not been met. Training will continue"
    matching_records = [record for record in caplog.records if message in record.message]
    assert len(matching_records) == 1
    assert model.training_step_invoked == min_epochs * batches_per_epoch
def test_trainer_max_steps_accumulate_batches(tmpdir):
    """Check that training ends exactly at ``max_steps`` when gradients are accumulated over batches."""
    model = BoringModel()
    half_epoch_batches = math.floor(len(model.train_dataloader()) * 0.5)

    # define less train steps than epochs
    trainer_kwargs = {
        "limit_train_batches": 0.5,
        "default_root_dir": tmpdir,
        "max_steps": half_epoch_batches + 10,
        "accumulate_grad_batches": 10,
        "logger": False,
        "enable_progress_bar": False,
        "enable_model_summary": False,
    }
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"
@pytest.mark.parametrize("cudnn_benchmark", (False, True))
@pytest.mark.parametrize(
    ["benchmark_", "deterministic", "expected"],
    [
        (None, False, None),
        (None, True, False),
        (None, None, None),
        (True, False, True),
        (True, True, True),
        (True, None, True),
        (False, False, False),
        (False, True, False),
        (False, None, False),
    ],
)
def test_benchmark_option(cudnn_benchmark, benchmark_, deterministic, expected):
    """Verify benchmark option.

    ``torch.backends.cudnn.benchmark`` is preset to ``cudnn_benchmark`` before the ``Trainer`` is built.
    An ``expected`` of ``None`` means the preset value must be left as it is. The global flag is restored
    in a ``finally`` clause so that a failing case cannot leak its value into other tests.
    """
    original_val = torch.backends.cudnn.benchmark

    torch.backends.cudnn.benchmark = cudnn_benchmark
    try:
        if benchmark_ and deterministic:
            with pytest.warns(UserWarning, match="You passed `deterministic=True` and `benchmark=True`"):
                trainer = Trainer(benchmark=benchmark_, deterministic=deterministic)
        else:
            trainer = Trainer(benchmark=benchmark_, deterministic=deterministic)
        expected = cudnn_benchmark if expected is None else expected
        assert torch.backends.cudnn.benchmark == expected
        assert trainer._accelerator_connector.benchmark == expected
    finally:
        # restore the process-wide flag even when construction or an assertion above fails
        torch.backends.cudnn.benchmark = original_val
@pytest.mark.parametrize("ckpt_path", (None, "last"))
@pytest.mark.parametrize("fn", (TrainerFn.FITTING, TrainerFn.VALIDATING))
def test_checkpoint_path_input_last_fault_tolerant(tmpdir, ckpt_path, fn):
    """Check which path ``_set_ckpt_path`` returns when a fault-tolerant checkpoint file exists."""
    checkpoint_cb = ModelCheckpoint()
    checkpoint_cb.best_model_path = "foobar"
    # manually create to simulate fault-tolerant training
    fault_tolerant_cb = _FaultToleranceCheckpoint(tmpdir)
    Path(fault_tolerant_cb.ckpt_path).touch()

    trainer = Trainer(callbacks=[checkpoint_cb, fault_tolerant_cb])
    trainer.state.fn = fn

    auto_save_path = os.path.join(tmpdir, ".pl_auto_save.ckpt")
    is_fit = fn == "fit"

    if ckpt_path == "last":
        # "last": no warning expected, the auto-save file is expected
        expectation = nullcontext()
        final_path = auto_save_path
    elif is_fit:
        # no explicit path while fitting: a warning is expected along with the auto-save file
        expectation = pytest.warns(UserWarning, match="Because fault tolerance is enabled")
        final_path = auto_save_path
    else:
        # no explicit path while validating: the best model path is expected, with a warning
        expectation = pytest.warns(UserWarning, match="There is also a fault-tolerant checkpoint available")
        final_path = "foobar"

    with expectation:
        ckpt_path = trainer._checkpoint_connector._set_ckpt_path(
            fn, ckpt_path, model_provided=is_fit, model_connected=True
        )
    assert ckpt_path == final_path
@pytest.mark.parametrize("ckpt_path", (None, "last"))
@pytest.mark.parametrize("save_last", (True, False))
@pytest.mark.parametrize("fn", ("fit", "validate"))
def test_checkpoint_path_input_last(tmpdir, ckpt_path, save_last, fn):
    """Check ``trainer.ckpt_path`` and the emitted warnings for ``ckpt_path`` of ``None`` or ``"last"``."""
    model = BoringModel()
    checkpoint_cb = ModelCheckpoint(save_last=save_last)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[checkpoint_cb],
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=1,
        logger=False,
        enable_model_summary=False,
        enable_progress_bar=False,
    )
    assert trainer.ckpt_path is None
    trainer_fn = getattr(trainer, fn)

    if fn == "fit":
        # nothing has been saved yet, so asking for "last" is expected to warn
        if ckpt_path is None:
            expectation = nullcontext()
        else:
            expectation = pytest.warns(UserWarning, match="No checkpoint will be loaded")
        with expectation:
            trainer_fn(model, ckpt_path=ckpt_path)
        assert trainer.ckpt_path is None
        return

    # validate: run a fit first so that checkpoints may exist
    trainer.fit(model)
    if ckpt_path is None:
        expectation = pytest.warns(
            UserWarning,
            match=r"(?!.*however it is default only when fitting)^"
            r".*The best model of the previous `fit` call will be used",
        )
        final_path = checkpoint_cb.best_model_path
    elif save_last:
        expectation = nullcontext()
        final_path = checkpoint_cb.last_model_path
    else:
        expectation = pytest.warns(UserWarning, match="No checkpoint will be loaded")
        final_path = None

    with expectation:
        trainer_fn(ckpt_path=ckpt_path)
    assert trainer.ckpt_path == final_path
def test_checkpoint_find_last(tmpdir):
    """Check that ``ckpt_path="last"`` picks up the ``last.ckpt`` written by an earlier trainer."""
    # settings shared by both trainers
    shared_kwargs = {
        "max_epochs": 1,
        "limit_train_batches": 1,
        "limit_val_batches": 0,
        "enable_model_summary": False,
        "enable_progress_bar": False,
        "logger": False,
        "default_root_dir": tmpdir,
    }

    # first run, with `save_last=True`
    first_model = BoringModel()
    first_trainer = Trainer(callbacks=[ModelCheckpoint(save_last=True)], **shared_kwargs)
    assert first_trainer.ckpt_path is None
    first_trainer.fit(first_model)

    # second run uses a fresh trainer and callback and asks for the "last" checkpoint
    second_model = BoringModel()
    second_trainer = Trainer(callbacks=[ModelCheckpoint()], **shared_kwargs)
    assert second_trainer.ckpt_path is None
    second_trainer.fit(second_model, ckpt_path="last")
    assert second_trainer.ckpt_path == str(tmpdir / "checkpoints" / "last.ckpt")
@pytest.mark.parametrize("ckpt_path", (None, "best", "specific"))
@pytest.mark.parametrize("save_top_k", (-1, 0, 1, 2))
@pytest.mark.parametrize("fn", ("validate", "test", "predict"))
def test_checkpoint_path_input(tmpdir, ckpt_path, save_top_k, fn):
    """Check ``trainer.ckpt_path`` after validate/test/predict for ``None``, ``"best"`` and explicit paths."""

    class TestModel(BoringModel):
        def validation_step(self, batch, batch_idx):
            self.log("foo", -batch_idx)
            return super().validation_step(batch, batch_idx)

        def test_step(self, *args):
            return self.validation_step(*args)

        def predict_step(self, batch, *_):
            return self(batch)

    model = TestModel()
    model.test_epoch_end = None
    checkpoint_cb = ModelCheckpoint(monitor="foo", save_top_k=save_top_k)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[checkpoint_cb],
        max_epochs=2,
        limit_val_batches=1,
        limit_test_batches=1,
        limit_predict_batches=1,
        enable_progress_bar=False,
    )
    trainer.fit(model)

    trainer_fn = getattr(trainer, fn)
    assert trainer.ckpt_path is None

    # each scenario is exercised without and then with an explicitly passed model
    without_and_with_model = ((), (model,))

    if ckpt_path == "best":
        # ckpt_path is 'best', meaning we load the best weights
        for positional in without_and_with_model:
            if save_top_k == 0:
                with pytest.raises(ValueError, match=".*is not configured to save the best.*"):
                    trainer_fn(*positional, ckpt_path=ckpt_path)
            else:
                trainer_fn(*positional, ckpt_path=ckpt_path)
                assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
        return

    if ckpt_path is None:
        # ckpt_path is None, meaning we don't load any checkpoints and use the provided model
        trainer_fn(model, ckpt_path=ckpt_path)
        assert trainer.ckpt_path is None

        if save_top_k > 0:
            # ckpt_path is None with no model provided means load the best weights
            with pytest.warns(UserWarning, match="The best model of the previous `fit` call will be used"):
                trainer_fn(ckpt_path=ckpt_path)
                assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
        return

    # specific checkpoint, pick one from saved ones
    if save_top_k == 0:
        with pytest.raises(FileNotFoundError):
            trainer_fn(ckpt_path="random.ckpt")
        return

    checkpoint_dir = Path(tmpdir) / f"lightning_logs/version_{trainer.logger.version}/checkpoints"
    saved_checkpoints = list(checkpoint_dir.iterdir())
    ckpt_path = str(saved_checkpoints[0].absolute())
    for positional in without_and_with_model:
        trainer_fn(*positional, ckpt_path=ckpt_path)
        assert trainer.ckpt_path == ckpt_path
@pytest.mark.parametrize("enable_checkpointing", (False, True))
@pytest.mark.parametrize("fn", ("validate", "test", "predict"))
def test_tested_checkpoint_path_best(tmpdir, enable_checkpointing, fn):
    """Check that ``ckpt_path="best"`` resolves with checkpointing enabled and raises with it disabled."""

    class TestModel(BoringModel):
        def validation_step(self, batch, batch_idx):
            self.log("foo", -batch_idx)
            return super().validation_step(batch, batch_idx)

        def test_step(self, *args):
            return self.validation_step(*args)

        def predict_step(self, batch, *_):
            return self(batch)

    model = TestModel()
    model.test_epoch_end = None
    trainer = Trainer(
        default_root_dir=tmpdir,
        enable_checkpointing=enable_checkpointing,
        max_epochs=2,
        limit_val_batches=1,
        limit_test_batches=1,
        limit_predict_batches=1,
        enable_progress_bar=False,
    )
    trainer.fit(model)

    trainer_fn = getattr(trainer, fn)
    assert trainer.ckpt_path is None

    # run once without and once with an explicitly passed model
    for positional in ((), (model,)):
        if enable_checkpointing:
            trainer_fn(*positional, ckpt_path="best")
            assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
        else:
            with pytest.raises(ValueError, match="`ModelCheckpoint` is not configured."):
                trainer_fn(*positional, ckpt_path="best")
def test_best_ckpt_evaluate_raises_warning_with_multiple_ckpt_callbacks():
    """Check that resolving ``ckpt_path="best"`` warns when two checkpoint callbacks are configured."""
    # one callback per (monitor, best path) pair, in this order
    checkpoint_callbacks = []
    for monitor, best_path in (("foo", "foo_best_model.ckpt"), ("bar", "bar_best_model.ckpt")):
        callback = ModelCheckpoint(monitor=monitor)
        callback.best_model_path = best_path
        checkpoint_callbacks.append(callback)

    trainer = Trainer(callbacks=checkpoint_callbacks)
    trainer.state.fn = TrainerFn.TESTING

    with pytest.warns(UserWarning, match="best checkpoint path from first checkpoint callback"):
        trainer._checkpoint_connector._set_ckpt_path(
            trainer.state.fn, ckpt_path="best", model_provided=False, model_connected=True
        )
def test_disabled_training(tmpdir):
    """Check that ``limit_train_batches=0`` skips training, except when ``fast_dev_run=True``."""

    class CurrentModel(BoringModel):
        training_step_invoked = False
        training_epoch_end_invoked = False

        def training_step(self, *args, **kwargs):
            self.training_step_invoked = True
            return super().training_step(*args, **kwargs)

        def training_epoch_end(self, *args, **kwargs):
            self.training_epoch_end_invoked = True
            return super().training_epoch_end(*args, **kwargs)

    model = CurrentModel()

    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "limit_train_batches": 0.0,
        "limit_val_batches": 0.2,
        "fast_dev_run": False,
    }

    weights_before = deepcopy(model.state_dict())
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    weights_after = model.state_dict()

    # no training happened, so every tensor must be unchanged
    for name, before in weights_before.items():
        assert torch.all(torch.eq(before, weights_after[name]))

    # check that limit_train_batches=0 turns off training
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 0
    assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`"
    assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`"

    # check that limit_train_batches has no influence when fast_dev_run is turned on
    model = CurrentModel()
    trainer_options["fast_dev_run"] = True

    weights_before = deepcopy(model.state_dict())
    trainer = Trainer(**trainer_options)
    trainer.fit(model)
    weights_after = model.state_dict()

    # training ran this time, so no tensor may be left fully identical
    for name, before in weights_before.items():
        assert not torch.all(torch.eq(before, weights_after[name]))

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 1
    assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`"
    assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`"
Continue Jeremy's early stopping PR #1504 (#2391) * add state_dict for early stopping * move best attr after monitor_op defined * improve early stopping and model checkpoint callbacks * fix formatting * fix attr init order * clean up setting of default_root_dir attr * logger needs default root dir set first * reorg trainer init * remove direct references to checkpoint callback * more fixes * more bugfixes * run callbacks at epoch end * update tests to use on epoch end * PR cleanup * address failing tests * refactor for homogeneity * fix merge conflict * separate tests * tests for early stopping bug regressions * small fixes * revert model checkpoint change * typo fix * fix tests * update train loop * cannot pass an int as default_save_path * refactor log message * fix test case * appease the linter * fix some doctests * move config to callback * fixes from rebase * fixes from rebase * chlog * docs * reformat * formatting * fix * fix * fixes from rebase * add new test for patience * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update tests/callbacks/test_early_stopping.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * fix formatting * remove enable_early_stop attribute * add state_dict for early stopping * move best attr after monitor_op defined * improve early stopping and model checkpoint callbacks * fix formatting * fix attr init order * clean up setting of default_root_dir attr * logger needs default root dir set first * reorg trainer init * remove direct references to checkpoint callback * more fixes * more bugfixes * run callbacks at epoch end * update tests to use on epoch end * PR cleanup * address failing tests * refactor for homogeneity * fix merge conflict * separate tests * tests for early stopping bug regressions * small fixes * revert model 
checkpoint change * typo fix * fix tests * update train loop * fix test case * appease the linter * fix some doctests * move config to callback * fixes from rebase * fixes from rebase * chlog * docs * reformat * formatting * fix * fix * fixes from rebase * add new test for patience * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/callbacks/model_checkpoint.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update tests/callbacks/test_early_stopping.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * fix formatting * remove enable_early_stop attribute * fix test with new epoch indexing * fix progress bar totals * fix off by one error (see #2289) epoch starts at 0 now * added missing imports * fix hpc_save folderpath * fix formatting * fix tests * small fixes from a rebase * fix * tmpdir * tmpdir * tmpdir * wandb * fix merge conflict * add back evaluation after training * test_resume_early_stopping_from_checkpoint TODO * undo the horovod check * update changelog * remove a duplicate test from merge error * try fix dp_resume test * add the logger fix from master * try remove default_root_dir * try mocking numpy * try import numpy in docs test * fix wandb test * pep 8 fix * skip if no amp * dont mock when doctesting * install extra * fix the resume ES test * undo conf.py changes * revert remove comet pickle from test * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update weights_loading.rst * Update weights_loading.rst * Update weights_loading.rst * renamed flag * renamed flag * revert the None check in logger experiment name/version * add the old comments * _experiment * test chckpointing on DDP * skip the ddp test on windows * cloudpickle * renamed flag * renamed flag * parentheses for clarity * apply suggestion max epochs Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: 
Jeremy Jordan <jtjordan@ncsu.edu> Co-authored-by: Jirka <jirka@pytorchlightning.ai> Co-authored-by: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-06-29 01:36:46 +00:00
def test_disabled_validation(tmpdir):
    """Check that ``limit_val_batches=0`` skips validation, except when ``fast_dev_run=True``."""

    class CurrentModel(BoringModel):
        validation_step_invoked = False
        validation_epoch_end_invoked = False

        def validation_step(self, *args, **kwargs):
            self.validation_step_invoked = True
            return super().validation_step(*args, **kwargs)

        def validation_epoch_end(self, *args, **kwargs):
            self.validation_epoch_end_invoked = True
            return super().validation_epoch_end(*args, **kwargs)

    model = CurrentModel()

    trainer_options = {
        "default_root_dir": tmpdir,
        "enable_progress_bar": False,
        "max_epochs": 2,
        "limit_train_batches": 0.4,
        "limit_val_batches": 0.0,
        "fast_dev_run": False,
    }

    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    # check that limit_val_batches=0 turns off validation
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 2
    assert not model.validation_step_invoked, "`validation_step` should not run when `limit_val_batches=0`"
    assert not model.validation_epoch_end_invoked, "`validation_epoch_end` should not run when `limit_val_batches=0`"

    # check that limit_val_batches has no influence when fast_dev_run is turned on
    model = CurrentModel()
    trainer_options["fast_dev_run"] = True

    trainer = Trainer(**trainer_options)
    trainer.fit(model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    assert trainer.current_epoch == 1
    assert model.validation_step_invoked, "did not run `validation_step` with `fast_dev_run=True`"
    assert model.validation_epoch_end_invoked, "did not run `validation_epoch_end` with `fast_dev_run=True`"
@pytest.mark.parametrize("track_grad_norm", [0, torch.tensor(1), "nan"])
def test_invalid_track_grad_norm(tmpdir, track_grad_norm):
    """Check that each parametrized ``track_grad_norm`` value makes ``Trainer`` raise."""
    expected_message = "`track_grad_norm` must be a positive number or 'inf'"
    with pytest.raises(MisconfigurationException, match=expected_message):
        Trainer(track_grad_norm=track_grad_norm, default_root_dir=tmpdir)
2021-09-01 08:49:00 +00:00
def test_on_exception_hook(tmpdir):
    """Test the ``on_exception`` callback hook and the trainer ``interrupted`` flag.

    One callback raises from inside the fit and test runs, a second one records
    whatever exception is handed to its ``on_exception`` hook.
    """

    class InterruptCallback(Callback):
        """Raises ``KeyboardInterrupt`` while fitting and ``MisconfigurationException`` while testing."""

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            raise KeyboardInterrupt

        def on_test_start(self, trainer, pl_module):
            raise MisconfigurationException

    class HandleInterruptCallback(Callback):
        """Stores the most recent exception passed to ``on_exception``."""

        def __init__(self):
            super().__init__()
            self.exception = None

        def on_exception(self, trainer, pl_module, exception):
            self.exception = exception

    model = BoringModel()
    raiser = InterruptCallback()
    recorder = HandleInterruptCallback()

    trainer = Trainer(
        callbacks=[raiser, recorder],
        max_epochs=1,
        limit_val_batches=0.1,
        limit_train_batches=0.2,
        enable_progress_bar=False,
        logger=False,
        default_root_dir=tmpdir,
    )

    # nothing has run yet: no interruption and no recorded exception
    assert not trainer.interrupted
    assert recorder.exception is None

    # `fit` is deliberately not wrapped in `pytest.raises`: the test expects the
    # KeyboardInterrupt to be handled by the trainer instead of propagating here
    trainer.fit(model)
    assert trainer.interrupted
    assert isinstance(recorder.exception, KeyboardInterrupt)

    # the exception raised during `test` is expected to propagate to the caller
    # and still reach the `on_exception` hook
    with pytest.raises(MisconfigurationException):
        trainer.test(model)
    assert trainer.interrupted
    assert isinstance(recorder.exception, MisconfigurationException)
@pytest.mark.parametrize("precision", [32, pytest.param(16, marks=RunIf(min_cuda_gpus=1))])
@RunIf(sklearn=True)
def test_gradient_clipping_by_norm(tmpdir, precision):
    """Test gradient clipping by norm.

    Runs a single optimization step with ``gradient_clip_algorithm="norm"`` and checks,
    right after the clipping hook has executed, that the total 2-norm of all gradients
    matches the configured clip value.
    """
    clip_val = 0.05

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=1,
        max_epochs=1,
        accelerator="auto",
        devices=1,
        precision=precision,
        gradient_clip_algorithm="norm",
        gradient_clip_val=clip_val,
    )

    class TestModel(ClassificationModel):
        def configure_gradient_clipping(self, *args, **kwargs):
            # let the default implementation perform the clipping first
            super().configure_gradient_clipping(*args, **kwargs)
            # test that gradient is clipped correctly: 2-norm over the per-parameter 2-norms
            per_param_norms = [torch.norm(param.grad.detach(), 2) for param in self.parameters()]
            total_norm = torch.norm(torch.stack(per_param_norms), 2)
            expected_norm = torch.tensor(clip_val, device=self.device)
            # NOTE(review): equality with the clip value presumes the unclipped norm exceeds it
            torch.testing.assert_close(total_norm, expected_norm)
            # flag checked after `fit` to prove this hook actually ran
            self.assertion_called = True

    model = TestModel()
    trainer.fit(model, ClassifDataModule())
    assert model.assertion_called
@pytest.mark.parametrize("precision", [32, pytest.param(16, marks=RunIf(min_cuda_gpus=1))])
def test_gradient_clipping_by_value(tmpdir, precision):
"""Test gradient clipping by value."""
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
trainer = Trainer(
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
default_root_dir=tmpdir,
max_steps=1,
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
max_epochs=1,
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
accelerator="auto",
devices=1,
precision=precision,
gradient_clip_algorithm="value",
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
gradient_clip_val=1e-10,
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
)
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
class TestModel(BoringModel):
def configure_gradient_clipping(self, *args, **kwargs):
super().configure_gradient_clipping(*args, **kwargs)
# test that gradient is clipped correctly
parameters = self.parameters()
grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters]
grad_max = torch.max(torch.stack(grad_max_list))
torch.testing.assert_close(grad_max.abs(), torch.tensor(1e-10, device=self.device))
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
self.assertion_called = True
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
model = TestModel()
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
trainer.fit(model)
Fix gradient norm tracking and gradient clipping (#9287) * WIP * Progress * Undo test change * Fix plugin closure execution order * Update CHANGELOG * Fix manual optimization on AMP and skipping backward * Fix for deepspeed * Typo * Hook test for manual closure * Add skipping test with AMP * You are hideous, apex * Add deepspeed test * Update CHANGELOG * Fix for broken master * Add RunIf * FIXMEs * Rename * Fix grad norm * add a simple test * update test * update test * update test * fix merge conflicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Sea of changes * Undo change * Introduce TPUPrecisionPlugin * Undo changes * Undo changes * Resolve FIXME * Undo change * Undo change * Undo change * Fix FIXMEs * Fix FIXME * Correct value * Bad merge * Fix circular imports * WIP * Fixing clipping * Fixes * Bad merge * Move optimizer step and clipping into the `PrecisionPlugin` * Fix AMP * Update CHANGELOG * Fix tests * Underscore * Progress * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove pre_optimizer_step * Missed one * Progress * Progress * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update FIXMEs * Fix test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix test * DeepSpeed warning. mypy * Rename * Finish tests * Update CHANGELOG * Dumb fixes * accelerator=auto * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Update on comments * Use ClassifModule Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
2021-10-28 15:23:27 +00:00
assert model.assertion_called
Add `Trainer(gradient_clip_algorithm='value'|'norm')` (#6123) * add changelog * add clip by value * fix bug in training tricks.rst * fix bug in trainer.rst * Update trainer.rst * Update trainer.rst * Update CHANGELOG.md Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/plugins/precision/deepspeed_precision.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update pytorch_lightning/utilities/enums.py Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * yapf formatting * update training tricks * update based on comment * update based on comment * Update pytorch_lightning/trainer/trainer.py Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> * update based on comment * pep8 * mypy * mypy * Update docs/source/advanced/training_tricks.rst Co-authored-by: thomas chaton <thomas@grid.ai> * Update sharded_native_amp.py * Update test_sharded_parity.py * update test codes * Update test_tpu.py * Update pytorch_lightning/trainer/connectors/training_trick_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update test_trainer.py * Update enums.py * Update enums.py * add super-class initialization to precision plugins. * add clip_grad horovod cpu test * add clip_grad horovod cpu test * use subprocess check_call * change order of horovod tests * set max_epochs 2 in horovod test * remove clip_grad_val test from horovod-cpu * remove "type: ignore" * divide clip grad val test in horovod * update based on comments * add super-class initialization to precision plugins. 
* bugfix * bugfix * revert some changes * revert some changes * Update tests/models/test_horovod.py * merge master * Delete signature test No point in testing a signature Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: thomas chaton <thomas@grid.ai> Co-authored-by: ananthsub <ananth.subramaniam@gmail.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz>
2021-04-06 13:27:37 +00:00
def test_invalid_gradient_clip_value(tmpdir):
    """A tuple passed as ``gradient_clip_val`` makes the ``Trainer`` constructor raise ``TypeError``."""
    expected_message = "`gradient_clip_val` should be an int or a float"
    with pytest.raises(TypeError, match=expected_message):
        Trainer(default_root_dir=tmpdir, gradient_clip_val=(1, 2))
def test_invalid_gradient_clip_algo(tmpdir):
    """An unknown ``gradient_clip_algorithm`` name makes the ``Trainer`` constructor raise."""
    expected_message = "`gradient_clip_algorithm` norm2 is invalid"
    with pytest.raises(MisconfigurationException, match=expected_message):
        Trainer(default_root_dir=tmpdir, gradient_clip_algorithm="norm2")
@RunIf(min_cuda_gpus=1)
def test_gpu_choice():
    """Check ``auto_select_gpus`` with every visible GPU and with one more than torch reports."""
    available = torch.cuda.device_count()

    # Asking for exactly the number of devices torch reports must construct without raising.
    Trainer(accelerator="gpu", devices=available, auto_select_gpus=True)

    # Asking for one device more than that must be rejected.
    too_many = available + 1
    with pytest.raises(MisconfigurationException, match=r".*but your machine only has.*"):
        Trainer(accelerator="gpu", devices=too_many, auto_select_gpus=True)
@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.5, 5])
def test_num_sanity_val_steps(tmpdir, limit_val_batches):
    """Test that the number of sanity check batches is clipped to `limit_val_batches`.

    The model exposes two validation dataloaders with different batch sizes. The number of
    ``_evaluation_step`` calls made during ``fit`` must equal, summed over the dataloaders,
    the smaller of ``num_sanity_val_steps`` and the trainer's per-dataloader ``num_val_batches``.
    """

    class CustomModel(BoringModel):
        def validation_step(self, batch, batch_idx, dataloader_idx):
            # Accept the extra ``dataloader_idx`` argument and defer to the parent implementation.
            return super().validation_step(batch, batch_idx)

        def val_dataloader(self):
            return [DataLoader(RandomDataset(32, 64)), DataLoader(RandomDataset(32, 64))]

    class CustomModelMixedVal(CustomModel):
        def val_dataloader(self):
            # Different batch sizes give the two dataloaders a different number of batches.
            return [DataLoader(RandomDataset(32, 64), batch_size=8), DataLoader(RandomDataset(32, 64))]

    num_sanity_val_steps = 4
    trainer = Trainer(
        default_root_dir=tmpdir,
        num_sanity_val_steps=num_sanity_val_steps,
        limit_val_batches=limit_val_batches,
        max_steps=1,
    )
    assert trainer.num_sanity_val_steps == num_sanity_val_steps

    # Only the mixed-batch-size model is ever fitted; the previously created (and immediately
    # overwritten) ``CustomModel`` instance was dead code and has been dropped.
    model = CustomModelMixedVal()
    model.validation_epoch_end = None

    # Wrap (rather than replace) the evaluation step so it still runs while calls are counted.
    val_epoch_loop = trainer.fit_loop.epoch_loop.val_loop.epoch_loop
    with patch.object(val_epoch_loop, "_evaluation_step", wraps=val_epoch_loop._evaluation_step) as mocked:
        trainer.fit(model)
        assert mocked.call_count == sum(
            min(num_sanity_val_steps, num_batches) for num_batches in trainer.num_val_batches
        )
@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.3])
def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
    """With ``num_sanity_val_steps=-1`` the evaluation step runs once per validation batch.

    The expected call count is the sum of the trainer's ``num_val_batches`` over both dataloaders.
    """

    class CustomModel(BoringModel):
        def validation_step(self, batch, batch_idx, dataloader_idx):
            # Accept the extra ``dataloader_idx`` argument and defer to the parent implementation.
            return super().validation_step(batch, batch_idx)

        def val_dataloader(self):
            return [DataLoader(RandomDataset(32, 64)), DataLoader(RandomDataset(32, 64))]

    model = CustomModel()
    model.validation_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        num_sanity_val_steps=-1,
        limit_val_batches=limit_val_batches,
        max_steps=1,
    )
    # The trainer exposes ``-1`` as positive infinity.
    assert trainer.num_sanity_val_steps == float("inf")

    # Wrap (rather than replace) the evaluation step so it still runs while calls are counted.
    val_epoch_loop = trainer.fit_loop.epoch_loop.val_loop.epoch_loop
    with patch.object(val_epoch_loop, "_evaluation_step", wraps=val_epoch_loop._evaluation_step) as mocked:
        trainer.fit(model, val_dataloaders=model.val_dataloader())
        assert mocked.call_count == sum(trainer.num_val_batches)
def test_trainer_subclassing():
    """``Trainer`` subclasses may add their own constructor arguments in two different ways."""
    model = BoringModel()

    # First way of pulling out args from signature is to list them
    class ListedArgsTrainer(Trainer):
        def __init__(self, custom_arg, *args, custom_kwarg="test", **kwargs):
            super().__init__(*args, **kwargs)
            self.custom_arg = custom_arg
            self.custom_kwarg = custom_kwarg

    listed = ListedArgsTrainer(123, custom_kwarg="custom", fast_dev_run=True)
    listed.fit(model)
    assert listed.state.finished, f"Training failed with {listed.state}"
    assert listed.custom_arg == 123
    assert listed.custom_kwarg == "custom"
    assert listed.fast_dev_run

    # Second way is to pop from the dict
    # It's a special case because Trainer does not have any positional args
    class PoppedKwargsTrainer(Trainer):
        def __init__(self, **kwargs):
            self.custom_arg = kwargs.pop("custom_arg", 0)
            self.custom_kwarg = kwargs.pop("custom_kwarg", "test")
            super().__init__(**kwargs)

    popped = PoppedKwargsTrainer(custom_kwarg="custom", fast_dev_run=True)
    popped.fit(model)
    assert popped.state.finished, f"Training failed with {popped.state}"
    assert popped.custom_kwarg == "custom"
    assert popped.fast_dev_run

    # when we pass in an unknown arg, the base class should complain
    unexpected_kwarg_message = r"__init__\(\) got an unexpected keyword argument 'abcdefg'"
    with pytest.raises(TypeError, match=unexpected_kwarg_message):
        PoppedKwargsTrainer(abcdefg="unknown_arg")
@RunIf(omegaconf=True)
@pytest.mark.parametrize(
    "trainer_params",
    [{"max_epochs": 1, "accelerator": "gpu", "devices": 1}, {"max_epochs": 1, "accelerator": "gpu", "devices": [0]}],
)
def test_trainer_omegaconf(cuda_count_1, trainer_params):
    """A ``Trainer`` can be built by unpacking an ``OmegaConf`` config as keyword arguments."""
    Trainer(**OmegaConf.create(trainer_params))
def test_trainer_pickle(tmpdir):
    """A freshly constructed ``Trainer`` can be dumped by ``pickle`` and then by ``cloudpickle``."""
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    for serializer in (pickle, cloudpickle):
        serializer.dumps(trainer)
@pytest.mark.parametrize("stage", ("fit", "validate", "test"))
def test_trainer_setup_call(tmpdir, stage):
    """The ``setup`` hook of both the model and the callback receives the stage being run."""

    class CurrentModel(BoringModel):
        def setup(self, stage):
            # Remember the stage so the test can inspect it afterwards.
            self.stage = stage

    class CurrentCallback(Callback):
        def setup(self, trainer, model, stage):
            assert model is not None
            self.stage = stage

    model = CurrentModel()
    callback = CurrentCallback()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, enable_checkpointing=False, callbacks=[callback])

    # Pick the entry point for the stage; anything other than fit/validate runs ``test``.
    entry_points = {"fit": trainer.fit, "validate": trainer.validate}
    run_stage = entry_points.get(stage, trainer.test)
    run_stage(model)

    assert callback.stage == stage
    assert model.stage == stage
@pytest.mark.parametrize("train_batches, max_steps, log_interval", [(10, 10, 1), (3, 10, 1), (3, 10, 5)])
@patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
    """The logger's ``log_metrics`` is called at every ``log_interval``-th step up to ``max_steps``."""

    class TestModel(BoringModel):
        def training_step(self, *args, **kwargs):
            # Log one metric on every training step so there is always something to flush.
            self.log("foo", -1)
            return super().training_step(*args, **kwargs)

    trainer = Trainer(
        default_root_dir=tmpdir,
        log_every_n_steps=log_interval,
        limit_train_batches=train_batches,
        limit_val_batches=0,
        max_steps=max_steps,
        logger=TensorBoardLogger(tmpdir),
    )
    trainer.fit(TestModel())

    # Steps are zero-based, so the first expected logging step is ``log_interval - 1``.
    logged_steps = range(log_interval - 1, max_steps, log_interval)
    log_metrics_mock.assert_has_calls([call(metrics=ANY, step=step) for step in logged_steps])
feature: Allow str arguments in Trainer.profiler (#3656) * allow trainer's profiler param to have a str value * add tests * update docs * update exception message * Update CHANGELOG * fix pep8 issues * cleanup test code Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Add deprecation warning if using bool for profiler * Add deprecation tests and move deprecated tests * Remove bool option to profiler from docs * Deprecate bool args to profiler in CHANGELOG * fixup! Add deprecation warning if using bool for profiler * fixup! Add deprecation tests and move deprecated tests * Apply suggestions from code review Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> * Implement suggestions, remove whitespace * fixup! Implement suggestions, remove whitespace * Allow bool, str (case insensitive), BaseProfiler * Add info about bool deprecation to trainer * fixup! Add info about bool deprecation to trainer * Move deprecate todo to test_deprecated * Test wrong profiler type, improve error message * fixup! Test wrong profiler type, improve error message * Update pytorch_lightning/trainer/connectors/profiler_connector.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Apply suggestions from code review * Readd bool to profiler types, test cli profiler arg * Remove extra whitespace in doc Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * Apply suggestions from code review Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * Update deprecation versions Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
2020-10-27 10:57:16 +00:00
class TestLightningDataModule(LightningDataModule):
def __init__(self, dataloaders):
super().__init__()
self._dataloaders = dataloaders
def test_dataloader(self):
return self._dataloaders
Add PredictLoop (#5752) * integrate distrib_type * sync changes * sync * fixes * add forgotten generators * add missing logic * update * import * missed imports * import fixes * isort * mv f * changelog * format * move helper to parallel plugin * d * add world size * clean up * duplicate * activate ddp_sharded and tpu * set nvidia flags * remove unused colab var * use_tpu <-> on_tpu attrs * make some ddp_cpu and clusterplugin tests pass * Ref/accelerator connector (#5742) * final cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * connector cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * trainer cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * accelerator cleanup + missing logic in accelerator connector Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * add missing changes to callbacks Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * reflect accelerator changes to lightning module Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * clean cluster envs Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * cleanup plugins Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * add broadcasting Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * yapf * remove plugin connector Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * plugins * add predict_loop * manual optimization * clean predictloop * update optimizer routing * add predict loop on new accelerator * resolve a bug * add rank to torchelastic * add predict_loop * add predict loop on new accelerator * resolve a bug * fix memory mixed precision * update * setstate on trainer for pickling in ddp spawn * add predict_loop * clean predictloop * add predict loop on new accelerator * resolve a bug * add predict_loop * add predict loop on new accelerator * resolve a bug * add predict_loop * add predict loop on new accelerator * resolve a bug * add predict_loop * add predict loop on new accelerator * resolve a bug * add 
predict_loop * clean predictloop * add predict loop on new accelerator * resolve a bug * add predict_loop * add predict loop on new accelerator * resolve a bug * resolve tests * add predict method * add back commented accelerator code * adapt test for sync_batch_norm to new plugin * fix deprecated tests * fix ddp cpu choice when no num_processes are given * yapf format * skip a memory test that cannot pass anymore * remove sanetize * rename train to run_train * remove useless hooks * add misconfigurationException * remove wrong naming * resolve some legacy * udpate docstring * fix pickle error in spawn plugin * x * avoid * x * fix cyclic import in docs build * add support for sharded * update typing * add sharded and sharded_spawn to distributed types * make unwrap model default * refactor LightningShardedDataParallel similar to LightningDistributedDataParallel * update sharded spawn to reflect changes * update sharded to reflect changes * Merge 1.1.5 changes * fix merge * fix merge * yapf isort * fix merge * yapf isort * fix indentation in test * copy over reinit scheduler implementation from dev1.2 * fix apex tracking calls with dev_debugger * reduce diff to dev1.2, clean up * fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu * sort plugin tests legacy/new * fix error handling for amp on cpu * fix merge fix merge fix merge * [Feat] Resolve manual_backward (#5837) * resolve manual_backward * resolve flake8 * update * resolve for ddp_spawn * resolve flake8 * resolve flake8 * resolve flake8 Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * fix tests/accelerator tests on cpu * [BugFix] Resolve manual optimization (#5852) * resolve manual_optimization * update * update Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856) * resovle a bug * Accelerator refactor sharded rpc (#5854) * rpc branch * merge * update handling of 
rpc * make devices etc. Optional in RPC * set devices etc. later if necessary * remove devices from sequential * make devices optional in rpc * fix import * uncomment everything * fix cluster selection Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * resolve bug * fix assert in rpc test * resolve a test * fix docs compilation * accelerator refactor - fix for sharded parity test (#5866) * fix memory issue with ddp_spawn * x x x x x x x x x * x * Remove DDP2 as this does not apply * Add missing pre optimizer hook to ensure lambda closure is called * fix apex docstring * [accelerator][BugFix] Resolve some test for 1 gpu (#5863) * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * update * resolve flake8 * update * update * update * update * update * all_gather * update * make plugins work, add misconfig for RPC * update * update * remove breaking test * resolve some tests * resolve flake8 * revert to ddp_spawn Co-authored-by: root <root@ip-172-31-88-60.ec2.internal> Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de> * yapf isort * resolve flake8 * fix apex doctests * fix apex doctests 2 * resolve docs * update drone * clean env * update * update * update * update * merge * Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881) * Fix RPC related tests, clean out old API, update for new accelerator API * Move tests out of legacy folder, update paths and names * Update test_remove_1-4.py * Expose properties for tpu cores/gpus/num_gpus * Add root GPU property * Move properties to properties.py * move tests that were previously in drone * Fix root GPU property (#5908) * Move root GPU to property, remove horovod set as 
this is handled in horovod plugin, ensure we mock correctly to set GPU accelerator * Add missing tests back * fix best model path transfer when no checkpoint callback available * Fix setup hook order [wip] (#5858) * Call trainer setup hook before accelerator setup * Add test case * add new test * typo * fix callback order in test Co-authored-by: tchaton <thomas@grid.ai> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * rename ddp sequential -> rpc sequential for special test * revert * fix stupid merge problem * Use property in connector for sampler (#5913) * merge the import conflicts * fix spawning of processes in slurm * [wip] Fix some bugs for TPU [skip ci] (#5878) * fixed for single tpu * fixed spawn * fixed spawn * update * update * wip * resolve bugs * resolve bug * update on comment * removed decorator * resolve comments * set to 4 * update * update * need cleaning * update * update * update * resolve flake8 * resolve bugs * exclude broadcast * resolve bugs * change test * update * update * skip if meet fails * properly raise trace * update * add catch * wrap test * resolve typo * update * typo Co-authored-by: Lezwon Castelino <lezwon@gmail.com> Co-authored-by: Your Name <you@example.com> * resolve some tests * update * fix imports * update * resolve flake8 * update azure pipeline * skip a sharded test on cpu that requires a gpu * resolve tpus * resolve bug * resolve flake8 * update * updat utils * revert permission change on files * suggestions from carlos Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * remove unrelated formatting changes * remove incomplete comment * Update pytorch_lightning/accelerators/__init__.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * remove unrelated formatting change * add types * warn 1.7 ddp manual backward only if ddp kwarg unset * yapf + isort * pep8 unused imports * fix cyclic import in docs * Apply suggestions from code review * typer in accelerator.py * typo * resolve flake8 * update 
code * update * Update pytorch_lightning/trainer/predict_loop.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * Update pytorch_lightning/trainer/predict_loop.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * fix merge * fix merge * reset legacy accelerator * add missing rename dispatch * rename post traning * update code * resolved comments * typo * typo * add flow description * resolve comments * update on comments * update flow * add backticks * resolve tpu Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: justusschock <justus.schock@posteo.de> Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de> Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: root <root@ip-172-31-88-60.ec2.internal> Co-authored-by: Lezwon Castelino <lezwon@gmail.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-16 22:11:56 +00:00
def predict_dataloader(self):
    """Return the dataloaders stored on this instance for prediction."""
    loaders = self._dataloaders
    return loaders
2021-04-27 20:23:55 +00:00
class CustomPredictionWriter(BasePredictionWriter):
    """Prediction writer that validates hook inputs and records which hooks ran.

    Both ``write_on_*_called`` flags default to ``False`` at class level and are
    set to ``True`` on the instance once the matching hook has been invoked.
    """

    write_on_batch_end_called = False
    write_on_epoch_end_called = False

    def __init__(self, output_dir: str, *args, **kwargs):
        """Forward ``*args``/``**kwargs`` to the base writer and keep ``output_dir``."""
        super().__init__(*args, **kwargs)
        self.output_dir = output_dir

    def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *args, **kwargs):
        """Check the shape of one batch prediction and flag the hook as called."""
        expected_shape = torch.Size([1, 2])
        assert prediction.shape == expected_shape
        assert len(batch_indices) == 1
        self.write_on_batch_end_called = True

    def write_on_epoch_end(self, trainer, pl_module, predictions, batch_indices):
        """Check the per-dataloader prediction counts and flag the hook as called."""
        # One entry per dataloader is expected when running distributed, two otherwise.
        if trainer._accelerator_connector.is_distributed:
            per_loader = 1
        else:
            per_loader = 2

        assert len(predictions) == 2
        assert len(predictions[0]) == per_loader
        assert len(batch_indices) == 2
        assert len(batch_indices[0]) == per_loader
        self.write_on_epoch_end_called = True

    def on_predict_epoch_end(self, trainer, pl_module, outputs):
        """Verify the sampler types in distributed runs, then defer to the base hook."""
        if trainer._accelerator_connector.is_distributed:
            for loader_idx in (0, 1):
                batch_sampler = trainer.predict_dataloaders[loader_idx].batch_sampler
                assert isinstance(batch_sampler.sampler, UnrepeatedDistributedSampler)
                assert isinstance(batch_sampler, IndexBatchSamplerWrapper)
        super().on_predict_epoch_end(trainer, pl_module, outputs)
2021-04-27 20:23:55 +00:00
def predict(
    tmpdir,
    strategy=None,
    accelerator=None,
    devices=None,
    model=None,
    plugins=None,
    datamodule=True,
    enable_progress_bar=True,
    use_callbacks=True,
):
    """Run ``Trainer.predict`` over two small dataloaders and check the outcome.

    Args:
        tmpdir: Used as the trainer's ``default_root_dir`` and the writers' output dir.
        strategy: Forwarded to ``Trainer``; ``"ddp_spawn"`` additionally triggers a check
            that ``return_predictions=True`` raises ``ProcessRaisedException``.
        accelerator: Forwarded to ``Trainer``.
        devices: Forwarded to ``Trainer``.
        model: Model to predict with; a ``BoringModel`` is created when falsy.
        plugins: Forwarded to ``Trainer``.
        datamodule: If truthy, predict through the datamodule, otherwise pass the
            dataloaders directly.
        enable_progress_bar: Forwarded to ``Trainer``.
        use_callbacks: If truthy, attach one batch-interval and one epoch-interval
            ``CustomPredictionWriter`` and assert on their flags afterwards.
    """
    loaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)) for _ in range(2)]
    model = model if model else BoringModel()
    dm = TestLightningDataModule(loaders)

    batch_writer = CustomPredictionWriter(tmpdir, write_interval="batch")
    epoch_writer = CustomPredictionWriter(tmpdir, write_interval="epoch")

    trainer_kwargs = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        log_every_n_steps=1,
        enable_model_summary=False,
        strategy=strategy,
        accelerator=accelerator,
        devices=devices,
        plugins=plugins,
        enable_progress_bar=enable_progress_bar,
        callbacks=[batch_writer, epoch_writer] if use_callbacks else [],
    )
    trainer = Trainer(**trainer_kwargs)

    if strategy == "ddp_spawn":
        with pytest.raises(ProcessRaisedException, match="`return_predictions` should be set to `False`"):
            trainer.predict(model, datamodule=dm, return_predictions=True)

    source_kwargs = {"datamodule": dm} if datamodule else {"dataloaders": loaders}
    results = trainer.predict(model, **source_kwargs)

    # No callback or result assertions are made for spawn-based strategies.
    if isinstance(trainer.strategy, DDPSpawnStrategy):
        return

    if use_callbacks:
        assert batch_writer.write_on_batch_end_called
        assert not batch_writer.write_on_epoch_end_called
        assert not epoch_writer.write_on_batch_end_called
        assert epoch_writer.write_on_epoch_end_called

    if strategy == "ddp":
        num_samples = 1
    else:
        num_samples = 2
    assert len(results) == 2
    assert len(results[0]) == num_samples
    assert results[0][0].shape == torch.Size([1, 2])
def test_trainer_predict_no_return(tmpdir):
    """Check that ``trainer.predict`` emits a warning when ``predict_step`` returns ``None``."""

    class CustomBoringModel(BoringModel):
        def predict_step(self, batch, batch_idx, dataloader_idx=0):
            # Every second batch (odd ``batch_idx``) yields no prediction.
            returns_nothing = (batch_idx + 1) % 2 == 0
            if not returns_nothing:
                return super().predict_step(batch, batch_idx, dataloader_idx)
            return None

    silent_model = CustomBoringModel()
    with pytest.warns(UserWarning, match="predict returned None"):
        predict(tmpdir, model=silent_model, use_callbacks=False)
def test_trainer_predict_grad(tmpdir):
    """Check ``grad_fn`` tracking inside ``predict_step`` and again after ``predict``.

    Inside ``predict_step`` an op on the batch must not record a ``grad_fn``; after
    prediction, the same op on a ``requires_grad`` tensor must record one.
    """

    class CustomBoringModel(BoringModel):
        def predict_step(self, batch, batch_idx, dataloader_idx=0):
            expanded = batch.expand_as(batch)
            assert expanded.grad_fn is None
            return super().predict_step(batch, batch_idx, dataloader_idx)

    predict(tmpdir, model=CustomBoringModel(), use_callbacks=False)

    # NOTE(review): presumably guards against predict leaving autograd disabled — confirm.
    probe = torch.zeros(1, requires_grad=True)
    assert probe.expand_as(probe).grad_fn is not None
@pytest.mark.parametrize("enable_progress_bar", [False, True])
@pytest.mark.parametrize("datamodule", [False, True])
def test_trainer_predict_cpu(tmpdir, datamodule, enable_progress_bar):
    """Run the shared ``predict`` helper for each datamodule / progress-bar combination."""
    options = {"datamodule": datamodule, "enable_progress_bar": enable_progress_bar}
    predict(tmpdir, **options)
@RunIf(min_cuda_gpus=2, standalone=True)
@pytest.mark.parametrize(
    "kwargs",
    [
        {"strategy": "dp", "devices": 1},
        {"strategy": "dp", "devices": 2},
        {"strategy": "ddp", "devices": 2},
    ],
)
def test_trainer_predict_standalone(tmpdir, kwargs):
    """Run the shared ``predict`` helper on GPU with each strategy/devices combination."""
    gpu_options = dict(kwargs, accelerator="gpu")
    predict(tmpdir, **gpu_options)
@pytest.mark.parametrize(
    "accelerator",
    [
        pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)),
        pytest.param("mps", marks=RunIf(mps=True)),
    ],
)
def test_trainer_predict_1_gpu(tmpdir, accelerator):
    """Run the shared ``predict`` helper on a single device of the given accelerator."""
    single_device = {"accelerator": accelerator, "devices": 1}
    predict(tmpdir, **single_device)
@RunIf(skip_windows=True)
@pytest.mark.parametrize("accelerator", ["cpu", pytest.param("gpu", marks=RunIf(min_cuda_gpus=2))])
def test_trainer_predict_ddp_spawn(tmpdir, accelerator):
    """Run the shared ``predict`` helper with the ``ddp_spawn`` strategy on two devices."""
    spawn_options = {"strategy": "ddp_spawn", "accelerator": accelerator, "devices": 2}
    predict(tmpdir, **spawn_options)
@pytest.mark.parametrize("dataset_cls", [RandomDataset, RandomIterableDatasetWithLen, RandomIterableDataset])
def test_index_batch_sampler_wrapper_with_iterable_dataset(dataset_cls, tmpdir):
    """Check which datasets provide ``batch_indices`` to a prediction writer.

    ``batch_indices`` must be falsy when the dataset is an ``IterableDataset`` and
    truthy otherwise; eight predictions are expected in every case.
    """
    dataset = dataset_cls(32, 8)
    is_iterable_dataset = isinstance(dataset, IterableDataset)
    loader = DataLoader(dataset)

    class CustomPredictionWriter(BasePredictionWriter):
        def __init__(self, output_dir: str, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.output_dir = output_dir

        def write_on_batch_end(self, trainer, pl_module, prediction, batch_indices, *args, **kwargs):
            if is_iterable_dataset:
                assert not batch_indices
            else:
                assert batch_indices

    writer = CustomPredictionWriter(tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, callbacks=writer)
    outputs = trainer.predict(BoringModel(), dataloaders=loader)
    assert len(outputs) == 8
2022-01-19 21:27:12 +00:00
def test_spawn_predict_return_predictions(tmpdir):
    """Check that ``return_predictions=True`` fails under the ``ddp_spawn`` strategy.

    The failure is expected to surface as a ``ProcessRaisedException`` whose message
    says that ``return_predictions`` should be set to ``False``.
    """
    model = BoringModel()
    spawn_config = {"accelerator": "cpu", "strategy": "ddp_spawn", "devices": 2}
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, **spawn_config)
    assert isinstance(trainer.strategy, DDPSpawnStrategy)

    expected_message = "`return_predictions` should be set to `False`"
    with pytest.raises(ProcessRaisedException, match=expected_message):
        trainer.predict(model, dataloaders=model.train_dataloader(), return_predictions=True)
@pytest.mark.parametrize("return_predictions", [None, False, True])
@pytest.mark.parametrize("precision", [32, 64])
def test_predict_return_predictions_cpu(return_predictions, precision, tmpdir):
    """Check the predictions returned on CPU for each ``return_predictions`` and precision.

    When ``return_predictions`` is truthy or ``None``, one prediction of shape
    ``[1, 2]`` is expected, with a dtype matching the requested precision.
    """
    seed_everything(42)
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, precision=precision)
    loader = model.train_dataloader()
    preds = trainer.predict(model, dataloaders=loader, return_predictions=return_predictions)

    # Nothing to inspect when predictions were explicitly not requested.
    if not return_predictions and return_predictions is not None:
        return

    first = preds[0]
    assert len(preds) == 1
    assert first.shape == torch.Size([1, 2])
    expected_dtype = {64: torch.float64}.get(precision, torch.float32)
    assert first.dtype == expected_dtype
@pytest.mark.parametrize(["max_steps", "max_epochs", "global_step"], [(10, 5, 10), (20, None, 20)])
def test_repeated_fit_calls_with_max_epochs_and_steps(tmpdir, max_steps, max_epochs, global_step):
    """Check that repeated ``trainer.fit`` calls stay bound by ``max_steps``/``max_epochs``.

    After the first call ``global_step`` must equal the expected value, and a second
    call must leave it unchanged.
    """
    num_samples, samples_per_batch = 200, 10
    train_data = DataLoader(RandomDataset(32, num_samples), batch_size=samples_per_batch)
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, max_steps=max_steps, max_epochs=max_epochs)

    # The second iteration verifies that fitting again does not advance the step count.
    for _ in range(2):
        trainer.fit(model, train_data)
        assert trainer.global_step == global_step
PoC: Accelerator refactor (#5743) * restoring the result from subprocess * fix queue.get() order for results * add missing "block_backward_sync" context manager * add missing "block_backward_sync" context manager * fix sync_batchnorm * fix supported gpu-ids for tuple * fix clip gradients and inf recursion * accelerator selection: added cluster_environment plugin * fix torchelastic test * fix reduce early stopping decision for DDP * fix tests: callbacks, conversion to lightning optimizer * fix lightning optimizer does not pickle * fix setting benchmark and deterministic option * fix slurm amp test * fix prepare_data test and determine node_rank * fix retrieving last path when testing * remove obsolete plugin argument * fix test: test_trainer_config * fix torchscript tests * fix trainer.model access * move properties * fix test_transfer_batch_hook * fix auto_select_gpus * fix omegaconf test * fix test that needs to simulate slurm ddp * add horovod plugin * fix test with named arguments * clean up whitespace * fix datamodules test * remove old accelerators * fix naming * move old plugins * move to plugins * create precision subpackage * create training_type subpackage * fix all new import errors * fix wrong arguments order passed to test * fix LR finder * Added sharded training type and amp plugin * Move clip grad to precision plugin * Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically * Fix import issue, attempting to fix tests * Fix initial test * Reflect hook logic from master, should wrap model after move to device * Optional state consolidation, since master has optimizers not wrapped * change attribute for instance test * reset optimizers optimizers are not used in main process, so state would be wrong. 
* legacy * imports in accel * legacy2 * trainer imports * fix import errors after rebase * move hook to new setup location * provide unwrapping logic * fix trainer callback system * added ddp2 implementation * fix imports .legacy * move plugins * restore legacy * drop test.py from root * add tpu accelerator and plugins * fixes * fix lightning optimizer merge * reset bugreportmodel * unwrapping * step routing forward * model access * unwrap * opt * integrate distrib_type * sync changes * sync * fixes * add forgotten generators * add missing logic * update * import * missed imports * import fixes * isort * mv f * changelog * format * move helper to parallel plugin * d * add world size * clean up * duplicate * activate ddp_sharded and tpu * set nvidia flags * remove unused colab var * use_tpu <-> on_tpu attrs * make some ddp_cpu and clusterplugin tests pass * Ref/accelerator connector (#5742) * final cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * connector cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * trainer cleanup Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * accelerator cleanup + missing logic in accelerator connector Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * add missing changes to callbacks Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * reflect accelerator changes to lightning module Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * clean cluster envs Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * cleanup plugins Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * add broadcasting Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * yapf * remove plugin connector Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * plugins * manual optimization * update optimizer routing * add rank to torchelastic * fix memory mixed precision * setstate on trainer for pickling in ddp spawn * add predict method * add back commented accelerator code * adapt 
test for sync_batch_norm to new plugin * fix deprecated tests * fix ddp cpu choice when no num_processes are given * yapf format * skip a memory test that cannot pass anymore * fix pickle error in spawn plugin * x * avoid * x * fix cyclic import in docs build * add support for sharded * update typing * add sharded and sharded_spawn to distributed types * make unwrap model default * refactor LightningShardedDataParallel similar to LightningDistributedDataParallel * update sharded spawn to reflect changes * update sharded to reflect changes * Merge 1.1.5 changes * fix merge * fix merge * yapf isort * fix merge * yapf isort * fix indentation in test * copy over reinit scheduler implementation from dev1.2 * fix apex tracking calls with dev_debugger * reduce diff to dev1.2, clean up * fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu * sort plugin tests legacy/new * fix error handling for amp on cpu * fix merge fix merge fix merge * [Feat] Resolve manual_backward (#5837) * resolve manual_backward * resolve flake8 * update * resolve for ddp_spawn * resolve flake8 * resolve flake8 * resolve flake8 Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * fix tests/accelerator tests on cpu * [BugFix] Resolve manual optimization (#5852) * resolve manual_optimization * update * update Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856) * resovle a bug * Accelerator refactor sharded rpc (#5854) * rpc branch * merge * update handling of rpc * make devices etc. Optional in RPC * set devices etc. 
later if necessary * remove devices from sequential * make devices optional in rpc * fix import * uncomment everything * fix cluster selection Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> * resolve bug * fix assert in rpc test * resolve a test * fix docs compilation * accelerator refactor - fix for sharded parity test (#5866) * fix memory issue with ddp_spawn * x x x x x x x x x * x * Remove DDP2 as this does not apply * Add missing pre optimizer hook to ensure lambda closure is called * fix apex docstring * [accelerator][BugFix] Resolve some test for 1 gpu (#5863) * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * update * resolve flake8 * update * update * update * update * update * all_gather * update * make plugins work, add misconfig for RPC * update * update * remove breaking test * resolve some tests * resolve flake8 * revert to ddp_spawn Co-authored-by: root <root@ip-172-31-88-60.ec2.internal> Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> Co-authored-by: Justus Schock <justus.schock@rwth-aachen.de> * yapf isort * resolve flake8 * fix apex doctests * fix apex doctests 2 * resolve docs * update drone * clean env * update * update * update * update * merge * Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881) * Fix RPC related tests, clean out old API, update for new accelerator API * Move tests out of legacy folder, update paths and names * Update test_remove_1-4.py * Expose properties for tpu cores/gpus/num_gpus * Add root GPU property * Move properties to properties.py * move tests that were previously in drone * Fix root GPU property (#5908) * Move root GPU to property, remove horovod set as this is handled in horovod plugin, ensure we mock correctly 
to set GPU accelerator * Add missing tests back * fix best model path transfer when no checkpoint callback available * Fix setup hook order [wip] (#5858) * Call trainer setup hook before accelerator setup * Add test case * add new test * typo * fix callback order in test Co-authored-by: tchaton <thomas@grid.ai> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * rename ddp sequential -> rpc sequential for special test * revert * fix stupid merge problem * Use property in connector for sampler (#5913) * merge the import conflicts * fix spawning of processes in slurm * [wip] Fix some bugs for TPU [skip ci] (#5878) * fixed for single tpu * fixed spawn * fixed spawn * update * update * wip * resolve bugs * resolve bug * update on comment * removed decorator * resolve comments * set to 4 * update * update * need cleaning * update * update * update * resolve flake8 * resolve bugs * exclude broadcast * resolve bugs * change test * update * update * skip if meet fails * properly raise trace * update * add catch * wrap test * resolve typo * update * typo Co-authored-by: Lezwon Castelino <lezwon@gmail.com> Co-authored-by: Your Name <you@example.com> * resolve some tests * update * fix imports * update * resolve flake8 * update azure pipeline * skip a sharded test on cpu that requires a gpu * resolve tpus * resolve bug * resolve flake8 * update * updat utils * revert permission change on files * suggestions from carlos Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * remove unrelated formatting changes * remove incomplete comment * Update pytorch_lightning/accelerators/__init__.py Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> * remove unrelated formatting change * add types * warn 1.7 ddp manual backward only if ddp kwarg unset * yapf + isort * pep8 unused imports * fix cyclic import in docs * Apply suggestions from code review * typer in accelerator.py * typo * Apply suggestions from code review * formatting * update on comments * update typo * 
Update pytorch_lightning/trainer/properties.py Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * update * suggestion from code review * suggestion from code review Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> Co-authored-by: SeanNaren <sean@grid.ai> Co-authored-by: Jirka Borovec <jirka.borovec@seznam.cz> Co-authored-by: chaton <thomas@grid.ai> Co-authored-by: Ubuntu <ubuntu@ip-172-31-88-60.ec2.internal> Co-authored-by: Sean Naren <sean.narenthiran@gmail.com> Co-authored-by: root <root@ip-172-31-88-60.ec2.internal> Co-authored-by: Lezwon Castelino <lezwon@gmail.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2021-02-12 20:48:56 +00:00
def test_trainer_access_in_configure_optimizers(tmpdir):
    """Check that ``self.trainer`` is available inside ``configure_optimizers``."""

    class TestModel(BoringModel):
        def configure_optimizers(self):
            trainer_ref = self.trainer
            assert trainer_ref is not None, "Expect to have access to the trainer within `configure_optimizers`"

    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
    model = TestModel()
    Trainer(default_root_dir=tmpdir, fast_dev_run=True).fit(model, train_data)
@pytest.mark.parametrize(
    "accelerator",
    [
        pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)),
        pytest.param("mps", marks=RunIf(mps=True)),
    ],
)
def test_setup_hook_move_to_device_correctly(tmpdir, accelerator):
    """Check that a layer created inside the ``setup`` hook ends up on the device used for training.

    The model below only creates ``new_layer`` in ``setup`` and then uses it in ``training_step``; a
    ``fast_dev_run`` fit on a single device of the parametrized accelerator must complete without error.
    """

    class TestModel(BoringModel):
        def setup(self, stage: str) -> None:
            # The extra layer is created here rather than in ``__init__``.
            self.new_layer = torch.nn.Linear(2, 2)

        def training_step(self, batch, batch_idx):
            hidden = self.layer(batch)
            # will crash if not moved to correct device
            prediction = self.new_layer(hidden)
            return {"loss": self.loss(batch, prediction)}

    # fake data
    loader = torch.utils.data.DataLoader(RandomDataset(32, 64))

    # model and trainer
    module = TestModel()
    runner = Trainer(default_root_dir=tmpdir, fast_dev_run=True, accelerator=accelerator, devices=1)
    runner.fit(module, loader)
def test_train_loop_system(tmpdir):
    """Check the order in which optimizer and model methods are called under automatic optimization.

    The recorded order per batch is:

    1. ``optimizer.step`` (skipped on batches that only accumulate gradients)
    2. ``model.training_step``
    3. ``optimizer.zero_grad`` (only on the first batch of an accumulation window)
    4. ``model.backward``

    Note that the order is NOT ``training_step`` -> ``zero_grad`` -> ``backward`` -> ``step``. This is
    because ``optimizer.step(closure)`` calls ``closure()``, which in turn runs the three remaining
    methods ``training_step``, ``zero_grad`` and ``backward``.
    """
    call_log = []
    shared_kwargs = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 5,
        "limit_val_batches": 1,
        "limit_test_batches": 1,
        "enable_progress_bar": False,
    }

    class TestOptimizer(SGD):
        """SGD that records every ``step`` and ``zero_grad`` call."""

        def step(self, *args, **kwargs):
            call_log.append("step")
            return super().step(*args, **kwargs)

        def zero_grad(self, *args, **kwargs):
            call_log.append("zero_grad")
            return super().zero_grad(*args, **kwargs)

    class TestModel(BoringModel):
        """BoringModel that records every ``training_step`` and ``backward`` call."""

        def configure_optimizers(self):
            return TestOptimizer(self.parameters(), lr=0.1)

        def training_step(self, *args, **kwargs):
            call_log.append("training_step")
            return super().training_step(*args, **kwargs)

        def backward(self, *args, **kwargs):
            call_log.append("backward")
            return super().backward(*args, **kwargs)

    model = TestModel()

    # --- without gradient accumulation: the same four calls repeat for every batch ---
    trainer = Trainer(**shared_kwargs)
    # No methods are called yet.
    assert call_log == []
    trainer.fit(model)
    one_batch = ["step", "training_step", "zero_grad", "backward"]
    assert call_log == one_batch * trainer.limit_train_batches
    call_log.clear()

    # --- accumulating gradients over 3 batches ---
    trainer = Trainer(**shared_kwargs, accumulate_grad_batches=3)
    # No methods are called yet.
    assert call_log == []
    trainer.fit(model)
    calls_per_batch = [
        ["training_step", "zero_grad", "backward"],  # 0
        ["training_step", "backward"],  # 1
        ["step", "training_step", "backward"],  # 2
        ["training_step", "zero_grad", "backward"],  # 3
        ["step", "training_step", "backward"],  # 4
    ]
    assert call_log == [name for batch_calls in calls_per_batch for name in batch_calls]
def test_check_val_every_n_epoch_exception(tmpdir):
    """Constructing a ``Trainer`` with a float ``check_val_every_n_epoch`` raises ``MisconfigurationException``."""
    invalid_kwargs = {"default_root_dir": tmpdir, "max_epochs": 1, "check_val_every_n_epoch": 1.2}
    with pytest.raises(MisconfigurationException, match="should be an integer."):
        Trainer(**invalid_kwargs)
def test_exception_when_testing_or_validating_with_fast_dev_run():
    """Resolving ``ckpt_path="best"`` while testing with ``fast_dev_run=True`` raises a ``ValueError``."""
    trainer = Trainer(fast_dev_run=True)
    trainer.state.fn = TrainerFn.TESTING

    expected_message = r"with `fast_dev_run=True`. .* pass an exact checkpoint path"
    with pytest.raises(ValueError, match=expected_message):
        connector = trainer._checkpoint_connector
        connector._set_ckpt_path(trainer.state.fn, ckpt_path="best", model_provided=False, model_connected=True)
class TrainerStagesModel(BoringModel):
    """BoringModel whose ``on_*_start`` hooks assert the ``training`` flag of the module and of ``trainer.model``.

    Both flags must be set when training starts and unset when validation, testing or prediction starts.
    """

    def on_train_start(self) -> None:
        trainer_model = self.trainer.model
        assert trainer_model.training
        assert self.training

    def on_validation_start(self) -> None:
        trainer_model = self.trainer.model
        assert not trainer_model.training
        assert not self.training

    def on_test_start(self) -> None:
        trainer_model = self.trainer.model
        assert not trainer_model.training
        assert not self.training

    def on_predict_start(self) -> None:
        trainer_model = self.trainer.model
        assert not trainer_model.training
        assert not self.training
@pytest.mark.parametrize(
    "strategy,devices",
    [
        (None, 1),
        pytest.param("ddp_spawn", 1, marks=RunIf(skip_windows=True)),
    ],
)
def test_model_in_correct_mode_during_stages(tmpdir, strategy, devices):
    """Run fit, validate, test and predict with ``TrainerStagesModel``, whose hooks assert the train/eval mode."""
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "strategy": strategy,
        "accelerator": "cpu",
        "devices": devices,
        "fast_dev_run": True,
    }
    module = TrainerStagesModel()
    trainer = Trainer(**trainer_kwargs)

    trainer.fit(module)
    trainer.validate(module)
    trainer.test(module)
    trainer.predict(module, module.val_dataloader())
class TestDummyModelForCheckpoint(BoringModel):
    """BoringModel that logs its validation loss under the key ``"x"``."""

    def validation_step(self, batch, batch_idx):
        prediction = self.layer(batch)
        self.log("x", self.loss(batch, prediction))

    def validation_epoch_end(self, outputs) -> None:
        """No-op override."""
2022-03-27 21:31:20 +00:00
@RunIf(skip_windows=True)
def test_fit_test_synchronization(tmpdir):
    """Test that the trainer synchronizes processes before returning control back to the caller.

    After a two-process ``ddp_spawn`` fit on CPU, the best checkpoint reported by the ``ModelCheckpoint``
    callback must exist on disk, and a following ``trainer.test()`` call must run.
    """
    ckpt_callback = ModelCheckpoint(dirpath=tmpdir, monitor="x", mode="min", save_top_k=1)
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "max_epochs": 2,
        "strategy": "ddp_spawn",
        "accelerator": "cpu",
        "devices": 2,
        "callbacks": [ckpt_callback],
    }
    module = TestDummyModelForCheckpoint()
    trainer = Trainer(**trainer_kwargs)

    trainer.fit(module)
    best_path = ckpt_callback.best_model_path
    assert os.path.exists(best_path), f"Could not find checkpoint at rank {trainer.global_rank}"
    trainer.test()
class CustomCallbackOnLoadCheckpoint(Callback):
    """Callback whose ``state_dict`` is the fixed mapping ``{"a": None}``."""

    def state_dict(self) -> dict:
        state = {"a": None}
        return state
def test_on_load_checkpoint_missing_callbacks(tmpdir):
    """Test a warning appears when callbacks in the checkpoint don't match callbacks provided when resuming.

    The first fit runs with a ``CustomCallbackOnLoadCheckpoint`` attached; the second trainer resumes from
    the last checkpoint without it and must emit a ``UserWarning`` naming that callback.
    """
    module = BoringModel()
    ckpt_callback = ModelCheckpoint(dirpath=tmpdir, save_last=True)

    initial_callbacks = [ckpt_callback, CustomCallbackOnLoadCheckpoint()]
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=3, callbacks=initial_callbacks)
    trainer.fit(module)

    # Resume without the custom callback.
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=5)
    with pytest.warns(UserWarning, match="CustomCallbackOnLoadCheckpoint"):
        trainer.fit(module, ckpt_path=ckpt_callback.last_model_path)
def test_module_current_fx_attributes_reset(tmpdir):
    """Ensure that lightning module's attributes related to current fx are reset at the end of execution."""
    module = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1, enable_checkpointing=False, logger=False)

    # ``_current_fx_name`` must be ``None`` again after each entry point returns.
    for run_stage in (trainer.fit, trainer.test):
        run_stage(module)
        assert module._current_fx_name is None
def test_exception_when_lightning_module_is_not_set_on_trainer():
    """Calling ``validate``, ``test`` or ``predict`` on a fresh ``Trainer`` without a model is a misconfiguration."""
    trainer = Trainer()
    expectations = (
        (trainer.validate, r"`model` must be provided.*validate"),
        (trainer.test, r"`model` must be provided.*test"),
        (trainer.predict, r"`model` must be provided.*predict"),
    )
    for entry_point, message in expectations:
        with pytest.raises(MisconfigurationException, match=message):
            entry_point()
class CustomException(Exception):
    """Exception type defined by this test module so tests can raise and match a distinct error."""
@RunIf(min_cuda_gpus=2, standalone=True)
def test_ddp_terminate_when_deadlock_is_detected(tmpdir):
    """Test that DDP kills the remaining processes when only one rank is throwing an exception."""

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            fail_on_this_rank = batch_idx == 1 and self.trainer.is_global_zero
            if not fail_on_this_rank:
                return super().training_step(batch, batch_idx)
            # rank 0: raises an exception
            # rank 1: continues training but will hang on the next barrier in the training loop
            raise CustomException

    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "max_epochs": 1,
        "limit_train_batches": 5,
        "num_sanity_val_steps": 0,
        "accelerator": "gpu",
        "devices": 2,
        "strategy": "ddp",
        "enable_progress_bar": False,
        "enable_model_summary": False,
    }
    module = TestModel()
    trainer = Trainer(**trainer_kwargs)

    # simulate random failure in training_step on rank 0
    with pytest.raises(DeadlockDetectedException, match="CustomException"):
        trainer.fit(module)
@RunIf(min_cuda_gpus=1)
def test_multiple_trainer_constant_memory_allocated(tmpdir):
    """Check that the CUDA memory allocated on device 0 does not grow across repeated trainer runs.

    The allocation is sampled before anything runs, after a first fit, after deep-copying the trainer and
    after a second fit with a new trainer; every later sample must be no larger than the first.
    """

    class TestModel(BoringModel):
        def training_step(self, batch, batch_idx):
            step_output = super().training_step(batch, batch_idx)
            self.log("train_loss", step_output["loss"])
            return step_output

        def configure_optimizers(self):
            return torch.optim.Adam(self.layer.parameters(), lr=0.1)

    class Check(Callback):
        def on_train_epoch_start(self, trainer, *_):
            # While training, the strategy is expected to hold the DDP-wrapped model.
            assert isinstance(trainer.strategy.model, DistributedDataParallel)

    def current_memory():
        # before measuring the memory force release any leftover allocations, including CUDA tensors
        gc.collect()
        return torch.cuda.memory_allocated(0)

    baseline = current_memory()

    module = TestModel()
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "fast_dev_run": True,
        "accelerator": "gpu",
        "devices": 1,
        "strategy": "ddp",
        "enable_progress_bar": False,
        "callbacks": Check(),
    }

    # first run
    trainer = Trainer(**trainer_kwargs)
    trainer.fit(module)

    # After fitting, the strategy holds the plain module again and state lives on the CPU.
    assert trainer.strategy.model is module
    assert list(trainer.optimizers[0].state.values())[0]["exp_avg_sq"].device == torch.device("cpu")
    assert trainer.callback_metrics["train_loss"].device == torch.device("cpu")
    assert current_memory() <= baseline

    # copying the trainer must not leave allocations behind either
    deepcopy(trainer)
    assert current_memory() <= baseline

    # second run with a fresh trainer on the same module
    trainer_2 = Trainer(**trainer_kwargs)
    trainer_2.fit(module)
    assert current_memory() <= baseline
class TrainerStagesErrorsModel(BoringModel):
    """BoringModel whose ``on_*_start`` hooks each raise an ``Exception`` naming the stage."""

    def on_train_start(self) -> None:
        message = "Error during train"
        raise Exception(message)

    def on_validation_start(self) -> None:
        message = "Error during validation"
        raise Exception(message)

    def on_test_start(self) -> None:
        message = "Error during test"
        raise Exception(message)

    def on_predict_start(self) -> None:
        message = "Error during predict"
        raise Exception(message)
class ExceptionCounter(Callback):
    """Callback that counts how many times its ``on_exception`` hook has been invoked."""

    # Class-level default; the first increment stores the running count on the instance.
    exceptions = 0

    def on_exception(self, *_):
        self.exceptions = self.exceptions + 1
@pytest.mark.parametrize("strategy", [None, pytest.param("ddp_spawn", marks=RunIf(skip_windows=True))])
def test_error_handling_all_stages(tmpdir, strategy):
    """Each trainer entry point re-raises the model's error and triggers ``on_exception`` exactly once.

    ``TrainerStagesErrorsModel`` raises at the start of every stage; after each failing call the
    ``ExceptionCounter`` callback must have been notified one more time.
    """
    module = TrainerStagesErrorsModel()
    counter = ExceptionCounter()
    trainer_kwargs = {
        "default_root_dir": tmpdir,
        "strategy": strategy,
        "devices": 1,
        "callbacks": counter,
        "fast_dev_run": True,
    }
    trainer = Trainer(**trainer_kwargs)

    # (call to make, message the raised exception must match), in execution order
    stage_runs = [
        (lambda: trainer.fit(module), r"Error during train"),
        (lambda: trainer.validate(module), r"Error during validation"),
        (lambda: trainer.test(module), r"Error during test"),
        (
            lambda: trainer.predict(module, module.val_dataloader(), return_predictions=False),
            r"Error during predict",
        ),
    ]
    for expected_count, (run_stage, message) in enumerate(stage_runs, start=1):
        with pytest.raises(Exception, match=message):
            run_stage()
        assert counter.exceptions == expected_count
def test_trainer_metrics_reset_before_each_task(tmpdir):
    """Test that callback, logged and progress bar metrics are reset before each task starts."""

    class TestMetricRestartCallback(Callback):
        """Asserts at the start of every task that the trainer's metric dictionaries are empty."""

        def _make_assertions(self, trainer):
            metric_dicts = (trainer.callback_metrics, trainer.progress_bar_metrics, trainer.logged_metrics)
            for metrics in metric_dicts:
                assert metrics == {}

        def on_train_start(self, trainer, *args, **kwargs):
            self._make_assertions(trainer)

        def on_validation_start(self, trainer, *args, **kwargs):
            # Only check when the trainer function is VALIDATING; other functions also reach this hook.
            if trainer.state.fn == TrainerFn.VALIDATING:
                self._make_assertions(trainer)

        def on_test_start(self, trainer, *args, **kwargs):
            self._make_assertions(trainer)

        def on_predict_start(self, trainer, *args, **kwargs):
            self._make_assertions(trainer)

    class CustomBoringModel(BoringModel):
        """BoringModel that logs one metric per train/validation/test step."""

        def __init__(self):
            super().__init__()

        def training_step(self, *args, **kwargs):
            self.log("train/metric", 7.0)
            return super().training_step(*args, **kwargs)

        def validation_step(self, *args, **kwargs):
            self.log("val/metric", 14.0)
            return super().validation_step(*args, **kwargs)

        def test_step(self, *args, **kwargs):
            self.log("test/metric", 21.0)
            return super().test_step(*args, **kwargs)

    module = CustomBoringModel()
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=4, callbacks=[TestMetricRestartCallback()])

    # Run every task in sequence on the same trainer; the callback performs the checks.
    for run_task in (trainer.fit, trainer.validate, trainer.test, trainer.predict):
        run_task(module)
def test_detect_anomaly_nan(tmpdir):
    """With ``detect_anomaly=True``, a NaN loss makes ``fit`` raise a ``RuntimeError`` and emit a warning.

    The expected warning is the one that carries the traceback of the forward call that caused the error.
    """

    class NanModel(BoringModel):
        def training_step(self, batch, batch_idx):
            step_output = super().training_step(batch, batch_idx)
            # Multiply the loss by NaN so the resulting loss is NaN.
            nan_factor = torch.tensor(float("nan"))
            step_output["loss"] = step_output["loss"] * nan_factor
            return step_output

    module = NanModel()
    trainer = Trainer(default_root_dir=tmpdir, detect_anomaly=True)

    error_pattern = r"returned nan values in its 0th output."
    warning_pattern = r".*Error detected in.* Traceback of forward call that caused the error.*"
    with pytest.raises(RuntimeError, match=error_pattern), pytest.warns(UserWarning, match=warning_pattern):
        trainer.fit(module)
@pytest.mark.parametrize(
    ["trainer_kwargs", "strategy_cls", "strategy_name", "accelerator_cls", "devices"],
    [
        ({"strategy": None}, SingleDeviceStrategy, "single_device", CPUAccelerator, 1),
        ({"strategy": "dp"}, DDPStrategy, "ddp", CPUAccelerator, 1),
        ({"strategy": "ddp"}, DDPStrategy, "ddp", CPUAccelerator, 1),
        ({"strategy": "ddp", "num_nodes": 2}, DDPStrategy, "ddp", CPUAccelerator, 1),
        (
            {"strategy": None, "accelerator": "cuda", "devices": 1},
            SingleDeviceStrategy,
            "single_device",
            CUDAAccelerator,
            1,
        ),
        ({"strategy": "dp", "accelerator": "cuda", "devices": 1}, DataParallelStrategy, "dp", CUDAAccelerator, 1),
        ({"strategy": "ddp", "accelerator": "cuda", "devices": 1}, DDPStrategy, "ddp", CUDAAccelerator, 1),
        (
            {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 1},
            DDPSpawnStrategy,
            "ddp_spawn",
            CUDAAccelerator,
            1,
        ),
        ({"strategy": None, "accelerator": "cuda", "devices": 2}, DDPSpawnStrategy, "ddp_spawn", CUDAAccelerator, 2),
        ({"strategy": "dp", "accelerator": "cuda", "devices": 2}, DataParallelStrategy, "dp", CUDAAccelerator, 2),
        ({"strategy": "ddp", "accelerator": "cuda", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2),
        ({"strategy": "ddp", "accelerator": "cpu", "devices": 2}, DDPStrategy, "ddp", CPUAccelerator, 2),
        (
            {"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 2},
            DDPSpawnStrategy,
            "ddp_spawn",
            CPUAccelerator,
            2,
        ),
        (
            {"strategy": "ddp_spawn", "accelerator": "cpu", "devices": 1},
            DDPSpawnStrategy,
            "ddp_spawn",
            CPUAccelerator,
            1,
        ),
        (
            {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1},
            DDPFullyShardedStrategy,
            "ddp_fully_sharded",
            CUDAAccelerator,
            1,
        ),
        (
            {"strategy": DDPSpawnStrategy(), "accelerator": "cpu", "devices": 2},
            DDPSpawnStrategy,
            "ddp_spawn",
            CPUAccelerator,
            2,
        ),
        (
            {"strategy": DDPSpawnStrategy(), "accelerator": "cuda", "devices": 2},
            DDPSpawnStrategy,
            "ddp_spawn",
            CUDAAccelerator,
            2,
        ),
        ({"strategy": DDPStrategy()}, DDPStrategy, "ddp", CPUAccelerator, 1),
        ({"strategy": DDPStrategy(), "accelerator": "cuda", "devices": 2}, DDPStrategy, "ddp", CUDAAccelerator, 2),
        (
            {"strategy": DataParallelStrategy(), "accelerator": "cuda", "devices": 2},
            DataParallelStrategy,
            "dp",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": DDPFullyShardedStrategy(), "accelerator": "cuda", "devices": 2},
            DDPFullyShardedStrategy,
            "ddp_fully_sharded",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": DDPSpawnShardedStrategy(), "accelerator": "cuda", "devices": 2},
            DDPSpawnShardedStrategy,
            "ddp_sharded_spawn",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": DDPShardedStrategy(), "accelerator": "cuda", "devices": 2},
            DDPShardedStrategy,
            "ddp_sharded",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": "ddp_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2},
            DDPSpawnStrategy,
            "ddp_spawn",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": "ddp_fully_sharded", "accelerator": "cuda", "devices": 1, "num_nodes": 2},
            DDPFullyShardedStrategy,
            "ddp_fully_sharded",
            CUDAAccelerator,
            1,
        ),
        (
            {"strategy": "ddp_sharded", "accelerator": "cuda", "devices": 2, "num_nodes": 2},
            DDPShardedStrategy,
            "ddp_sharded",
            CUDAAccelerator,
            2,
        ),
        (
            {"strategy": "ddp_sharded_spawn", "accelerator": "cuda", "devices": 2, "num_nodes": 2},
            DDPSpawnShardedStrategy,
            "ddp_sharded_spawn",
            CUDAAccelerator,
            2,
        ),
    ],
)
def test_trainer_config_strategy(monkeypatch, trainer_kwargs, strategy_cls, strategy_name, accelerator_cls, devices):
    """Test that ``Trainer(**trainer_kwargs)`` selects the expected strategy, accelerator and device/node counts.

    Args:
        monkeypatch: pytest fixture, handed to ``mock_cuda_count`` for the CUDA cases.
        trainer_kwargs: keyword arguments forwarded to ``Trainer``; never modified by the test.
        strategy_cls: class the resulting ``trainer.strategy`` must be an instance of.
        strategy_name: expected value of ``strategy_cls.strategy_name``.
        accelerator_cls: class the resulting ``trainer.accelerator`` must be an instance of.
        devices: expected value of ``trainer.num_devices``.
    """
    if trainer_kwargs.get("accelerator") == "cuda":
        # mock the CUDA device count to the number of devices this case requests
        mock_cuda_count(monkeypatch, trainer_kwargs["devices"])

    trainer = Trainer(**trainer_kwargs)

    assert isinstance(trainer.strategy, strategy_cls)
    assert strategy_cls.strategy_name == strategy_name
    assert isinstance(trainer.accelerator, accelerator_cls)
    assert trainer.num_devices == devices
    # `num_nodes` falls back to 1 when the case does not set it
    assert trainer.num_nodes == trainer_kwargs.get("num_nodes", 1)
@pytest.mark.parametrize(
    "running_stage", [RunningStage.TRAINING, RunningStage.VALIDATING, RunningStage.TESTING, RunningStage.PREDICTING]
)
def test_dataloaders_are_not_loaded_if_disabled_through_limit_batches(running_stage):
    """Test that resetting a stage's dataloader leaves it ``None`` when ``limit_<stage>_batches=0``."""
    prefix = running_stage.dataloader_prefix
    trainer = Trainer(**{f"limit_{prefix}_batches": 0})
    model = BoringModel()
    trainer._data_connector.attach_data(model)

    # look up and invoke the reset hook that belongs to the stage under test
    getattr(trainer, f"reset_{prefix}_dataloader")(model)

    # training exposes a singular attribute; the other stages use `<prefix>_dataloaders`
    if running_stage == RunningStage.TRAINING:
        loaded = trainer.train_dataloader
    else:
        loaded = getattr(trainer, f"{prefix}_dataloaders")
    assert loaded is None
@pytest.mark.parametrize(
    ["trainer_kwargs", "expected_device_ids"],
    [
        ({}, [0]),
        ({"devices": 1}, [0]),
        ({"devices": "1"}, [0]),
        ({"devices": 2}, [0, 1]),
        ({"accelerator": "gpu", "devices": 1}, [0]),
        ({"accelerator": "cuda", "devices": 1}, [0]),
        ({"accelerator": "cuda", "devices": 2}, [0, 1]),
        ({"accelerator": "cuda", "devices": "2"}, [0, 1]),
        ({"accelerator": "cuda", "devices": [2]}, [2]),
        ({"accelerator": "cuda", "devices": "2,"}, [2]),
        ({"accelerator": "cuda", "devices": [0, 2]}, [0, 2]),
        ({"accelerator": "cuda", "devices": "0, 2"}, [0, 2]),
        ({"accelerator": "ipu", "devices": 1}, [0]),
        ({"accelerator": "ipu", "devices": 2}, [0, 1]),
        pytest.param({"accelerator": "mps", "devices": 1}, [0], marks=RunIf(min_torch="1.12")),
    ],
)
def test_trainer_config_device_ids(monkeypatch, trainer_kwargs, expected_device_ids):
    """Test that ``Trainer`` exposes the expected ``device_ids`` and ``num_devices`` for each configuration.

    Args:
        monkeypatch: pytest fixture used to fake availability of the requested accelerator.
        trainer_kwargs: keyword arguments forwarded to ``Trainer``.
        expected_device_ids: list that ``trainer.device_ids`` must equal.
    """
    accelerator = trainer_kwargs.get("accelerator")
    if accelerator in ("cuda", "gpu"):
        mock_cuda_count(monkeypatch, 4)
    elif accelerator == "mps":
        # "gpu" is handled by the CUDA branch above, so only "mps" can reach this branch
        mock_mps_count(monkeypatch, 1)
    elif accelerator == "ipu":
        monkeypatch.setattr(pytorch_lightning.accelerators.ipu.IPUAccelerator, "is_available", lambda: True)
        # NOTE(review): the flag is replaced with a (truthy) callable rather than `True` - confirm how it is read
        monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", lambda: True)

    trainer = Trainer(**trainer_kwargs)

    assert trainer.device_ids == expected_device_ids
    assert trainer.num_devices == len(expected_device_ids)
def test_trainer_save_checkpoint_no_model_attached():
    """Test that ``save_checkpoint`` raises ``AttributeError`` on a trainer without a model."""
    detached_trainer = Trainer()
    assert detached_trainer.model is None

    expected_message = "Saving a checkpoint is only possible if a model is attached"
    with pytest.raises(AttributeError, match=expected_message):
        detached_trainer.save_checkpoint("checkpoint.ckpt")
def test_trainer_calls_logger_finalize_on_exception(tmpdir):
    """Test that the logger is finalized exactly once with ``"failed"`` when ``fit`` raises."""

    class CustomModel(BoringModel):
        """BoringModel that raises from ``on_fit_start``."""

        def on_fit_start(self):
            super().on_fit_start()
            raise Exception("logger-finalize")

    model = CustomModel()

    # replace `finalize` with a mock so the call can be inspected afterwards
    tb_logger = TensorBoardLogger(save_dir=tmpdir)
    finalize_spy = Mock()
    tb_logger.finalize = finalize_spy

    trainer = Trainer(logger=tb_logger)
    with pytest.raises(Exception, match="logger-finalize"):
        trainer.fit(model)

    finalize_spy.assert_called_once_with("failed")
# TODO: replace with 1.14 when it is released
@RunIf(min_torch="1.14.0.dev20221202")
def test_trainer_compiled_model():
    """Test fitting a ``torch.compile``-d model, uncompiling it, and the per-strategy compile support.

    The test checks, in order:

    * after fitting a compiled model, ``trainer.model._compiler_ctx["compiler"]`` is ``"dynamo"``;
    * after ``to_uncompiled()``, ``_compiler_ctx`` is ``None`` on the model and on ``trainer.model`` after a refit;
    * fitting a compiled model with ``DDPShardedStrategy`` raises ``RuntimeError``;
    * fitting a compiled model with ``DDPStrategy`` completes.
    """
    model = BoringModel()
    model = torch.compile(model)

    data = BoringDataModule()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=1,
    )
    trainer.fit(model, data)

    assert trainer.model._compiler_ctx["compiler"] == "dynamo"

    model = model.to_uncompiled()

    assert model._compiler_ctx is None

    trainer.fit(model)

    assert trainer.model._compiler_ctx is None

    model = torch.compile(model)
    # `strategy` expects a registered name or a Strategy *instance*, so the classes are instantiated here
    trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPShardedStrategy())
    with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
        trainer.fit(model)

    trainer = Trainer(max_epochs=1, limit_train_batches=1, limit_val_batches=1, strategy=DDPStrategy())
    trainer.fit(model)