fixing misleading tested acc values (#5876)
* fixing tested values
* .
* tests
* yapf
* softmax
* hvd
* rename
* lr
* duplicate
* drop
* classif
* rm EvalModel
* Revert "rm EvalModel"
  This reverts commit 6c3fb39ebe.
* update tests
* fix
* azure
* azure
* self
* cpu
* Apply suggestions from code review
  Co-authored-by: rohitgr7 <rohitgr1998@gmail.com>
Parent: ebabe56f4e
Commit: 1c851b89e1
@@ -20,7 +20,8 @@ from argparse import ArgumentParser
 import torch
 
 from pytorch_lightning import seed_everything, Trainer
-from tests.base import EvalModelTemplate
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 
 def main():
@@ -35,24 +36,28 @@ def main():
     parser.set_defaults(accelerator="ddp")
     args = parser.parse_args()
 
-    model = EvalModelTemplate()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
     trainer = Trainer.from_argparse_args(args)
 
-    result = {}
     if args.trainer_method == 'fit':
-        trainer.fit(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': None}
-    if args.trainer_method == 'test':
-        result = trainer.test(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': result}
-    if args.trainer_method == 'fit_test':
-        trainer.fit(model)
-        result = trainer.test(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': result}
+        trainer.fit(model, datamodule=dm)
+        result = None
+    elif args.trainer_method == 'test':
+        result = trainer.test(model, datamodule=dm)
+    elif args.trainer_method == 'fit_test':
+        trainer.fit(model, datamodule=dm)
+        result = trainer.test(model, datamodule=dm)
+    else:
+        raise ValueError(f'Unsupported: {args.trainer_method}')
 
-    if len(result) > 0:
-        file_path = os.path.join(args.tmpdir, 'ddp.result')
-        torch.save(result, file_path)
+    result_ext = {
+        'status': 'complete',
+        'method': args.trainer_method,
+        'result': result,
+    }
+    file_path = os.path.join(args.tmpdir, 'ddp.result')
+    torch.save(result_ext, file_path)
 
 
 if __name__ == '__main__':
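The script and the pytest process communicate only through this `ddp.result` file: the spawned DDP run serializes a status dict with torch.save, and the test deserializes it to assert on accuracy. A minimal sketch of that round trip (the helper names here are ours, not the repo's):

    import os
    import tempfile
    import torch

    def save_result(tmpdir, method, result):
        # what the spawned script does on completion: persist a status dict
        payload = {'status': 'complete', 'method': method, 'result': result}
        torch.save(payload, os.path.join(tmpdir, 'ddp.result'))

    def load_result(tmpdir):
        # what the test process does afterwards: read the dict back and check it
        payload = torch.load(os.path.join(tmpdir, 'ddp.result'))
        assert payload['status'] == 'complete'
        return payload['result']

    tmpdir = tempfile.mkdtemp()
    save_result(tmpdir, 'fit', None)
    print(load_result(tmpdir))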
@@ -23,14 +23,13 @@ from tests.accelerators import ddp_model, DDPLauncher
 from tests.helpers.boring_model import BoringModel
 from tests.utilities.distributed import call_training_script
 
+CLI_ARGS = '--max_epochs 1 --gpus 2 --accelerator ddp'
+
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_fit_only(tmpdir):
     # call the script
-    std, err = call_training_script(ddp_model, cli_args, 'fit', tmpdir, timeout=120)
+    call_training_script(ddp_model, CLI_ARGS, 'fit', tmpdir, timeout=120)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -40,13 +39,10 @@ def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args):
     assert result['status'] == 'complete'
 
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_test_only(tmpdir):
     # call the script
-    call_training_script(ddp_model, cli_args, 'test', tmpdir)
+    call_training_script(ddp_model, CLI_ARGS, 'test', tmpdir)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -56,13 +52,10 @@ def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
     assert result['status'] == 'complete'
 
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_fit_test(tmpdir):
     # call the script
-    call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20)
+    call_training_script(ddp_model, CLI_ARGS, 'fit_test', tmpdir, timeout=20)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -73,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
 
     model_outs = result['result']
     for out in model_outs:
-        assert out['test_acc'] > 0.90
+        assert out['test_acc'] > 0.7
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
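Each of these tests was parametrized over a one-element list, which only added an unused fixture argument; a module-level CLI_ARGS constant says the same thing more directly. If several flag combinations ever need coverage again, the idiomatic route is to parametrize over multiple strings, sketched here with a hypothetical second config:

    import pytest

    @pytest.mark.parametrize('cli_args', [
        '--max_epochs 1 --gpus 2 --accelerator ddp',
        '--max_epochs 2 --gpus 2 --accelerator ddp',  # hypothetical second config
    ])
    def test_example(cli_args):
        assert '--accelerator ddp' in cli_args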
@@ -20,7 +20,9 @@ from pytorch_lightning.callbacks import EarlyStopping
 from pytorch_lightning.core import memory
 from pytorch_lightning.trainer import Trainer
 from pytorch_lightning.trainer.states import TrainerState
-from tests.base import EvalModelTemplate
+from tests.helpers import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -29,7 +31,7 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
 
     trainer_options = dict(
         default_root_dir=tmpdir,
-        callbacks=[EarlyStopping()],
+        callbacks=[EarlyStopping(monitor='train_acc')],
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
@@ -37,8 +39,9 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
         accelerator='ddp_spawn',
     )
 
-    model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    tpipes.run_model_test(trainer_options, model, dm)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -55,7 +58,7 @@ def test_multi_gpu_model_ddp_spawn(tmpdir):
         progress_bar_refresh_rate=0,
     )
 
-    model = EvalModelTemplate()
+    model = BoringModel()
 
     tpipes.run_model_test(trainer_options, model)
 
@@ -68,7 +71,7 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
     """Make sure DDP works with dataloaders passed to fit()"""
     tutils.set_random_master_port()
 
-    model = EvalModelTemplate()
+    model = BoringModel()
     fit_options = dict(train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
 
     trainer = Trainer(
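EarlyStopping() with no arguments watches its default monitor key, which the new ClassificationModel never logs; the callback has to name a metric the model actually produces via self.log. A minimal sketch of the pairing (mode='max' and patience=3 are our additions for clarity; the test itself keeps the callback defaults):

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import EarlyStopping

    # EarlyStopping can only track a key that some *_step hook logs with
    # self.log(...); ClassificationModel logs 'train_acc', so name it here.
    trainer = Trainer(
        max_epochs=50,
        callbacks=[EarlyStopping(monitor='train_acc', mode='max', patience=3)],
    )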
@@ -11,27 +11,61 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from unittest import mock
 
 import pytest
 import torch
+import torch.nn.functional as F
 
 import pytorch_lightning as pl
 import tests.helpers.pipelines as tpipes
 import tests.helpers.utils as tutils
 from pytorch_lightning.callbacks import EarlyStopping
 from pytorch_lightning.core import memory
-from tests.base import EvalModelTemplate
 from tests.helpers import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 PRETEND_N_OF_GPUS = 16
 
 
+class CustomClassificationModelDP(ClassificationModel):
+
+    def _step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        return {'logits': logits, 'y': y}
+
+    def training_step(self, batch, batch_idx):
+        out = self._step(batch, batch_idx)
+        loss = F.cross_entropy(out['logits'], out['y'])
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def test_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def validation_step_end(self, outputs):
+        self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))
+
+    def test_step_end(self, outputs):
+        self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_early_stop_dp(tmpdir):
     """Make sure DDP works. with early stopping"""
     tutils.set_random_master_port()
 
+    dm = ClassifDataModule()
+    model = CustomClassificationModelDP()
+
     trainer_options = dict(
         default_root_dir=tmpdir,
-        callbacks=[EarlyStopping()],
+        callbacks=[EarlyStopping(monitor='val_acc')],
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
@@ -39,8 +73,7 @@ def test_multi_gpu_early_stop_dp(tmpdir):
         accelerator='dp',
     )
 
-    model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model)
+    tpipes.run_model_test(trainer_options, model, dm)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -57,7 +90,7 @@ def test_multi_gpu_model_dp(tmpdir):
         progress_bar_refresh_rate=0,
     )
 
-    model = EvalModelTemplate()
+    model = BoringModel()
 
     tpipes.run_model_test(trainer_options, model)
 
@@ -65,14 +98,13 @@ def test_multi_gpu_model_dp(tmpdir):
     memory.get_memory_profile('min_max')
 
 
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_dp_test(tmpdir):
     tutils.set_random_master_port()
 
-    import os
-    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
-
-    model = EvalModelTemplate()
+    dm = ClassifDataModule()
+    model = CustomClassificationModelDP()
     trainer = pl.Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
@@ -81,17 +113,17 @@ def test_dp_test(tmpdir):
         gpus=[0, 1],
         accelerator='dp',
     )
-    trainer.fit(model)
+    trainer.fit(model, datamodule=dm)
     assert 'ckpt' in trainer.checkpoint_callback.best_model_path
-    results = trainer.test()
+    results = trainer.test(datamodule=dm)
     assert 'test_acc' in results[0]
 
-    old_weights = model.c_d1.weight.clone().detach().cpu()
+    old_weights = model.layer_0.weight.clone().detach().cpu()
 
-    results = trainer.test(model)
+    results = trainer.test(model, datamodule=dm)
     assert 'test_acc' in results[0]
 
     # make sure weights didn't change
-    new_weights = model.c_d1.weight.clone().detach().cpu()
+    new_weights = model.layer_0.weight.clone().detach().cpu()
 
     assert torch.all(torch.eq(old_weights, new_weights))
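Under DataParallel the input batch is scattered across GPUs and each replica's *_step sees only its shard, so any metric computed inside validation_step or test_step would be a per-GPU value. That is why CustomClassificationModelDP returns raw logits and targets from the steps and computes accuracy in *_step_end, which runs once on the root device after the shards are gathered. A CPU-only sketch of the pattern (the class name and layer sizes are ours):

    import torch
    from pytorch_lightning import LightningModule

    class DPStyleModel(LightningModule):
        # sketch of the step/step_end split used above

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 3)

        def forward(self, x):
            return self.layer(x)

        def validation_step(self, batch, batch_idx):
            # runs once per device shard under DP: return raw tensors only
            x, y = batch
            return {'logits': self(x), 'y': y}

        def validation_step_end(self, outputs):
            # runs once with the shards concatenated, so the metric
            # sees the full batch instead of a per-GPU slice
            acc = (outputs['logits'].argmax(dim=1) == outputs['y']).float().mean()
            self.log('val_acc', acc)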
@@ -111,7 +111,7 @@ class EvalModelTemplate(
         x = self.c_d1_drop(x)
 
         x = self.c_d2(x)
-        logits = F.log_softmax(x, dim=1)
+        logits = F.softmax(x, dim=1)
 
         return logits
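For the accuracy checks in these tests the swap from log_softmax to softmax changes nothing: log is monotonic, so argmax-based predictions are identical. It only matters to consumers that care about the scale of the outputs, e.g. NLLLoss expects log-probabilities while probability-based metrics want softmax. A quick check:

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 10)
    # monotonic transform: identical predictions either way
    assert torch.equal(
        F.softmax(x, dim=1).argmax(dim=1),
        F.log_softmax(x, dim=1).argmax(dim=1),
    )
    # but the values differ: softmax rows sum to 1, log_softmax rows do not
    print(F.softmax(x, dim=1).sum(dim=1))
    print(F.log_softmax(x, dim=1).sum(dim=1))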
@@ -385,9 +385,8 @@ def test_full_loop_dp(tmpdir):
             return {'logits': logits, 'y': y}
 
         def training_step(self, batch, batch_idx):
-            _, y = batch
             out = self._step(batch, batch_idx)
-            loss = F.cross_entropy(out['logits'], y)
+            loss = F.cross_entropy(out['logits'], out['y'])
             return loss
 
         def validation_step(self, batch, batch_idx):
@@ -13,39 +13,41 @@
 # limitations under the License.
 import torch
 
-from pytorch_lightning import LightningDataModule, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, Trainer
+from pytorch_lightning.metrics.functional import accuracy
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.utilities import DistributedType
 from tests.helpers import BoringModel
 from tests.helpers.utils import get_default_logger, load_model_from_checkpoint, reset_seed
 
 
-def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50):
+def run_model_test_without_loggers(
+    trainer_options: dict, model: LightningModule, data: LightningDataModule = None, min_acc: float = 0.50
+):
     reset_seed()
 
     # fit model
     trainer = Trainer(**trainer_options)
-    trainer.fit(model)
+    trainer.fit(model, datamodule=data)
 
     # correct result and ok accuracy
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
 
-    pretrained_model = load_model_from_checkpoint(
-        trainer.logger, trainer.checkpoint_callback.best_model_path, type(model)
-    )
+    model2 = load_model_from_checkpoint(trainer.logger, trainer.checkpoint_callback.best_model_path, type(model))
 
     # test new model accuracy
-    test_loaders = model.test_dataloader()
+    test_loaders = model2.test_dataloader() if not data else data.test_dataloader()
     if not isinstance(test_loaders, list):
         test_loaders = [test_loaders]
 
-    for dataloader in test_loaders:
-        run_prediction(pretrained_model, dataloader, min_acc=min_acc)
+    if not isinstance(model2, BoringModel):
+        for dataloader in test_loaders:
+            run_prediction_eval_model_template(model2, dataloader, min_acc=min_acc)
 
 
 def run_model_test(
     trainer_options,
-    model,
+    model: LightningModule,
+    data: LightningDataModule = None,
     on_gpu: bool = True,
     version=None,
@@ -76,8 +78,9 @@ def run_model_test(
     if not isinstance(test_loaders, list):
         test_loaders = [test_loaders]
 
-    for dataloader in test_loaders:
-        run_prediction(pretrained_model, dataloader, min_acc=min_acc)
+    if not isinstance(model, BoringModel):
+        for dataloader in test_loaders:
+            run_prediction_eval_model_template(model, dataloader, min_acc=min_acc)
 
     if with_hpc:
         if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2):
@@ -92,50 +95,17 @@ def run_model_test(
     trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)
 
 
-def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25):
-    if isinstance(trained_model, BoringModel):
-        return _boring_model_run_prediction(trained_model, dataloader, min_acc)
-    else:
-        return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc=min_acc)
-
-
-def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50):
-    # run prediction on 1 batch
-    batch = next(iter(dataloader))
-    x, y = batch
-    x = x.view(x.size(0), -1)
-
-    if dp:
-        with torch.no_grad():
-            output = trained_model(batch, 0)
-        acc = output['val_acc']
-        acc = torch.mean(acc).item()
-
-    else:
-        with torch.no_grad():
-            y_hat = trained_model(x)
-        y_hat = y_hat.cpu()
-
-        # acc
-        labels_hat = torch.argmax(y_hat, dim=1)
-
-        y = y.cpu()
-        acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
-        acc = torch.tensor(acc)
-        acc = acc.item()
-
-    assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
-
-
-# TODO: This test compares a loss value with a min accuracy - complete non-sense!
-# create BoringModels that make actual predictions!
-def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25):
+@torch.no_grad()
+def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50):
     # run prediction on 1 batch
+    trained_model.cpu()
+    trained_model.eval()
+
     batch = next(iter(dataloader))
     x, y = batch
+    x = x.flatten(1)
 
-    with torch.no_grad():
-        output = trained_model(batch)
+    y_hat = trained_model(x)
+    acc = accuracy(y_hat.cpu(), y.cpu(), top_k=2).item()
 
-    acc = trained_model.loss(batch, output)
-    assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}"
+    assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})"
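The rewritten helper scores with the functional accuracy metric at top_k=2: a sample counts as correct when the true class is among the two highest-scoring outputs, which keeps the threshold meaningful for a small, quickly trained classifier. A hand-rolled equivalent, as a sketch in plain torch:

    import torch

    def top_k_accuracy(logits, y, k=2):
        # correct when the target appears among the k best-scoring classes
        topk = logits.topk(k, dim=1).indices           # shape (N, k)
        correct = (topk == y.unsqueeze(1)).any(dim=1)  # shape (N,)
        return correct.float().mean().item()

    logits = torch.tensor([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
    y = torch.tensor([2, 0])
    print(top_k_accuracy(logits, y))  # 1.0 -- both targets land in the top-2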
@@ -0,0 +1,30 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tests.models.data.horovod.train_default_model import run_test_from_config
+
+
+def test_horovod_model_script(tmpdir):
+    """This just for testing/debugging horovod script without horovod..."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        deterministic=True,
+    )
+    run_test_from_config(trainer_options, check_size=False, on_gpu=False)
@@ -37,7 +37,6 @@ else:
     print('You requested to import Horovod which is missing or not supported for your OS.')
 
 from tests.helpers import BoringModel  # noqa: E402
-from tests.helpers.pipelines import run_prediction  # noqa: E402
 from tests.helpers.utils import reset_seed, set_random_master_port  # noqa: E402
 
 parser = argparse.ArgumentParser()
@@ -45,7 +44,7 @@ parser.add_argument('--trainer-options', required=True)
 parser.add_argument('--on-gpu', action='store_true', default=False)
 
 
-def run_test_from_config(trainer_options):
+def run_test_from_config(trainer_options, on_gpu, check_size=True):
     """Trains the default model with the given config."""
     set_random_master_port()
     reset_seed()
@@ -60,7 +59,8 @@ def run_test_from_config(trainer_options):
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
 
     # Horovod should be initialized following training. If not, this will raise an exception.
-    assert hvd.size() == 2
+    if check_size:
+        assert hvd.size() == 2
 
     if trainer.global_rank > 0:
         return
@@ -74,15 +74,16 @@ def run_test_from_config(trainer_options):
         test_loaders = [test_loaders]
 
     for dataloader in test_loaders:
-        run_prediction(pretrained_model, dataloader)
+        batch = next(iter(dataloader))
+        pretrained_model(batch)
 
     # test HPC saving
     trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
     # test HPC loading
     checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
-    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu)
+    trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)
 
-    if args.on_gpu:
+    if on_gpu:
         trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
         # Test the root_gpu property
         assert trainer.root_gpu == hvd.local_rank()
@@ -90,4 +91,4 @@ def run_test_from_config(trainer_options):
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options))
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
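The original run_test_from_config reached for a module-level args that only exists after the __main__ block has called parse_args(); as soon as the new test_horovod_model_script imports the module and calls the function directly, that lookup raises NameError. Threading on_gpu (plus the new check_size escape hatch) through as parameters removes the hidden global. A stripped-down illustration of the failure mode, with hypothetical names:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--on-gpu', action='store_true', default=False)

    def run_before(config):
        # BUG: 'args' only exists when this file ran as a script, so
        # importing the module and calling this raises NameError
        return args.on_gpu  # noqa: F821

    def run_after(config, on_gpu=False):
        # FIX: pass the value in explicitly instead
        return on_gpu

    if __name__ == '__main__':
        args = parser.parse_args()
        print(run_after({}, args.on_gpu))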
@@ -24,6 +24,8 @@ from pytorch_lightning import Trainer
 from pytorch_lightning.utilities import device_parser
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 PRETEND_N_OF_GPUS = 16
 
@@ -41,8 +43,9 @@ def test_multi_gpu_none_backend(tmpdir):
         gpus=2,
     )
 
-    model = BoringModel()
-    tpipes.run_model_test(trainer_options, model, min_acc=0.20)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    tpipes.run_model_test(trainer_options, model, dm)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -85,6 +85,28 @@ class GenericValTestLossBoringModel(GenericParentValTestLossBoringModel[int]):
     pass
 
 
+class CustomClassificationModelDP(ClassificationModel):
+
+    def _step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        return {'logits': logits, 'y': y}
+
+    def training_step(self, batch, batch_idx):
+        out = self._step(batch, batch_idx)
+        loss = F.cross_entropy(out['logits'], out['y'])
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def test_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def validation_step_end(self, outputs):
+        self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))
+
+
 def test_model_properties_resume_from_checkpoint(tmpdir):
     """
     Test that properties like `current_epoch` and `global_step`
@@ -198,28 +220,6 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
 
     tutils.set_random_master_port()
 
-    class CustomClassificationModelDP(ClassificationModel):
-
-        def _step(self, batch, batch_idx):
-            x, y = batch
-            logits = self(x)
-            return {'logits': logits, 'y': y}
-
-        def training_step(self, batch, batch_idx):
-            _, y = batch
-            out = self._step(batch, batch_idx)
-            loss = F.cross_entropy(out['logits'], y)
-            return loss
-
-        def validation_step(self, batch, batch_idx):
-            return self._step(batch, batch_idx)
-
-        def test_step(self, batch, batch_idx):
-            return self._step(batch, batch_idx)
-
-        def validation_step_end(self, outputs):
-            self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))
-
     dm = ClassifDataModule()
     model = CustomClassificationModelDP(lr=0.1)
 
@@ -259,7 +259,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
         dataloaders = [dataloaders]
 
     for dataloader in dataloaders:
-        tpipes.run_prediction(pretrained_model, dataloader)
+        tpipes.run_prediction_eval_model_template(pretrained_model, dataloader)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -307,7 +307,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
         dataloaders = [dataloaders]
 
     for dataloader in dataloaders:
-        tpipes.run_prediction(pretrained_model, dataloader, min_acc=0.1)
+        tpipes.run_prediction_eval_model_template(pretrained_model, dataloader, min_acc=0.1)
 
 
 def test_running_test_pretrained_model_cpu(tmpdir):
@@ -398,7 +398,8 @@ def test_load_model_from_checkpoint(tmpdir, model_template):
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_dp_resume(tmpdir):
     """Make sure DP continues training correctly."""
-    model = BoringModel()
+    model = CustomClassificationModelDP(lr=0.1)
+    dm = ClassifDataModule()
 
     trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir)
 
@@ -416,7 +417,7 @@ def test_dp_resume(tmpdir):
     # fit model
     trainer = Trainer(**trainer_options)
     trainer.is_slurm_managing_tasks = True
-    trainer.fit(model)
+    trainer.fit(model, datamodule=dm)
 
     # track epoch before saving. Increment since we finished the current epoch, don't want to rerun
     real_global_epoch = trainer.current_epoch + 1
@@ -439,7 +440,7 @@ def test_dp_resume(tmpdir):
     trainer_options['max_epochs'] = 1
     new_trainer = Trainer(**trainer_options)
 
-    class CustomModel(BoringModel):
+    class CustomModel(CustomClassificationModelDP):
 
         def __init__(self):
             super().__init__()
@@ -451,19 +452,17 @@ def test_dp_resume(tmpdir):
 
             # if model and state loaded correctly, predictions will be good even though we
             # haven't trained with the new loaded model
-            dp_model = new_trainer.model
-            dp_model.eval()
+            new_trainer._running_stage = RunningStage.EVALUATING
 
             dataloader = self.train_dataloader()
-            tpipes.run_prediction(self.trainer.lightning_module, dataloader)
+            tpipes.run_prediction_eval_model_template(self.trainer.lightning_module, dataloader=dataloader)
             self.on_train_start_called = True
 
     # new model
     model = CustomModel()
 
     # fit new model which should load hpc weights
-    new_trainer.fit(model)
+    new_trainer.fit(model, datamodule=dm)
     assert model.on_train_start_called
 
     # test freeze on gpu
@@ -223,12 +223,19 @@ def test_tpu_grad_norm(tmpdir):
 @pl_multi_process_test
 def test_dataloaders_passed_to_fit(tmpdir):
     """Test if dataloaders passed to trainer works on TPU"""
 
     tutils.reset_seed()
     model = BoringModel()
 
-    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8)
-    trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        tpu_cores=8,
+    )
+    trainer.fit(
+        model,
+        train_dataloader=model.train_dataloader(),
+        val_dataloaders=model.val_dataloader(),
+    )
     assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
 
@@ -130,7 +130,7 @@ def test_multiple_val_dataloader(tmpdir):
 
     # make sure predictions are good for each val set
     for dataloader in trainer.val_dataloaders:
-        tpipes.run_prediction(trained_model=model, dataloader=dataloader)
+        tpipes.run_prediction_eval_model_template(trained_model=model, dataloader=dataloader)
 
 
 @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific'])
@@ -153,8 +153,8 @@ def test_multiple_test_dataloader(tmpdir, ckpt_path):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        limit_val_batches=0.1,
-        limit_train_batches=0.2,
+        limit_val_batches=10,
+        limit_train_batches=100,
     )
     trainer.fit(model)
     if ckpt_path == 'specific':
@@ -162,12 +162,11 @@ def test_multiple_test_dataloader(tmpdir, ckpt_path):
         trainer.test(ckpt_path=ckpt_path)
 
     # verify there are 2 test loaders
-    assert len(trainer.test_dataloaders) == 2, \
-        'Multiple test_dataloaders not initiated properly'
+    assert len(trainer.test_dataloaders) == 2, 'Multiple test_dataloaders not initiated properly'
 
     # make sure predictions are good for each test set
     for dataloader in trainer.test_dataloaders:
-        tpipes.run_prediction(trainer.model, dataloader)
+        tpipes.run_prediction_eval_model_template(trainer.model, dataloader)
 
     # run the test method
     trainer.test(ckpt_path=ckpt_path)
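Note the type change on the limits: limit_train_batches and limit_val_batches interpret a float as a fraction of the available batches and an int as an absolute batch count, so 0.2 of a resized dataset drifts while 100 stays fixed. Both forms, side by side:

    from pytorch_lightning import Trainer

    # float => fraction of the dataloader's batches per epoch
    fractional = Trainer(limit_train_batches=0.2, limit_val_batches=0.1)

    # int => absolute number of batches per epoch, independent of dataset size
    absolute = Trainer(limit_train_batches=100, limit_val_batches=10)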
@@ -229,8 +229,8 @@ def test_accumulation_and_early_stopping(tmpdir):
 def test_suggestion_parameters_work(tmpdir):
     """ Test that default skipping does not alter results in basic case """
 
-    hparams = EvalModelTemplate.get_default_hparams()
-    model = EvalModelTemplate(**hparams)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
 
     # logger file to get meta
     trainer = Trainer(
@@ -238,12 +238,11 @@ def test_suggestion_parameters_work(tmpdir):
         max_epochs=3,
     )
 
-    lrfinder = trainer.tuner.lr_find(model)
+    lrfinder = trainer.tuner.lr_find(model, datamodule=dm)
     lr1 = lrfinder.suggestion(skip_begin=10)  # default
-    lr2 = lrfinder.suggestion(skip_begin=80)  # way too high, should have an impact
+    lr2 = lrfinder.suggestion(skip_begin=150)  # way too high, should have an impact
 
-    assert lr1 != lr2, \
-        'Skipping parameter did not influence learning rate'
+    assert lr1 != lr2, 'Skipping parameter did not influence learning rate'
 
 
 def test_suggestion_with_non_finite_values(tmpdir):
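skip_begin tells the suggestion heuristic how many of the earliest points of the lr sweep to ignore before looking for the steepest drop in the loss curve; the deliberately-too-high value presumably moves from 80 to 150 because the sweep over the new datamodule records more points. A simplified stand-in for the heuristic (the real LRFinder.suggestion works on the recorded curve with numpy; this sketch uses plain differences):

    import torch

    def suggest_lr(lrs, losses, skip_begin=10, skip_end=1):
        # pick the lr where the loss falls fastest, ignoring the noisy
        # start (skip_begin) and tail (skip_end) of the sweep
        window = torch.tensor(losses[skip_begin:-skip_end])
        slope = window[1:] - window[:-1]
        idx = int(torch.argmin(slope)) + skip_begin
        return lrs[idx]

    lrs = [10 ** (-6 + 0.08 * i) for i in range(100)]
    losses = [2.0] * 40 + [2.0 - 0.09 * (i - 40) for i in range(40, 60)] \
        + [0.2 + 0.05 * (i - 60) for i in range(60, 100)]
    print(suggest_lr(lrs, losses))                 # lr from the steep region
    print(suggest_lr(lrs, losses, skip_begin=70))  # skipping too much skews the answer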
@@ -113,8 +113,8 @@ def test_lightning_getattr(tmpdir):
 
     for m in models:
         with pytest.raises(
-                AttributeError,
-                match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule."
+            AttributeError,
+            match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule."
         ):
             lightning_getattr(m, "this_attr_not_exist")
 
@@ -140,7 +140,7 @@ def test_lightning_setattr(tmpdir):
 
     for m in models:
         with pytest.raises(
-                AttributeError,
-                match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule."
+            AttributeError,
+            match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule."
         ):
             lightning_setattr(m, "this_attr_not_exist", None)