diff --git a/.azure-pipelines/gpu_benchmark.yml b/.azure-pipelines/gpu_benchmark.yml
new file mode 100644
index 0000000000..4ca1b531f8
--- /dev/null
+++ b/.azure-pipelines/gpu_benchmark.yml
@@ -0,0 +1,27 @@
+name: GPU Parity testing
+
+on:
+  schedule:
+    - cron: "0 0 * * *"  # At the end of every day
+
+jobs:
+  parity-test:
+    timeoutInMinutes: 120
+
+    cancelTimeoutInMinutes: 2
+
+    pool: gridai-spot-pool
+
+    container:
+      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
+
+    workspace:
+      clean: all
+
+    steps:
+    - bash: |
+        python -m pytest benchmarks -v --durations=0
+      displayName: 'Testing: benchmarks'
+      env:
+        PL_RUNNING_BENCHMARKS: 1
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 734288b072..73e7776f40
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -13,5 +13,6 @@
 # limitations under the License.
 import os
 
-BENCHMARK_ROOT = os.path.dirname(__file__)
-PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT)
+_BENCHMARK_ROOT = os.path.dirname(__file__)
+_PROJECT_ROOT = os.path.dirname(_BENCHMARK_ROOT)
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py
index e01d45e442..c665070312
--- a/benchmarks/test_basic_parity.py
+++ b/benchmarks/test_basic_parity.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gc
+import os
 import time
 
 import numpy as np
@@ -20,7 +21,11 @@ import torch
 from tqdm import tqdm
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
-from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN
+from tests.helpers.advanced_models import ParityModuleCIFAR, ParityModuleMNIST, ParityModuleRNN
+
+_EXTEND_BENCHMARKS = os.getenv("PL_RUNNING_BENCHMARKS", '0') == '1'
+_SHORT_BENCHMARKS = not _EXTEND_BENCHMARKS
+_MARK_SHORT_BM = pytest.mark.skipif(_SHORT_BENCHMARKS, reason="Only run during Benchmarking")
 
 
 def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1):
@@ -43,20 +48,16 @@ def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: f
 
 # ParityModuleMNIST runs with num_workers=1
 @pytest.mark.parametrize(
-    'cls_model,max_diff_speed,max_diff_memory',
+    'cls_model,max_diff_speed,max_diff_memory,num_epochs,num_runs',
     [
-        (ParityModuleRNN, 0.05, 0.001),
-        (ParityModuleMNIST, 0.25, 0.001),  # todo: lower this thr
+        (ParityModuleRNN, 0.05, 0.001, 4, 3),
+        (ParityModuleMNIST, 0.25, 0.001, 4, 3),  # todo: lower this thr
+        pytest.param(ParityModuleCIFAR, 4.0, 0.0002, 2, 2, marks=_MARK_SHORT_BM),
     ]
 )
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_pytorch_parity(
-    tmpdir,
-    cls_model: LightningModule,
-    max_diff_speed: float,
-    max_diff_memory: float,
-    num_epochs: int = 4,
-    num_runs: int = 3,
+    tmpdir, cls_model: LightningModule, max_diff_speed: float, max_diff_memory: float, num_epochs: int, num_runs: int
 ):
     """
     Verify that the same pytorch and lightning models achieve the same results
diff --git a/pl_examples/basic_examples/profiler_example.py b/pl_examples/basic_examples/profiler_example.py
index 688eb15ef9..03643aeeb8
--- a/pl_examples/basic_examples/profiler_example.py
+++ b/pl_examples/basic_examples/profiler_example.py
@@ -29,7 +29,7 @@ import torchvision
 import torchvision.models as models
 import torchvision.transforms as T
 
-from pl_examples import cli_lightning_logo
+from pl_examples import _DATASETS_PATH, cli_lightning_logo
 from pytorch_lightning import LightningDataModule, LightningModule
 from pytorch_lightning.utilities.cli import LightningCLI
@@ -75,11 +75,13 @@ class CIFAR10DataModule(LightningDataModule):
     transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
 
     def train_dataloader(self, *args, **kwargs):
-        trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=self.transform)
+        trainset = torchvision.datasets.CIFAR10(
+            root=_DATASETS_PATH, train=True, download=True, transform=self.transform
+        )
         return torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=0)
 
     def val_dataloader(self, *args, **kwargs):
-        valset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=self.transform)
+        valset = torchvision.datasets.CIFAR10(root=_DATASETS_PATH, train=False, download=True, transform=self.transform)
         return torch.utils.data.DataLoader(valset, batch_size=32, shuffle=True, num_workers=0)
diff --git a/tests/__init__.py b/tests/__init__.py
index fc634e6b73..09e1800789
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -19,8 +19,8 @@ import numpy as np
 _TEST_ROOT = os.path.dirname(__file__)
 _PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
 _TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp')
-PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
-PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
+_PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
 
 # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages
 if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):
diff --git a/tests/base/model_template.py b/tests/base/model_template.py
index 86578fef4c..bd3578f5cf
--- a/tests/base/model_template.py
+++ b/tests/base/model_template.py
@@ -18,7 +18,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
 from tests.base.model_optimizers import ConfigureOptimizersPool
 from tests.base.model_test_dataloaders import TestDataloaderVariations
 from tests.base.model_test_epoch_ends import TestEpochEndVariations
@@ -59,7 +59,7 @@ class EvalModelTemplate(
         in_features: int = 28 * 28,
         learning_rate: float = 0.001 * 8,
         optimizer_name: str = 'adam',
-        data_root: str = PATH_DATASETS,
+        data_root: str = _PATH_DATASETS,
         out_features: int = 10,
         hidden_dim: int = 1000,
         b1: float = 0.5,
@@ -131,7 +131,7 @@ class EvalModelTemplate(
             in_features=28 * 28,
             learning_rate=0.001 * 8,
             optimizer_name='adam',
-            data_root=PATH_DATASETS,
+            data_root=_PATH_DATASETS,
             out_features=10,
             hidden_dim=1000,
             b1=0.5,
diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py
index 13ae730037..231a1d7252
--- a/tests/checkpointing/test_legacy_checkpoints.py
+++ b/tests/checkpointing/test_legacy_checkpoints.py
@@ -18,9 +18,9 @@ import sys
 import pytest
 
 from pytorch_lightning import Trainer
-from tests import PATH_LEGACY
+from tests import _PATH_LEGACY
 
-LEGACY_CHECKPOINTS_PATH = os.path.join(PATH_LEGACY, 'checkpoints')
+LEGACY_CHECKPOINTS_PATH = os.path.join(_PATH_LEGACY, 'checkpoints')
 CHECKPOINT_EXTENSION = ".ckpt"
diff --git a/tests/conftest.py b/tests/conftest.py
index 3f767d8b6f..0519a5ad30
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,11 +16,19 @@ import sys
 import threading
 from functools import partial, wraps
 from http.server import SimpleHTTPRequestHandler
+from pathlib import Path
 
 import pytest
 import torch.distributed
 import torch.multiprocessing as mp
 
+from tests import _PATH_DATASETS
+
+
+@pytest.fixture(scope="session")
+def datadir():
+    return Path(_PATH_DATASETS)
+
 
 @pytest.fixture(scope="function", autouse=True)
 def preserve_global_rank_variable():
diff --git a/tests/helpers/advanced_models.py b/tests/helpers/advanced_models.py
index 8f3b9663aa..c9c2c18648
--- a/tests/helpers/advanced_models.py
+++ b/tests/helpers/advanced_models.py
@@ -19,9 +19,14 @@ import torch.nn.functional as F
 from torch.utils.data import DataLoader
 
 from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE
+from tests import _PATH_DATASETS
 from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST
 
+if _TORCHVISION_AVAILABLE:
+    from torchvision import models, transforms
+    from torchvision.datasets import CIFAR10
+
 
 class Generator(nn.Module):
@@ -155,7 +160,7 @@ class BasicGAN(LightningModule):
         return [opt_g, opt_d], []
 
     def train_dataloader(self):
-        return DataLoader(TrialMNIST(root=PATH_DATASETS, train=True, download=True), batch_size=16)
+        return DataLoader(TrialMNIST(root=_PATH_DATASETS, train=True, download=True), batch_size=16)
 
 
 class ParityModuleRNN(LightningModule):
@@ -213,7 +218,47 @@ class ParityModuleMNIST(LightningModule):
 
     def train_dataloader(self):
         return DataLoader(MNIST(
-            root=PATH_DATASETS,
+            root=_PATH_DATASETS,
             train=True,
             download=True,
         ), batch_size=128, num_workers=1)
+
+
+class ParityModuleCIFAR(LightningModule):
+
+    def __init__(self, backbone="resnet101", hidden_dim=1024, learning_rate=1e-3, pretrained=True):
+        super().__init__()
+        self.save_hyperparameters()
+
+        self.learning_rate = learning_rate
+        self.num_classes = 10
+        self.backbone = getattr(models, backbone)(pretrained=pretrained)
+
+        self.classifier = torch.nn.Sequential(
+            torch.nn.Linear(1000, hidden_dim), torch.nn.Linear(hidden_dim, self.num_classes)
+        )
+        self.transform = transforms.Compose([
+            transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        ])
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        y_hat = self.classifier(y_hat)
+        loss = F.cross_entropy(y_hat, y)
+        return {'loss': loss}
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+
+    def train_dataloader(self):
+        return DataLoader(
+            CIFAR10(
+                root=_PATH_DATASETS,
+                train=True,
+                download=True,
+                transform=self.transform,
+            ),
+            batch_size=32,
+            num_workers=1
+        )
diff --git a/tests/helpers/boring_model.py b/tests/helpers/boring_model.py
index 185baac51f..c1ef8de045
--- a/tests/helpers/boring_model.py
+++ b/tests/helpers/boring_model.py
@@ -154,7 +154,7 @@ class BoringModel(LightningModule):
 
 class BoringDataModule(LightningDataModule):
 
-    def __init__(self, data_dir: str = './'):
+    def __init__(self, data_dir: str = "./"):
         super().__init__()
         self.data_dir = data_dir
         self.non_picklable = None
diff --git a/tests/helpers/test_datasets.py b/tests/helpers/test_datasets.py
index 6f48b1b714..b6652608bd
--- a/tests/helpers/test_datasets.py
+++ b/tests/helpers/test_datasets.py
@@ -16,14 +16,14 @@ import pickle
 import cloudpickle
 import pytest
 
-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
 from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST
 
 
 @pytest.mark.parametrize(
     'dataset_cls,args', [
-        (MNIST, dict(root=PATH_DATASETS)),
-        (TrialMNIST, dict(root=PATH_DATASETS)),
+        (MNIST, dict(root=_PATH_DATASETS)),
+        (TrialMNIST, dict(root=_PATH_DATASETS)),
         (AverageDataset, {}),
     ]
 )
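
A minimal sketch of how the extended parity benchmark can be invoked locally, assuming a GPU machine and the repository root as the working directory; it mirrors the bash step in the Azure job above, where the env block sets the same flag:

    # opt in to the CIFAR parity case; without this flag it is skipped via _MARK_SHORT_BM
    PL_RUNNING_BENCHMARKS=1 python -m pytest benchmarks -v --durations=0

The ParityModuleRNN and ParityModuleMNIST cases run either way; only the slower ParityModuleCIFAR case is gated behind PL_RUNNING_BENCHMARKS.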