Parity test (#7832)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Kaushik B 2021-07-21 02:53:53 +05:30 committed by GitHub
parent ea13f6021c
commit 4c79b3a5b3
11 changed files with 113 additions and 29 deletions

View File

@@ -0,0 +1,27 @@
+name: GPU Parity testing
+on:
+  schedule:
+    - cron: "0 0 * * *"  # At the end of every day
+
+jobs:
+  parity-test:
+    timeoutInMinutes: 120
+    cancelTimeoutInMinutes: 2
+    pool: gridai-spot-pool
+    container:
+      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
+    workspace:
+      clean: all
+    steps:
+      - bash: |
+          python -m pytest benchmarks -v --durations=0
+        displayName: 'Testing: benchmarks'
+        env:
+          PL_RUNNING_BENCHMARKS: 1
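
To reproduce this CI job locally, one can opt in to the extended benchmarks the same way the pipeline does. A minimal sketch, assuming it is run from the repository root:

import os
import pytest

# Setting the flag before pytest collects the benchmarks enables the
# long-running cases (e.g. the CIFAR parity test added below), which are
# otherwise skipped.
os.environ["PL_RUNNING_BENCHMARKS"] = "1"
pytest.main(["benchmarks", "-v", "--durations=0"])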

View File

@@ -13,5 +13,6 @@
# limitations under the License.
import os
-BENCHMARK_ROOT = os.path.dirname(__file__)
-PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT)
+_BENCHMARK_ROOT = os.path.dirname(__file__)
+_PROJECT_ROOT = os.path.dirname(_BENCHMARK_ROOT)
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')

View File

@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
+import os
import time
import numpy as np
@@ -20,7 +21,11 @@ import torch
from tqdm import tqdm
from pytorch_lightning import LightningModule, seed_everything, Trainer
-from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN
+from tests.helpers.advanced_models import ParityModuleCIFAR, ParityModuleMNIST, ParityModuleRNN
+
+_EXTEND_BENCHMARKS = os.getenv("PL_RUNNING_BENCHMARKS", '0') == '1'
+_SHORT_BENCHMARKS = not _EXTEND_BENCHMARKS
+_MARK_SHORT_BM = pytest.mark.skipif(_SHORT_BENCHMARKS, reason="Only run during Benchmarking")
def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1):
@@ -43,20 +48,16 @@ def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: f
# ParityModuleMNIST runs with num_workers=1
@pytest.mark.parametrize(
-    'cls_model,max_diff_speed,max_diff_memory',
+    'cls_model,max_diff_speed,max_diff_memory,num_epochs,num_runs',
    [
-        (ParityModuleRNN, 0.05, 0.001),
-        (ParityModuleMNIST, 0.25, 0.001),  # todo: lower this thr
+        (ParityModuleRNN, 0.05, 0.001, 4, 3),
+        (ParityModuleMNIST, 0.25, 0.001, 4, 3),  # todo: lower this thr
+        pytest.param(ParityModuleCIFAR, 4.0, 0.0002, 2, 2, marks=_MARK_SHORT_BM),
    ]
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_pytorch_parity(
-    tmpdir,
-    cls_model: LightningModule,
-    max_diff_speed: float,
-    max_diff_memory: float,
-    num_epochs: int = 4,
-    num_runs: int = 3,
+    tmpdir, cls_model: LightningModule, max_diff_speed: float, max_diff_memory: float, num_epochs: int, num_runs: int
):
    """
    Verify that the same pytorch and lightning models achieve the same results
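
The bodies of the parity helpers are unchanged by this commit and not shown above. As a rough, hypothetical sketch of what a relative check with this signature can look like (the normalization and comparison are assumptions, not the repository's actual implementation):

import numpy as np

def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1):
    # Assumed behaviour: compare Lightning runs against the mean of the
    # plain-PyTorch runs, normalized so thresholds are scale-free.
    diffs = (np.asarray(pl_values) - np.mean(pt_values)) / norm_by
    assert (diffs < max_diff).all(), f"Lightning worse than PyTorch by {diffs} (allowed: {max_diff})"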

View File

@@ -29,7 +29,7 @@ import torchvision
import torchvision.models as models
import torchvision.transforms as T
-from pl_examples import cli_lightning_logo
+from pl_examples import _DATASETS_PATH, cli_lightning_logo
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.cli import LightningCLI
@@ -75,11 +75,13 @@ class CIFAR10DataModule(LightningDataModule):
    transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])

    def train_dataloader(self, *args, **kwargs):
-        trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=self.transform)
+        trainset = torchvision.datasets.CIFAR10(
+            root=_DATASETS_PATH, train=True, download=True, transform=self.transform
+        )
        return torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=0)

    def val_dataloader(self, *args, **kwargs):
-        valset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=self.transform)
+        valset = torchvision.datasets.CIFAR10(root=_DATASETS_PATH, train=False, download=True, transform=self.transform)
        return torch.utils.data.DataLoader(valset, batch_size=32, shuffle=True, num_workers=0)
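
`_DATASETS_PATH` is imported from the `pl_examples` package; its definition is not part of this diff, but presumably mirrors the `tests` package constants seen elsewhere in this commit, along these lines (an assumption for illustration only):

# Hypothetical pl_examples/__init__.py excerpt (not shown in this diff):
import os

_EXAMPLES_ROOT = os.path.dirname(__file__)
_PACKAGE_ROOT = os.path.dirname(_EXAMPLES_ROOT)
_DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets')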

View File

@@ -19,8 +19,8 @@ import numpy as np
_TEST_ROOT = os.path.dirname(__file__)
_PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
_TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp')
-PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
-PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
+_PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
# todo: this setting `PYTHONPATH` may not be used by other envs like Conda for import packages
if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):

View File

@@ -18,7 +18,7 @@ import torch.nn as nn
import torch.nn.functional as F

from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
from tests.base.model_optimizers import ConfigureOptimizersPool
from tests.base.model_test_dataloaders import TestDataloaderVariations
from tests.base.model_test_epoch_ends import TestEpochEndVariations
@@ -59,7 +59,7 @@ class EvalModelTemplate(
        in_features: int = 28 * 28,
        learning_rate: float = 0.001 * 8,
        optimizer_name: str = 'adam',
-        data_root: str = PATH_DATASETS,
+        data_root: str = _PATH_DATASETS,
        out_features: int = 10,
        hidden_dim: int = 1000,
        b1: float = 0.5,

@@ -131,7 +131,7 @@ class EvalModelTemplate(
        in_features=28 * 28,
        learning_rate=0.001 * 8,
        optimizer_name='adam',
-        data_root=PATH_DATASETS,
+        data_root=_PATH_DATASETS,
        out_features=10,
        hidden_dim=1000,
        b1=0.5,

View File

@@ -18,9 +18,9 @@ import sys
import pytest
from pytorch_lightning import Trainer
-from tests import PATH_LEGACY
+from tests import _PATH_LEGACY

-LEGACY_CHECKPOINTS_PATH = os.path.join(PATH_LEGACY, 'checkpoints')
+LEGACY_CHECKPOINTS_PATH = os.path.join(_PATH_LEGACY, 'checkpoints')
CHECKPOINT_EXTENSION = ".ckpt"

View File

@@ -16,11 +16,19 @@ import sys
import threading
from functools import partial, wraps
from http.server import SimpleHTTPRequestHandler
+from pathlib import Path

import pytest
import torch.distributed
import torch.multiprocessing as mp

+from tests import _PATH_DATASETS
+
+
+@pytest.fixture(scope="session")
+def datadir():
+    return Path(_PATH_DATASETS)
+
+
@pytest.fixture(scope="function", autouse=True)
def preserve_global_rank_variable():
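
A quick illustration of how the new session-scoped fixture is consumed: pytest injects it by argument name, so tests no longer need to import the dataset root themselves. The test below is hypothetical, not part of this commit:

def test_loads_mnist(datadir):
    # `datadir` is the Path(_PATH_DATASETS) provided by the fixture above.
    from tests.helpers.datasets import MNIST
    dataset = MNIST(root=str(datadir), train=True, download=True)
    assert len(dataset) > 0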

View File

@@ -19,9 +19,14 @@ import torch.nn.functional as F
from torch.utils.data import DataLoader

from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE
+from tests import _PATH_DATASETS
from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST

+if _TORCHVISION_AVAILABLE:
+    from torchvision import models, transforms
+    from torchvision.datasets import CIFAR10
+

class Generator(nn.Module):
@@ -155,7 +160,7 @@ class BasicGAN(LightningModule):
        return [opt_g, opt_d], []

    def train_dataloader(self):
-        return DataLoader(TrialMNIST(root=PATH_DATASETS, train=True, download=True), batch_size=16)
+        return DataLoader(TrialMNIST(root=_PATH_DATASETS, train=True, download=True), batch_size=16)


class ParityModuleRNN(LightningModule):

@@ -213,7 +218,47 @@ class ParityModuleMNIST(LightningModule):

    def train_dataloader(self):
        return DataLoader(MNIST(
-            root=PATH_DATASETS,
+            root=_PATH_DATASETS,
            train=True,
            download=True,
        ), batch_size=128, num_workers=1)
+
+
+class ParityModuleCIFAR(LightningModule):
+
+    def __init__(self, backbone="resnet101", hidden_dim=1024, learning_rate=1e-3, pretrained=True):
+        super().__init__()
+        self.save_hyperparameters()
+        self.learning_rate = learning_rate
+        self.num_classes = 10
+        self.backbone = getattr(models, backbone)(pretrained=pretrained)
+
+        self.classifier = torch.nn.Sequential(
+            torch.nn.Linear(1000, hidden_dim), torch.nn.Linear(hidden_dim, self.num_classes)
+        )
+        self.transform = transforms.Compose([
+            transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        ])
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        y_hat = self.classifier(y_hat)
+        loss = F.cross_entropy(y_hat, y)
+        return {'loss': loss}
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+
+    def train_dataloader(self):
+        return DataLoader(
+            CIFAR10(
+                root=_PATH_DATASETS,
+                train=True,
+                download=True,
+                transform=self.transform,
+            ),
+            batch_size=32,
+            num_workers=1
+        )
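
A sanity check one might run locally for the new module, sketched under the assumption that torchvision is installed; `resnet18` is substituted for the default backbone for speed, and random weights avoid a download:

import torch
from tests.helpers.advanced_models import ParityModuleCIFAR

model = ParityModuleCIFAR(backbone="resnet18", pretrained=False)
x = torch.rand(4, 3, 32, 32)                  # CIFAR-shaped batch
logits = model.classifier(model.backbone(x))  # same path as training_step
assert logits.shape == (4, 10)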

View File

@@ -154,7 +154,7 @@ class BoringModel(LightningModule):

class BoringDataModule(LightningDataModule):

-    def __init__(self, data_dir: str = './'):
+    def __init__(self, data_dir: str = "./"):
        super().__init__()
        self.data_dir = data_dir
        self.non_picklable = None

View File

@@ -16,14 +16,14 @@ import pickle

import cloudpickle
import pytest

-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST


@pytest.mark.parametrize(
    'dataset_cls,args', [
-        (MNIST, dict(root=PATH_DATASETS)),
-        (TrialMNIST, dict(root=PATH_DATASETS)),
+        (MNIST, dict(root=_PATH_DATASETS)),
+        (TrialMNIST, dict(root=_PATH_DATASETS)),
        (AverageDataset, {}),
    ]
)
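
The test body under this parametrization is outside the hunk; given the `pickle` and `cloudpickle` imports, it presumably round-trips each dataset instance. A minimal sketch of that idea (assumed, not the repository's exact code):

import pickle
import cloudpickle

def test_pickling_dataset(dataset_cls, args):
    dataset = dataset_cls(**args)
    # The instance must survive both stdlib pickle and cloudpickle.
    assert pickle.loads(pickle.dumps(dataset)) is not None
    assert cloudpickle.loads(cloudpickle.dumps(dataset)) is not None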