diff --git a/.azure-pipelines/gpu_benchmark.yml b/.azure-pipelines/gpu_benchmark.yml
new file mode 100644
index 0000000000..4ca1b531f8
--- /dev/null
+++ b/.azure-pipelines/gpu_benchmark.yml
@@ -0,0 +1,27 @@
+name: GPU Parity testing
+
+on:
+  schedule:
+    - cron: "0 0 * * *"  # At the end of every day
+
+jobs:
+  parity-test:
+    timeoutInMinutes: 120
+
+    cancelTimeoutInMinutes: 2
+
+    pool: gridai-spot-pool
+
+    container:
+      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
+
+    workspace:
+      clean: all
+
+    steps:
+    - bash: |
+        python -m pytest benchmarks -v --durations=0
+      displayName: 'Testing: benchmarks'
+      env:
+        PL_RUNNING_BENCHMARKS: 1
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 734288b072..73e7776f40
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -13,5 +13,6 @@
 # limitations under the License.
 import os
 
-BENCHMARK_ROOT = os.path.dirname(__file__)
-PROJECT_ROOT = os.path.dirname(BENCHMARK_ROOT)
+_BENCHMARK_ROOT = os.path.dirname(__file__)
+_PROJECT_ROOT = os.path.dirname(_BENCHMARK_ROOT)
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py
index e01d45e442..c665070312
--- a/benchmarks/test_basic_parity.py
+++ b/benchmarks/test_basic_parity.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import gc
+import os
 import time
 
 import numpy as np
@@ -20,7 +21,11 @@ import torch
 from tqdm import tqdm
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
-from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN
+from tests.helpers.advanced_models import ParityModuleCIFAR, ParityModuleMNIST, ParityModuleRNN
+
+_EXTEND_BENCHMARKS = os.getenv("PL_RUNNING_BENCHMARKS", '0') == '1'
+_SHORT_BENCHMARKS = not _EXTEND_BENCHMARKS
+_MARK_SHORT_BM = pytest.mark.skipif(_SHORT_BENCHMARKS, reason="Only run during Benchmarking")
 
 
 def assert_parity_relative(pl_values, pt_values, norm_by: float = 1, max_diff: float = 0.1):
@@ -43,20 +48,16 @@ def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: f
 
 # ParityModuleMNIST runs with num_workers=1
 @pytest.mark.parametrize(
-    'cls_model,max_diff_speed,max_diff_memory',
+    'cls_model,max_diff_speed,max_diff_memory,num_epochs,num_runs',
     [
-        (ParityModuleRNN, 0.05, 0.001),
-        (ParityModuleMNIST, 0.25, 0.001),  # todo: lower this thr
+        (ParityModuleRNN, 0.05, 0.001, 4, 3),
+        (ParityModuleMNIST, 0.25, 0.001, 4, 3),  # todo: lower this thr
+        pytest.param(ParityModuleCIFAR, 4.0, 0.0002, 2, 2, marks=_MARK_SHORT_BM),
     ]
 )
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_pytorch_parity(
-    tmpdir,
-    cls_model: LightningModule,
-    max_diff_speed: float,
-    max_diff_memory: float,
-    num_epochs: int = 4,
-    num_runs: int = 3,
+    tmpdir, cls_model: LightningModule, max_diff_speed: float, max_diff_memory: float, num_epochs: int, num_runs: int
 ):
     """
     Verify that the same pytorch and lightning models achieve the same results
diff --git a/pl_examples/basic_examples/profiler_example.py b/pl_examples/basic_examples/profiler_example.py
index 688eb15ef9..03643aeeb8
--- a/pl_examples/basic_examples/profiler_example.py
+++ b/pl_examples/basic_examples/profiler_example.py
@@ -29,7 +29,7 @@ import torchvision
 import torchvision.models as models
 import torchvision.transforms as T
 
-from pl_examples import cli_lightning_logo
+from pl_examples import _DATASETS_PATH, cli_lightning_logo
 from pytorch_lightning import LightningDataModule, LightningModule
 from pytorch_lightning.utilities.cli import LightningCLI
@@ -75,11 +75,13 @@ class CIFAR10DataModule(LightningDataModule):
     transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
 
     def train_dataloader(self, *args, **kwargs):
-        trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=self.transform)
+        trainset = torchvision.datasets.CIFAR10(
+            root=_DATASETS_PATH, train=True, download=True, transform=self.transform
+        )
         return torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=0)
 
     def val_dataloader(self, *args, **kwargs):
-        valset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=self.transform)
+        valset = torchvision.datasets.CIFAR10(root=_DATASETS_PATH, train=False, download=True, transform=self.transform)
         return torch.utils.data.DataLoader(valset, batch_size=32, shuffle=True, num_workers=0)
diff --git a/tests/__init__.py b/tests/__init__.py
index fc634e6b73..09e1800789
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -19,8 +19,8 @@ import numpy as np
 _TEST_ROOT = os.path.dirname(__file__)
 _PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
 _TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp')
-PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
-PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
+_PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets')
+_PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy')
 
 # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages
 if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""):
diff --git a/tests/base/model_template.py b/tests/base/model_template.py
index 86578fef4c..bd3578f5cf
--- a/tests/base/model_template.py
+++ b/tests/base/model_template.py
@@ -18,7 +18,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
 from tests.base.model_optimizers import ConfigureOptimizersPool
 from tests.base.model_test_dataloaders import TestDataloaderVariations
 from tests.base.model_test_epoch_ends import TestEpochEndVariations
@@ -59,7 +59,7 @@ class EvalModelTemplate(
         in_features: int = 28 * 28,
         learning_rate: float = 0.001 * 8,
         optimizer_name: str = 'adam',
-        data_root: str = PATH_DATASETS,
+        data_root: str = _PATH_DATASETS,
         out_features: int = 10,
         hidden_dim: int = 1000,
         b1: float = 0.5,
@@ -131,7 +131,7 @@ class EvalModelTemplate(
             in_features=28 * 28,
             learning_rate=0.001 * 8,
             optimizer_name='adam',
-            data_root=PATH_DATASETS,
+            data_root=_PATH_DATASETS,
             out_features=10,
             hidden_dim=1000,
             b1=0.5,
diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py
index 13ae730037..231a1d7252
--- a/tests/checkpointing/test_legacy_checkpoints.py
+++ b/tests/checkpointing/test_legacy_checkpoints.py
@@ -18,9 +18,9 @@ import sys
 import pytest
 
 from pytorch_lightning import Trainer
-from tests import PATH_LEGACY
+from tests import _PATH_LEGACY
 
-LEGACY_CHECKPOINTS_PATH = os.path.join(PATH_LEGACY, 'checkpoints')
+LEGACY_CHECKPOINTS_PATH = os.path.join(_PATH_LEGACY, 'checkpoints')
 CHECKPOINT_EXTENSION = ".ckpt"
diff --git a/tests/conftest.py b/tests/conftest.py
index 3f767d8b6f..0519a5ad30
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,11 +16,19 @@ import sys
 import threading
 from functools import partial, wraps
 from http.server import SimpleHTTPRequestHandler
+from pathlib import Path
 
 import pytest
 import torch.distributed
 import torch.multiprocessing as mp
 
+from tests import _PATH_DATASETS
+
+
+@pytest.fixture(scope="session")
+def datadir():
+    return Path(_PATH_DATASETS)
+
 
 @pytest.fixture(scope="function", autouse=True)
 def preserve_global_rank_variable():
diff --git a/tests/helpers/advanced_models.py b/tests/helpers/advanced_models.py
index 8f3b9663aa..c9c2c18648
--- a/tests/helpers/advanced_models.py
+++ b/tests/helpers/advanced_models.py
@@ -19,9 +19,14 @@ import torch.nn.functional as F
 from torch.utils.data import DataLoader
 
 from pytorch_lightning.core.lightning import LightningModule
-from tests import PATH_DATASETS
+from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE
+from tests import _PATH_DATASETS
 from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST
 
+if _TORCHVISION_AVAILABLE:
+    from torchvision import models, transforms
+    from torchvision.datasets import CIFAR10
+
 
 class Generator(nn.Module):
@@ -155,7 +160,7 @@ class BasicGAN(LightningModule):
         return [opt_g, opt_d], []
 
     def train_dataloader(self):
-        return DataLoader(TrialMNIST(root=PATH_DATASETS, train=True, download=True), batch_size=16)
+        return DataLoader(TrialMNIST(root=_PATH_DATASETS, train=True, download=True), batch_size=16)
 
 
 class ParityModuleRNN(LightningModule):
@@ -213,7 +218,47 @@ class ParityModuleMNIST(LightningModule):
 
     def train_dataloader(self):
         return DataLoader(MNIST(
-            root=PATH_DATASETS,
+            root=_PATH_DATASETS,
             train=True,
             download=True,
         ), batch_size=128, num_workers=1)
+
+
+class ParityModuleCIFAR(LightningModule):
+
+    def __init__(self, backbone="resnet101", hidden_dim=1024, learning_rate=1e-3, pretrained=True):
+        super().__init__()
+        self.save_hyperparameters()
+
+        self.learning_rate = learning_rate
+        self.num_classes = 10
+        self.backbone = getattr(models, backbone)(pretrained=pretrained)
+
+        self.classifier = torch.nn.Sequential(
+            torch.nn.Linear(1000, hidden_dim), torch.nn.Linear(hidden_dim, self.num_classes)
+        )
+        self.transform = transforms.Compose([
+            transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        ])
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self.backbone(x)
+        y_hat = self.classifier(y_hat)
+        loss = F.cross_entropy(y_hat, y)
+        return {'loss': loss}
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+
+    def train_dataloader(self):
+        return DataLoader(
+            CIFAR10(
+                root=_PATH_DATASETS,
+                train=True,
+                download=True,
+                transform=self.transform,
+            ),
+            batch_size=32,
+            num_workers=1
+        )
diff --git a/tests/helpers/boring_model.py b/tests/helpers/boring_model.py
index 185baac51f..c1ef8de045
--- a/tests/helpers/boring_model.py
+++ b/tests/helpers/boring_model.py
@@ -154,7 +154,7 @@ class BoringModel(LightningModule):
 
 class BoringDataModule(LightningDataModule):
 
-    def __init__(self, data_dir: str = './'):
+    def __init__(self, data_dir: str = "./"):
         super().__init__()
         self.data_dir = data_dir
         self.non_picklable = None
diff --git a/tests/helpers/test_datasets.py b/tests/helpers/test_datasets.py
index 6f48b1b714..b6652608bd
--- a/tests/helpers/test_datasets.py
+++ b/tests/helpers/test_datasets.py
@@ -16,14 +16,14 @@ import pickle
 import cloudpickle
 import pytest
 
-from tests import PATH_DATASETS
+from tests import _PATH_DATASETS
 from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST
 
 
 @pytest.mark.parametrize(
     'dataset_cls,args', [
-        (MNIST, dict(root=PATH_DATASETS)),
-        (TrialMNIST, dict(root=PATH_DATASETS)),
+        (MNIST, dict(root=_PATH_DATASETS)),
+        (TrialMNIST, dict(root=_PATH_DATASETS)),
         (AverageDataset, {}),
     ]
 )
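
A minimal sketch of how the extended parity benchmark can be invoked locally, assuming a GPU machine and the repository root as the working directory; it mirrors the bash step in the Azure job above, where the env block sets the same flag:

    # opt in to the CIFAR parity case; without this flag it is skipped via _MARK_SHORT_BM
    PL_RUNNING_BENCHMARKS=1 python -m pytest benchmarks -v --durations=0

The ParityModuleRNN and ParityModuleMNIST cases run either way; only the slower ParityModuleCIFAR case is gated behind PL_RUNNING_BENCHMARKS.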