diff --git a/.gitignore b/.gitignore index 8a81bf71ea..59fb3269ce 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,6 @@ docs/source/pl_examples*.rst docs/source/pytorch_lightning*.rst docs/source/tests*.rst docs/source/*.md -tests/tests/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 20461f0e66..14f9c010fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed - Removed duplicated module `pytorch_lightning.utilities.arg_parse` for loading CLI arguments ([#1167](https://github.com/PyTorchLightning/pytorch-lightning/issues/1167)) +- Dropped `torchvision` dependency in tests and added own MNIST dataset class instead ([#986](https://github.com/PyTorchLightning/pytorch-lightning/issues/986)) ### Fixed diff --git a/environment.yml b/environment.yml index 80b3fca2fe..45e0e3da30 100644 --- a/environment.yml +++ b/environment.yml @@ -15,7 +15,6 @@ dependencies: - future>=0.17.1 # For dev and testing - - torchvision>=0.4.0 - tox - coverage - codecov @@ -26,7 +25,6 @@ dependencies: - autopep8 - check-manifest - twine==1.13.0 - - pillow<7.0.0 - pip: - test-tube>=0.7.5 diff --git a/tests/base/datasets.py b/tests/base/datasets.py new file mode 100644 index 0000000000..0fbdfae6f2 --- /dev/null +++ b/tests/base/datasets.py @@ -0,0 +1,107 @@ +import logging +import os +import urllib.request +from typing import Tuple + +import torch +from torch import Tensor +from torch.utils.data import Dataset + + +class MNIST(Dataset): + """ + Customized `MNIST `_ dataset for testing Pytorch Lightning + without the torchvision dependency. + + Part of the code was copied from + https://github.com/pytorch/vision/blob/build/v0.5.0/torchvision/datasets/mnist.py + + Args: + root: Root directory of dataset where ``MNIST/processed/training.pt`` + and ``MNIST/processed/test.pt`` exist. + train: If ``True``, creates dataset from ``training.pt``, + otherwise from ``test.pt``. + normalize: mean and std deviation of the MNIST dataset. + download: If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + """ + + RESOURCES = ( + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/training.pt", + "https://pl-public-data.s3.amazonaws.com/MNIST/processed/test.pt", + ) + + TRAIN_FILE_NAME = 'training.pt' + TEST_FILE_NAME = 'test.pt' + + def __init__(self, root: str, train: bool = True, normalize: tuple = (0.5, 1.0), download: bool = False): + super(MNIST, self).__init__() + self.root = root + self.train = train # training set or test set + self.normalize = normalize + + if download: + self.download() + + if not self._check_exists(): + raise RuntimeError('Dataset not found.') + + data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME + self.data, self.targets = torch.load(os.path.join(self.processed_folder, data_file)) + + def __getitem__(self, idx: int) -> Tuple[Tensor, int]: + img = self.data[idx].float().unsqueeze(0) + target = int(self.targets[idx]) + + if self.normalize is not None: + img = normalize_tensor(img, mean=self.normalize[0], std=self.normalize[1]) + + return img, target + + def __len__(self) -> int: + return len(self.data) + + @property + def processed_folder(self) -> str: + return os.path.join(self.root, 'MNIST', 'processed') + + def _check_exists(self) -> bool: + train_file = os.path.join(self.processed_folder, self.TRAIN_FILE_NAME) + test_file = os.path.join(self.processed_folder, self.TEST_FILE_NAME) + return os.path.isfile(train_file) and os.path.isfile(test_file) + + def download(self) -> None: + """Download the MNIST data if it doesn't exist in processed_folder already.""" + + if self._check_exists(): + return + + os.makedirs(self.processed_folder, exist_ok=True) + + for url in self.RESOURCES: + logging.info(f'Downloading {url}') + fpath = os.path.join(self.processed_folder, os.path.basename(url)) + urllib.request.urlretrieve(url, fpath) + + +def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: + tensor = tensor.clone() + mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) + std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) + tensor.sub_(mean).div_(std) + return tensor + + +class TestingMNIST(MNIST): + + def __init__(self, root, train=True, normalize=(0.5, 1.0), download=False, num_samples=8000): + super().__init__( + root, + train=train, + normalize=normalize, + download=download + ) + # take just a subset of MNIST dataset + self.data = self.data[:num_samples] + self.targets = self.targets[:num_samples] diff --git a/tests/base/debug.py b/tests/base/debug.py index 59abdd8503..b601982d28 100644 --- a/tests/base/debug.py +++ b/tests/base/debug.py @@ -1,9 +1,9 @@ import torch from torch.nn import functional as F from torch.utils.data import DataLoader -from torchvision.datasets import MNIST import pytorch_lightning as pl +from tests.base.datasets import MNIST # from test_models import assert_ok_test_acc, load_model, \ diff --git a/tests/base/models.py b/tests/base/models.py index 1b60f3b20c..00baed58a2 100644 --- a/tests/base/models.py +++ b/tests/base/models.py @@ -7,8 +7,8 @@ import torch.nn as nn import torch.nn.functional as F from torch import optim from torch.utils.data import DataLoader -from torchvision import transforms -from torchvision.datasets import MNIST + +from tests.base.datasets import TestingMNIST try: from test_tube import HyperOptArgumentParser @@ -18,29 +18,6 @@ except ImportError: from pytorch_lightning.core.lightning import LightningModule -# TODO: remove after getting own MNIST -# TEMPORAL FIX, https://github.com/pytorch/vision/issues/1938 -import urllib.request -opener = urllib.request.build_opener() -opener.addheaders = [('User-agent', 'Mozilla/5.0')] -urllib.request.install_opener(opener) - - -class TestingMNIST(MNIST): - - def __init__(self, root, train=True, transform=None, target_transform=None, - download=False, num_samples=8000): - super().__init__( - root, - train=train, - transform=transform, - target_transform=target_transform, - download=download - ) - # take just a subset of MNIST dataset - self.data = self.data[:num_samples] - self.targets = self.targets[:num_samples] - class DictHparamsModel(LightningModule): @@ -61,8 +38,7 @@ class DictHparamsModel(LightningModule): return torch.optim.Adam(self.parameters(), lr=0.02) def train_dataloader(self): - return DataLoader(TestingMNIST(os.getcwd(), train=True, download=True, - transform=transforms.ToTensor()), batch_size=32) + return DataLoader(TestingMNIST(os.getcwd(), train=True, download=True), batch_size=32) class TestModelBase(LightningModule): @@ -178,17 +154,13 @@ class TestModelBase(LightningModule): return [optimizer], [scheduler] def prepare_data(self): - transform = transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.5,), (1.0,))]) _ = TestingMNIST(root=self.hparams.data_root, train=True, - transform=transform, download=True, num_samples=2000) + download=True, num_samples=2000) def _dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.5,), (1.0,))]) dataset = TestingMNIST(root=self.hparams.data_root, train=train, - transform=transform, download=False, num_samples=2000) + download=False, num_samples=2000) # when using multi-node we need to add the datasampler batch_size = self.hparams.batch_size diff --git a/tests/datasets/__init__.py b/tests/datasets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/requirements.txt b/tests/requirements.txt index b82220b764..e93ccb51d7 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,7 +2,6 @@ -r ../requirements-extra.txt # extended list of dependencies dor development and run lint and tests -torchvision>=0.4.0, < 0.5 # the 0.5. has some issues with torch JIT tox coverage codecov @@ -11,5 +10,4 @@ pytest-cov pytest-flake8 flake8 check-manifest -twine==1.13.0 -pillow<7.0.0 \ No newline at end of file +twine==1.13.0 \ No newline at end of file