2020-03-30 22:25:37 +00:00
|
|
|
import logging
|
|
|
|
import os
|
2020-07-27 23:07:09 +00:00
|
|
|
import random
|
|
|
|
import time
|
2020-08-06 14:58:51 +00:00
|
|
|
import urllib.request
|
2020-04-02 16:28:44 +00:00
|
|
|
from typing import Tuple, Optional, Sequence
|
2020-03-30 22:25:37 +00:00
|
|
|
|
|
|
|
import torch
|
|
|
|
from torch import Tensor
|
|
|
|
from torch.utils.data import Dataset
|
|
|
|
|
2020-04-22 00:33:10 +00:00
|
|
|
from tests import PACKAGE_ROOT
|
2020-04-02 16:28:44 +00:00
|
|
|
|
|
|
|
#: local path to test datasets
|
2020-04-22 00:33:10 +00:00
|
|
|
#: Default ``root`` of the dataset classes in this module; a ``MNIST`` sub-folder is created inside it on download.
PATH_DATASETS = os.path.join(PACKAGE_ROOT, 'Datasets')
|
2020-04-02 16:28:44 +00:00
|
|
|
|
2020-03-30 22:25:37 +00:00
|
|
|
|
|
|
|
class MNIST(Dataset):
    """
    Customized `MNIST <http://yann.lecun.com/exdb/mnist/>`_ dataset for testing Pytorch Lightning
    without the torchvision dependency.

    Part of the code was copied from
    https://github.com/pytorch/vision/blob/build/v0.5.0/torchvision/datasets/mnist.py

    Args:
        root: Root directory of dataset where ``MNIST/complete/training.pt``
            and ``MNIST/complete/test.pt`` exist (or where they get downloaded to).
        train: If ``True``, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        normalize: ``(mean, std)`` used to normalize every returned image,
            or ``None`` to return the images without normalization.
        download: If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    Raises:
        RuntimeError: if the dataset files are still missing after the preparation step.

    Examples:
        >>> dataset = MNIST(download=True)
        >>> len(dataset)
        60000
        >>> torch.bincount(dataset.targets)
        tensor([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949])
    """

    RESOURCES = (
        "https://pl-public-data.s3.amazonaws.com/MNIST/processed/training.pt",
        "https://pl-public-data.s3.amazonaws.com/MNIST/processed/test.pt",
    )

    TRAIN_FILE_NAME = 'training.pt'
    TEST_FILE_NAME = 'test.pt'
    # name of the sub-folder of ``<root>/MNIST`` which holds the data files
    cache_folder_name = 'complete'

    def __init__(self, root: str = PATH_DATASETS, train: bool = True,
                 normalize: Optional[Tuple[float, float]] = (0.5, 1.0), download: bool = True):
        super().__init__()
        self.root = root
        self.train = train  # training set or test set
        self.normalize = normalize

        self.prepare_data(download)

        if not self._check_exists(self.cached_folder_path):
            raise RuntimeError('Dataset not found.')

        data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME
        self.data, self.targets = _try_load(os.path.join(self.cached_folder_path, data_file))

    def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
        """Return the ``idx``-th image (as float, with a leading dimension added) and its label."""
        img = self.data[idx].float().unsqueeze(0)
        target = int(self.targets[idx])

        if self.normalize is not None:
            img = normalize_tensor(img, mean=self.normalize[0], std=self.normalize[1])

        return img, target

    def __len__(self) -> int:
        return len(self.data)

    @property
    def cached_folder_path(self) -> str:
        """Folder with the data files: ``<root>/MNIST/<cache_folder_name>``."""
        return os.path.join(self.root, 'MNIST', self.cache_folder_name)

    def _check_exists(self, data_folder: str) -> bool:
        """Tell whether both the training and the test file are present in ``data_folder``."""
        return all(
            os.path.isfile(os.path.join(data_folder, fname))
            for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME)
        )

    def prepare_data(self, download: bool) -> None:
        """Download the data files to the cache folder when ``download`` is requested."""
        if download:
            self._download(self.cached_folder_path)

    def _download(self, data_folder: str) -> None:
        """Download the MNIST data if it doesn't exist in ``data_folder`` already."""
        if self._check_exists(data_folder):
            return

        os.makedirs(data_folder, exist_ok=True)

        for url in self.RESOURCES:
            logging.info(f'Downloading {url}')
            fpath = os.path.join(data_folder, os.path.basename(url))
            # Fetch into a process-unique temporary file and publish it with a rename, so an
            # interrupted download never leaves a truncated file which `_check_exists` would
            # accept as a valid dataset on the next run.
            tmp_path = f'{fpath}.{os.getpid()}.part'
            try:
                urllib.request.urlretrieve(url, tmp_path)
                os.replace(tmp_path, fpath)
            finally:
                # only present if the download (or the rename) failed
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
|
|
|
|
|
|
|
|
|
2020-07-27 23:07:09 +00:00
|
|
|
def _try_load(path_data, trials: int = 30, delta: float = 1.):
    """Load a file with ``torch.load``, retrying when several concurrent processes touch it at the same time.

    Args:
        path_data: path to the file to be loaded
        trials: maximal number of loading attempts, has to be at least 1
        delta: upper bound (in seconds) of the random pause between two attempts

    Returns:
        The object returned by ``torch.load``.

    Raises:
        AssertionError: if ``trials`` is not positive or ``path_data`` is not an existing file
        Exception: the error of the last attempt when all attempts failed
    """
    # a non-positive count would skip the loop entirely and silently return nothing
    assert trials and trials > 0, "at least some trial has to be set"
    assert os.path.isfile(path_data), 'missing file: %s' % path_data
    exp = None
    for attempt in range(trials):
        try:
            return torch.load(path_data)
        # broad on purpose: a file being written by another process can fail in several ways
        except Exception as ex:
            exp = ex
            # pause only if there is another attempt to come
            if attempt + 1 < trials:
                time.sleep(delta * random.random())
    # every attempt failed, raise the last caught exception
    raise exp
|
|
|
|
|
|
|
|
|
2020-03-30 22:25:37 +00:00
|
|
|
def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor:
    """Return a normalized copy of ``tensor``: ``(tensor - mean) / std``.

    The input is left untouched; the arithmetic is done in-place on a clone, with
    ``mean`` and ``std`` converted to the dtype and device of the input.
    """
    result = tensor.clone()
    like_input = dict(dtype=result.dtype, device=result.device)
    shift = torch.as_tensor(mean, **like_input)
    scale = torch.as_tensor(std, **like_input)
    result.sub_(shift)
    result.div_(scale)
    return result
|
|
|
|
|
|
|
|
|
2020-04-16 02:16:40 +00:00
|
|
|
class TrialMNIST(MNIST):
    """Constrain image dataset

    A small subset of MNIST: the first ``num_samples`` examples of each selected digit.
    The subset is built from the complete dataset (``MNIST/complete``) and cached in its own
    folder ``MNIST/digits-<selected digits>_nb-<num_samples>``.

    Args:
        root: Root directory of dataset where ``MNIST/complete/training.pt``
            and ``MNIST/complete/test.pt`` exist (or where they get downloaded to).
        train: If ``True``, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
        normalize: ``(mean, std)`` used to normalize every returned image,
            or ``None`` to return the images without normalization.
        download: If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        num_samples: number of examples per selected class/digit
        digits: list selected MNIST digits/classes, all ten digits are used if empty or ``None``

    Examples:
        >>> dataset = TrialMNIST(download=True)
        >>> len(dataset)
        300
        >>> sorted(set([d.item() for d in dataset.targets]))
        [0, 1, 2]
        >>> torch.bincount(dataset.targets)
        tensor([100, 100, 100])
    """

    def __init__(
            self,
            root: str = PATH_DATASETS,
            train: bool = True,
            normalize: Optional[Tuple[float, float]] = (0.5, 1.0),
            download: bool = False,
            num_samples: int = 100,
            digits: Optional[Sequence] = (0, 1, 2),
    ):
        # The attributes below have to be set before the parent constructor runs,
        # because it calls `prepare_data`, which already reads them.

        # number of examples per class
        self.num_samples = num_samples
        # take just a subset of MNIST dataset
        self.digits = digits if digits else list(range(10))

        self.cache_folder_name = 'digits-' + '-'.join(str(d) for d in sorted(self.digits)) \
                                 + f'_nb-{self.num_samples}'

        super().__init__(
            root,
            train=train,
            normalize=normalize,
            download=download
        )

    @staticmethod
    def _prepare_subset(full_data: torch.Tensor, full_targets: torch.Tensor,
                        num_samples: int, digits: Sequence):
        """Pick the first ``num_samples`` examples of each of ``digits``, keeping the dataset order.

        Returns:
            The selected rows of ``full_data`` and the matching entries of ``full_targets``.
        """
        classes = {d: 0 for d in digits}
        indexes = []
        for idx, target in enumerate(full_targets):
            label = target.item()
            # skip labels which are not requested (infinite count) or already complete
            if classes.get(label, float('inf')) >= num_samples:
                continue
            indexes.append(idx)
            classes[label] += 1
            # stop scanning as soon as every requested class is complete
            if all(classes[k] >= num_samples for k in classes):
                break
        data = full_data[indexes]
        targets = full_targets[indexes]
        return data, targets

    def prepare_data(self, download: bool) -> None:
        """Create the cached subset files from the complete dataset unless they exist already.

        Raises:
            AssertionError: if a file of the complete dataset is missing (and was not downloaded)
        """
        if self._check_exists(self.cached_folder_path):
            return

        # `super().cached_folder_path` can not be used for the complete dataset: the inherited
        # property reads the instance-level `cache_folder_name`, so it points to the subset folder.
        # The complete files live in the folder named by the class-level default of `MNIST`.
        full_folder = os.path.join(self.root, 'MNIST', MNIST.cache_folder_name)
        if download:
            self._download(full_folder)

        for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME):
            path_fname = os.path.join(full_folder, fname)
            assert os.path.isfile(path_fname), 'Missing cached file: %s' % path_fname
            data, targets = _try_load(path_fname)
            data, targets = self._prepare_subset(data, targets, self.num_samples, self.digits)
            # the subset folder is separate from the download folder, so it has to be created here
            os.makedirs(self.cached_folder_path, exist_ok=True)
            torch.save((data, targets), os.path.join(self.cached_folder_path, fname))
|
2020-06-15 21:05:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
class AverageDataset(Dataset):
    """Synthetic sequence dataset made of random inputs and targets derived from them.

    Inputs are drawn from a normal distribution with shape ``(dataset_len, sequence_len, 10)``.
    The last dimension is cut into two halves; the target is the first half plus the
    second half rolled by one position along the last dimension.
    """

    def __init__(self, dataset_len=300, sequence_len=100):
        self.input_seq = torch.randn(dataset_len, sequence_len, 10)
        first_half, second_half = torch.chunk(self.input_seq, 2, dim=-1)
        rolled_half = torch.roll(second_half, shifts=1, dims=-1)
        self.output_seq = first_half + rolled_half
        self.sequence_len = sequence_len
        self.dataset_len = dataset_len

    def __len__(self):
        return self.dataset_len

    def __getitem__(self, item):
        sample = self.input_seq[item]
        target = self.output_seq[item]
        return sample, target
|