2023-10-09 15:06:32 +00:00
|
|
|
# Copyright The Lightning AI team.
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
import json
|
|
|
|
import os
|
2023-11-07 19:40:21 +00:00
|
|
|
import sys
|
2023-10-09 15:06:32 +00:00
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import pytest
|
2023-10-13 13:52:23 +00:00
|
|
|
from lightning import seed_everything
|
2023-10-16 13:33:50 +00:00
|
|
|
from lightning.data.streaming.reader import BinaryReader
|
|
|
|
from lightning.data.streaming.sampler import ChunkedIndex
|
2023-11-07 09:46:13 +00:00
|
|
|
from lightning.data.streaming.writer import BinaryWriter
|
|
|
|
from lightning.data.utilities.format import _FORMAT_TO_RATIO
|
2023-10-09 15:06:32 +00:00
|
|
|
from lightning_utilities.core.imports import RequirementCache
|
|
|
|
|
|
|
|
_PIL_AVAILABLE = RequirementCache("PIL")
|
|
|
|
|
|
|
|
|
|
|
|
def test_binary_writer_with_ints_and_chunk_bytes(tmpdir):
    """Write 100 small integer samples with ``chunk_bytes=90`` and read them back.

    The test covers, in order:

    * the errors raised by ``BinaryWriter`` for a missing cache directory and for
      an unknown compression name,
    * the number of entries in ``tmpdir`` before and after ``done()`` / ``merge()``,
    * the chunk sizes recorded in ``index.json``,
    * the round trip of every sample through ``BinaryReader``.
    """
    with pytest.raises(FileNotFoundError, match="The provided cache directory `dontexists` doesn't exist."):
        BinaryWriter("dontexists", {})

    with pytest.raises(ValueError, match="No compresion algorithms are installed."):
        BinaryWriter(tmpdir, {"i": "int"}, compression="something_else")

    binary_writer = BinaryWriter(tmpdir, chunk_bytes=90)

    for i in range(100):
        binary_writer[i] = {"i": i, "i+1": i + 1, "i+2": i + 2}

    assert len(os.listdir(tmpdir)) == 19
    binary_writer.done()
    binary_writer.merge()
    assert len(os.listdir(tmpdir)) == 21

    with open(os.path.join(tmpdir, "index.json")) as f:
        index = json.load(f)

    chunks = index["chunks"]
    assert chunks[0]["chunk_size"] == 6
    assert chunks[1]["chunk_size"] == 5
    assert chunks[-1]["chunk_size"] == 4

    # Cumulative sample counts: entry `k` is the exclusive end offset of chunk `k`.
    chunk_ends = np.cumsum([chunk["chunk_size"] for chunk in chunks])

    # Use exponentiation here: `10 ^ 9` is a bitwise XOR and evaluates to 3.
    reader = BinaryReader(tmpdir, max_cache_size=10**9)
    for i in range(100):
        # Locate the first chunk whose end offset lies beyond sample `i`.
        for chunk_index, chunk_end in enumerate(chunk_ends):
            if i < chunk_end:
                break
        sample = reader.read(ChunkedIndex(i, chunk_index=chunk_index))
        assert sample == {"i": i, "i+1": i + 1, "i+2": i + 2}
|
|
|
|
|
|
|
|
|
|
|
|
def test_binary_writer_with_ints_and_chunk_size(tmpdir):
    """Write 100 integer samples with ``chunk_size=25`` in shuffled order and read them back.

    The first five indices are written in order and the remaining ones in a
    seeded random permutation. After ``done()`` / ``merge()`` the test expects
    five entries in ``tmpdir``, chunks of 25 samples in ``index.json``, and
    sample ``i`` to be readable from chunk ``i // 25``.
    """
    # Fix the seed so the permutation below is reproducible.
    seed_everything(42)

    with pytest.raises(FileNotFoundError, match="The provided cache directory `dontexists` doesn't exist."):
        BinaryWriter("dontexists", {})

    with pytest.raises(ValueError, match="No compresion algorithms are installed."):
        BinaryWriter(tmpdir, {"i": "int"}, compression="something_else")

    binary_writer = BinaryWriter(tmpdir, chunk_size=25)

    indices = list(range(100))
    indices = indices[:5] + np.random.permutation(indices[5:]).tolist()

    for i in indices:
        binary_writer[i] = {"i": i, "i+1": i + 1, "i+2": i + 2}

    assert len(os.listdir(tmpdir)) >= 2
    binary_writer.done()
    binary_writer.merge()
    assert len(os.listdir(tmpdir)) == 5

    with open(os.path.join(tmpdir, "index.json")) as f:
        index = json.load(f)

    chunks = index["chunks"]
    assert chunks[0]["chunk_size"] == 25
    assert chunks[1]["chunk_size"] == 25
    assert chunks[-1]["chunk_size"] == 25

    # Use exponentiation here: `10 ^ 9` is a bitwise XOR and evaluates to 3.
    reader = BinaryReader(tmpdir, max_cache_size=10**9)
    for i in range(100):
        sample = reader.read(ChunkedIndex(i, chunk_index=i // 25))
        assert sample == {"i": i, "i+1": i + 1, "i+2": i + 2}
|
|
|
|
|
|
|
|
|
2023-11-07 19:40:21 +00:00
|
|
|
@pytest.mark.skipif(condition=not _PIL_AVAILABLE or sys.platform == "darwin", reason="Requires: ['pil']")
def test_binary_writer_with_jpeg_and_int(tmpdir):
    """Validate the writer and reader can serialize / deserialize a pair of image and label.

    Each of the 100 samples is a random 28x28 grayscale image, saved as JPEG and
    re-opened with PIL, paired with its integer index. The opened image object is
    what gets handed to the writer. The test then checks the entry counts in the
    cache directory, the chunk sizes in ``index.json``, and the values read back.
    """
    from PIL import Image

    cache_dir = os.path.join(tmpdir, "chunks")
    os.makedirs(cache_dir, exist_ok=True)
    binary_writer = BinaryWriter(cache_dir, chunk_bytes=2 << 12)

    imgs = []

    for i in range(100):
        path = os.path.join(tmpdir, f"img{i}.jpeg")
        np_data = np.random.randint(255, size=(28, 28), dtype=np.uint8)
        img = Image.fromarray(np_data).convert("L")
        img.save(path, format="jpeg", quality=100)
        # Re-open from disk so the reference image is the decoded JPEG.
        img = Image.open(path)
        imgs.append(img)
        binary_writer[i] = {"x": img, "y": i}

    assert len(os.listdir(cache_dir)) == 24
    binary_writer.done()
    binary_writer.merge()
    assert len(os.listdir(cache_dir)) == 26

    with open(os.path.join(cache_dir, "index.json")) as f:
        index = json.load(f)

    chunks = index["chunks"]
    assert chunks[0]["chunk_size"] == 4
    assert chunks[1]["chunk_size"] == 4
    assert chunks[-1]["chunk_size"] == 4
    # Every written sample must be accounted for across the chunks.
    assert sum(chunk["chunk_size"] for chunk in chunks) == 100

    # Use exponentiation here: `10 ^ 9` is a bitwise XOR and evaluates to 3.
    reader = BinaryReader(cache_dir, max_cache_size=10**9)
    for i in range(100):
        sample = reader.read(ChunkedIndex(i, chunk_index=i // 4))
        np.testing.assert_array_equal(np.asarray(sample["x"]).squeeze(0), imgs[i])
        assert sample["y"] == i
|
|
|
|
|
|
|
|
|
2023-11-07 19:40:21 +00:00
|
|
|
@pytest.mark.skipif(condition=not _PIL_AVAILABLE or sys.platform == "darwin", reason="Requires: ['pil']")
def test_binary_writer_with_jpeg_filepath_and_int(tmpdir):
    """Validate the writer and reader can serialize / deserialize a pair of image path and label.

    Each of the 100 samples is a random 28x28 grayscale image saved as JPEG. The
    writer receives the file *path* (not the image object) together with the
    integer index. The re-opened images are kept as the reference for the values
    read back.
    """
    from PIL import Image

    cache_dir = os.path.join(tmpdir, "chunks")
    os.makedirs(cache_dir, exist_ok=True)
    binary_writer = BinaryWriter(cache_dir, chunk_bytes=2 << 12)

    imgs = []

    for i in range(100):
        path = os.path.join(tmpdir, f"img{i}.jpeg")
        np_data = np.random.randint(255, size=(28, 28), dtype=np.uint8)
        img = Image.fromarray(np_data).convert("L")
        img.save(path, format="jpeg", quality=100)
        # Re-open from disk so the reference image is the decoded JPEG.
        img = Image.open(path)
        imgs.append(img)
        binary_writer[i] = {"x": path, "y": i}

    assert len(os.listdir(cache_dir)) == 24
    binary_writer.done()
    binary_writer.merge()
    assert len(os.listdir(cache_dir)) == 26

    with open(os.path.join(cache_dir, "index.json")) as f:
        index = json.load(f)

    chunks = index["chunks"]
    assert chunks[0]["chunk_size"] == 4
    assert chunks[1]["chunk_size"] == 4
    assert chunks[-1]["chunk_size"] == 4
    # Every written sample must be accounted for across the chunks.
    assert sum(chunk["chunk_size"] for chunk in chunks) == 100

    # Use exponentiation here: `10 ^ 9` is a bitwise XOR and evaluates to 3.
    reader = BinaryReader(cache_dir, max_cache_size=10**9)
    for i in range(100):
        sample = reader.read(ChunkedIndex(i, chunk_index=i // 4))
        np.testing.assert_array_equal(np.asarray(sample["x"]).squeeze(0), imgs[i])
        assert sample["y"] == i
|
2023-10-19 12:41:35 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(condition=not _PIL_AVAILABLE, reason="Requires: ['pil']")
def test_binary_writer_with_jpeg_and_png(tmpdir):
    """Check that the writer rejects an item whose data format differs from earlier items.

    Two image samples are written first (one re-opened from a JPEG file, one the
    in-memory image it was saved from). Writing a third sample whose ``"x"`` is
    an integer is expected to raise ``ValueError``.
    """
    from PIL import Image

    chunks_dir = os.path.join(tmpdir, "chunks")
    os.makedirs(chunks_dir, exist_ok=True)

    # Build one random grayscale image and a JPEG-decoded copy of it.
    pixels = np.random.randint(255, size=(28, 28), dtype=np.uint8)
    in_memory_img = Image.fromarray(pixels).convert("L")
    jpeg_path = os.path.join(tmpdir, "img.jpeg")
    in_memory_img.save(jpeg_path, format="jpeg", quality=100)
    decoded_jpeg = Image.open(jpeg_path)

    writer = BinaryWriter(chunks_dir, chunk_bytes=2 << 12)
    writer[0] = {"x": decoded_jpeg, "y": 0}
    writer[1] = {"x": in_memory_img, "y": 1}

    with pytest.raises(ValueError, match="The data format changed between items"):
        writer[2] = {"x": 2, "y": 1}
|
2023-11-02 15:06:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_writer_human_format(tmpdir):
    """Check that human-readable ``chunk_bytes`` strings are converted to byte counts.

    For every suffix in ``_FORMAT_TO_RATIO``, passing ``"1<suffix>"`` must set
    ``_chunk_bytes`` to the mapped ratio. ``"64MB"`` must give 64000000.
    """
    for suffix, expected_bytes in _FORMAT_TO_RATIO.items():
        writer = BinaryWriter(tmpdir, chunk_bytes=f"{1}{suffix}")
        assert writer._chunk_bytes == expected_bytes

    writer = BinaryWriter(tmpdir, chunk_bytes="64MB")
    assert writer._chunk_bytes == 64000000
|