Enable no op optimize (#19490)

This commit is contained in:
thomas chaton 2024-02-16 20:27:20 +00:00 committed by GitHub
parent 53ea76a75c
commit bbc5488a62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 25 additions and 4 deletions

View File

@ -724,7 +724,12 @@ class DataChunkRecipe(DataRecipe):
size = sum([c["dim"] if c["dim"] is not None else c["chunk_size"] for c in config["chunks"]])
num_bytes = sum([c["chunk_bytes"] for c in config["chunks"]])
data_format = tree_unflatten(config["config"]["data_format"], treespec_loads(config["config"]["data_spec"]))
if config["config"] is not None:
data_format = tree_unflatten(
config["config"]["data_format"], treespec_loads(config["config"]["data_spec"])
)
else:
data_format = None
num_chunks = len(config["chunks"])
# The platform can't store more than 1024 entries.
@ -735,7 +740,7 @@ class DataChunkRecipe(DataRecipe):
size=size,
num_bytes=num_bytes,
data_format=data_format,
compression=config["config"]["compression"],
compression=config["config"]["compression"] if config["config"] else None,
num_chunks=len(config["chunks"]),
num_bytes_per_chunk=num_bytes_per_chunk,
)

View File

@ -2,7 +2,7 @@ import io
import os
import urllib
from contextlib import contextmanager
from subprocess import Popen
from subprocess import DEVNULL, Popen
from typing import Any, Callable, List, Optional, Tuple, Union
from lightning.data.constants import _IS_IN_STUDIO, _LIGHTNING_CLOUD_LATEST
@ -134,7 +134,7 @@ def optimize_dns(enable: bool) -> None:
f"sudo /home/zeus/miniconda3/envs/cloudspace/bin/python"
f" -c 'from lightning.data.processing.utilities import _optimize_dns; _optimize_dns({enable})'"
)
Popen(cmd, shell=True).wait() # E501
Popen(cmd, shell=True, stdout=DEVNULL, stderr=DEVNULL).wait() # E501
def _optimize_dns(enable: bool) -> None:

View File

@ -1023,3 +1023,19 @@ def test_map_is_last(num_workers, expected, tmpdir):
)
assert sorted(os.listdir(tmpdir)) == expected
def no_op(index):
    """Accept an item index and do nothing with it; always returns ``None``."""
    return None
def test_empty_optimize(tmpdir):
    """Run ``optimize`` with a do-nothing function over ten inputs.

    The function passed in (``no_op``) produces no output for any item, and
    the test checks that the output directory then contains exactly one
    entry, ``index.json``.
    """
    inputs = list(range(10))
    output_dir = str(tmpdir)

    optimize(no_op, inputs, output_dir=output_dir, chunk_bytes="64MB", num_workers=1)

    produced = os.listdir(tmpdir)
    assert produced == ["index.json"]