mirror of https://github.com/explosion/spaCy.git
Support local filesystem remotes for projects (#11762)
* Support local filesystem remotes for projects * Fix support for local filesystem remotes for projects * Use `FluidPath` instead of `Pathy` to support both filesystem and remote paths * Create missing parent directories if required for local filesystem * Add a more general `_file_exists` method to support both `Pathy`, `Path`, and `smart_open`-compatible URLs * Add explicit `smart_open` dependency starting with support for `compression` flag * Update `pathy` dependency to exclude older versions that aren't compatible with required `smart_open` version * Update docs to refer to `Pathy` instead of `smart_open` for project remotes (technically you can still push to any `smart_open`-compatible path but you can't pull from them) * Add tests for local filesystem remotes * Update pathy for general BlobStat sorting * Add import * Remove _file_exists since only Pathy remotes are supported * Format CLI docs * Clean up merge
This commit is contained in:
parent
96c9cf3448
commit
1ebe7db07c
|
@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.1.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.3.5
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -52,7 +52,7 @@ install_requires =
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.8.0
|
||||||
pathy>=0.3.5
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
|
|
|
@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
SDIST_SUFFIX = ".tar.gz"
|
SDIST_SUFFIX = ".tar.gz"
|
||||||
|
@ -331,7 +331,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
|
||||||
"""Upload a file.
|
"""Upload a file.
|
||||||
|
|
||||||
src (Path): The source path.
|
src (Path): The source path.
|
||||||
|
@ -339,13 +339,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||||
"""
|
"""
|
||||||
import smart_open
|
import smart_open
|
||||||
|
|
||||||
|
# Create parent directories for local paths
|
||||||
|
if isinstance(dest, Path):
|
||||||
|
if not dest.parent.exists():
|
||||||
|
dest.parent.mkdir(parents=True)
|
||||||
|
|
||||||
dest = str(dest)
|
dest = str(dest)
|
||||||
with smart_open.open(dest, mode="wb") as output_file:
|
with smart_open.open(dest, mode="wb") as output_file:
|
||||||
with src.open(mode="rb") as input_file:
|
with src.open(mode="rb") as input_file:
|
||||||
output_file.write(input_file.read())
|
output_file.write(input_file.read())
|
||||||
|
|
||||||
|
|
||||||
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
|
def download_file(
|
||||||
|
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
|
||||||
|
) -> None:
|
||||||
"""Download a file using smart_open.
|
"""Download a file using smart_open.
|
||||||
|
|
||||||
url (str): The URL of the file.
|
url (str): The URL of the file.
|
||||||
|
@ -368,7 +375,7 @@ def ensure_pathy(path):
|
||||||
slow and annoying Google Cloud warning)."""
|
slow and annoying Google Cloud warning)."""
|
||||||
from pathy import Pathy # noqa: F811
|
from pathy import Pathy # noqa: F811
|
||||||
|
|
||||||
return Pathy(path)
|
return Pathy.fluid(path)
|
||||||
|
|
||||||
|
|
||||||
def git_checkout(
|
def git_checkout(
|
||||||
|
|
|
@ -5,15 +5,17 @@ import hashlib
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import tarfile
|
import tarfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
from .._util import get_hash, get_checksum, upload_file, download_file
|
||||||
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
from .._util import ensure_pathy, make_tempdir
|
||||||
|
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
||||||
from ...git_info import GIT_VERSION
|
from ...git_info import GIT_VERSION
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
class RemoteStorage:
|
class RemoteStorage:
|
||||||
|
@ -28,7 +30,7 @@ class RemoteStorage:
|
||||||
self.url = ensure_pathy(url)
|
self.url = ensure_pathy(url)
|
||||||
self.compression = compression
|
self.compression = compression
|
||||||
|
|
||||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||||
"""Compress a file or directory within a project and upload it to a remote
|
"""Compress a file or directory within a project and upload it to a remote
|
||||||
storage. If an object exists at the full URL, nothing is done.
|
storage. If an object exists at the full URL, nothing is done.
|
||||||
|
|
||||||
|
@ -49,9 +51,7 @@ class RemoteStorage:
|
||||||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
mode_string = f"w:{self.compression}" if self.compression else "w"
|
||||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||||
tar_file.add(str(loc), arcname=str(path))
|
tar_file.add(str(loc), arcname=str(path))
|
||||||
with tar_loc.open(mode="rb") as input_file:
|
upload_file(tar_loc, url)
|
||||||
with url.open(mode="wb") as output_file:
|
|
||||||
output_file.write(input_file.read())
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def pull(
|
def pull(
|
||||||
|
@ -60,7 +60,7 @@ class RemoteStorage:
|
||||||
*,
|
*,
|
||||||
command_hash: Optional[str] = None,
|
command_hash: Optional[str] = None,
|
||||||
content_hash: Optional[str] = None,
|
content_hash: Optional[str] = None,
|
||||||
) -> Optional["Pathy"]:
|
) -> Optional["FluidPath"]:
|
||||||
"""Retrieve a file from the remote cache. If the file already exists,
|
"""Retrieve a file from the remote cache. If the file already exists,
|
||||||
nothing is done.
|
nothing is done.
|
||||||
|
|
||||||
|
@ -110,25 +110,37 @@ class RemoteStorage:
|
||||||
*,
|
*,
|
||||||
command_hash: Optional[str] = None,
|
command_hash: Optional[str] = None,
|
||||||
content_hash: Optional[str] = None,
|
content_hash: Optional[str] = None,
|
||||||
) -> Optional["Pathy"]:
|
) -> Optional["FluidPath"]:
|
||||||
"""Find the best matching version of a file within the storage,
|
"""Find the best matching version of a file within the storage,
|
||||||
or `None` if no match can be found. If both the creation and content hash
|
or `None` if no match can be found. If both the creation and content hash
|
||||||
are specified, only exact matches will be returned. Otherwise, the most
|
are specified, only exact matches will be returned. Otherwise, the most
|
||||||
recent matching file is preferred.
|
recent matching file is preferred.
|
||||||
"""
|
"""
|
||||||
name = self.encode_name(str(path))
|
name = self.encode_name(str(path))
|
||||||
|
urls = []
|
||||||
if command_hash is not None and content_hash is not None:
|
if command_hash is not None and content_hash is not None:
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
url = self.url / name / command_hash / content_hash
|
||||||
urls = [url] if url.exists() else []
|
urls = [url] if url.exists() else []
|
||||||
elif command_hash is not None:
|
elif command_hash is not None:
|
||||||
|
if (self.url / name / command_hash).exists():
|
||||||
urls = list((self.url / name / command_hash).iterdir())
|
urls = list((self.url / name / command_hash).iterdir())
|
||||||
else:
|
else:
|
||||||
urls = list((self.url / name).iterdir())
|
if (self.url / name).exists():
|
||||||
|
for sub_dir in (self.url / name).iterdir():
|
||||||
|
urls.extend(sub_dir.iterdir())
|
||||||
if content_hash is not None:
|
if content_hash is not None:
|
||||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||||
|
if len(urls) >= 2:
|
||||||
|
try:
|
||||||
|
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||||
|
except Exception:
|
||||||
|
msg.warn(
|
||||||
|
"Unable to sort remote files by last modified. The file(s) "
|
||||||
|
"pulled from the cache may not be the most recent."
|
||||||
|
)
|
||||||
return urls[-1] if urls else None
|
return urls[-1] if urls else None
|
||||||
|
|
||||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
||||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ import math
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from typing import Tuple, List, Dict, Any
|
from typing import Tuple, List, Dict, Any
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
import time
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -28,6 +29,7 @@ from spacy.cli.download import get_compatibility, get_version
|
||||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||||
from spacy.cli.package import get_third_party_dependencies
|
from spacy.cli.package import get_third_party_dependencies
|
||||||
from spacy.cli.package import _is_permitted_package_name
|
from spacy.cli.package import _is_permitted_package_name
|
||||||
|
from spacy.cli.project.remote_storage import RemoteStorage
|
||||||
from spacy.cli.project.run import _check_requirements
|
from spacy.cli.project.run import _check_requirements
|
||||||
from spacy.cli.validate import get_model_pkgs
|
from spacy.cli.validate import get_model_pkgs
|
||||||
from spacy.cli.find_threshold import find_threshold
|
from spacy.cli.find_threshold import find_threshold
|
||||||
|
@ -862,6 +864,60 @@ def test_span_length_freq_dist_output_must_be_correct():
|
||||||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_remote_storage():
|
||||||
|
with make_tempdir() as d:
|
||||||
|
filename = "a.txt"
|
||||||
|
|
||||||
|
content_hashes = ("aaaa", "cccc", "bbbb")
|
||||||
|
for i, content_hash in enumerate(content_hashes):
|
||||||
|
# make sure that each subsequent file has a later timestamp
|
||||||
|
if i > 0:
|
||||||
|
time.sleep(1)
|
||||||
|
content = f"{content_hash} content"
|
||||||
|
loc_file = d / "root" / filename
|
||||||
|
if not loc_file.parent.exists():
|
||||||
|
loc_file.parent.mkdir(parents=True)
|
||||||
|
with loc_file.open(mode="w") as file_:
|
||||||
|
file_.write(content)
|
||||||
|
|
||||||
|
# push first version to remote storage
|
||||||
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
|
remote.push(filename, "aaaa", content_hash)
|
||||||
|
|
||||||
|
# retrieve with full hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with command hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, command_hash="aaaa")
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with content hash
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename, content_hash=content_hash)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
# retrieve with no hashes
|
||||||
|
loc_file.unlink()
|
||||||
|
remote.pull(filename)
|
||||||
|
with loc_file.open(mode="r") as file_:
|
||||||
|
assert file_.read() == content
|
||||||
|
|
||||||
|
|
||||||
|
def test_local_remote_storage_pull_missing():
|
||||||
|
# pulling from a non-existent remote pulls nothing gracefully
|
||||||
|
with make_tempdir() as d:
|
||||||
|
filename = "a.txt"
|
||||||
|
remote = RemoteStorage(d / "root", str(d / "remote"))
|
||||||
|
assert remote.pull(filename, command_hash="aaaa") is None
|
||||||
|
assert remote.pull(filename) is None
|
||||||
|
|
||||||
|
|
||||||
def test_cli_find_threshold(capsys):
|
def test_cli_find_threshold(capsys):
|
||||||
thresholds = numpy.linspace(0, 1, 10)
|
thresholds = numpy.linspace(0, 1, 10)
|
||||||
|
|
||||||
|
|
|
@ -1391,12 +1391,13 @@ If the contents are different, the new version of the file is uploaded. Deleting
|
||||||
obsolete files is left up to you.
|
obsolete files is left up to you.
|
||||||
|
|
||||||
Remotes can be defined in the `remotes` section of the
|
Remotes can be defined in the `remotes` section of the
|
||||||
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
|
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy project push [remote] [project_dir]
|
$ python -m spacy project push [remote] [project_dir]
|
||||||
|
@ -1435,12 +1436,13 @@ outputs, so if you change the config back, you'll be able to fetch back the
|
||||||
result.
|
result.
|
||||||
|
|
||||||
Remotes can be defined in the `remotes` section of the
|
Remotes can be defined in the `remotes` section of the
|
||||||
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
|
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy project pull [remote] [project_dir]
|
$ python -m spacy project pull [remote] [project_dir]
|
||||||
|
|
|
@ -259,9 +259,9 @@ pipelines.
|
||||||
> This can be used in a project command like so:
|
> This can be used in a project command like so:
|
||||||
>
|
>
|
||||||
> ```yaml
|
> ```yaml
|
||||||
> - name: "echo-path"
|
> - name: 'echo-path'
|
||||||
> script:
|
> script:
|
||||||
> - "echo ${env.ENV_PATH}"
|
> - 'echo ${env.ENV_PATH}'
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
|
@ -643,12 +643,13 @@ locally.
|
||||||
|
|
||||||
You can list one or more remotes in the `remotes` section of your
|
You can list one or more remotes in the `remotes` section of your
|
||||||
[`project.yml`](#project-yml) by mapping a string name to the URL of the
|
[`project.yml`](#project-yml) by mapping a string name to the URL of the
|
||||||
storage. Under the hood, spaCy uses the
|
storage. Under the hood, spaCy uses
|
||||||
[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
|
[`Pathy`](https://github.com/justindujardin/pathy) to communicate with the
|
||||||
communicate with the remote storages, so you can use any protocol that
|
remote storages, so you can use any protocol that `Pathy` supports, including
|
||||||
`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
|
[S3](https://aws.amazon.com/s3/),
|
||||||
[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although
|
[Google Cloud Storage](https://cloud.google.com/storage), and the local
|
||||||
you may need to install extra dependencies to use certain protocols.
|
filesystem, although you may need to install extra dependencies to use certain
|
||||||
|
protocols.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols.
|
||||||
remotes:
|
remotes:
|
||||||
default: 's3://my-spacy-bucket'
|
default: 's3://my-spacy-bucket'
|
||||||
local: '/mnt/scratch/cache'
|
local: '/mnt/scratch/cache'
|
||||||
stuff: 'ssh://myserver.example.com/whatever'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
<Infobox title="How it works" emoji="💡">
|
<Infobox title="How it works" emoji="💡">
|
||||||
|
|
Loading…
Reference in New Issue