2020-08-23 16:32:09 +00:00
|
|
|
from pathlib import Path
|
|
|
|
from wasabi import msg
|
|
|
|
from .remote_storage import RemoteStorage
|
|
|
|
from .remote_storage import get_command_hash
|
2021-08-02 16:13:53 +00:00
|
|
|
from .._util import project_cli, Arg, logger
|
2020-08-23 16:32:09 +00:00
|
|
|
from .._util import load_project_config
|
2020-08-24 01:27:09 +00:00
|
|
|
from .run import update_lockfile
|
2020-08-23 16:32:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
@project_cli.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://spacy.io/api/cli#project-pull
    """
    # project_pull is a generator: each item is a (url, output_path) pair for
    # one output it attempted to pull.
    for source_url, pulled_path in project_pull(project_dir, remote):
        if source_url is None:
            # No URL came back for this output, so there is nothing to report.
            # NOTE(review): presumably None means the remote had no matching
            # file -- confirm against RemoteStorage.pull.
            continue
        msg.good(f"Pulled {pulled_path} from {source_url}")
|
|
|
|
|
|
|
|
|
|
|
|
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Pull the outputs of the project's commands from a remote storage.

    This is a generator. For every command whose dependencies all exist under
    project_dir, each of its outputs is requested from the storage and a
    (url, output_path) pair is yielded, where url is whatever
    RemoteStorage.pull returned for that output. Once all outputs of a
    command exist locally, the project lockfile is updated for that command.

    project_dir (Path): The project directory containing the project config.
    remote (str): Name of an entry in the config's "remotes" section, or a
        storage path that is used directly if no such entry exists.
    verbose (bool): Accepted for interface compatibility; not read here.
    YIELDS (tuple): The (url, output_path) pair for each attempted output.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    remote_aliases = config.get("remotes", {})
    if remote in remote_aliases:
        # The given name is an alias: swap it for the configured storage path.
        remote = remote_aliases[remote]
    storage = RemoteStorage(project_dir, remote)
    pending = list(config.get("commands", []))
    # The commands can be listed in any order, and one command may depend on
    # files produced by a command that appears later. So we rescan the pending
    # commands from the start every time one of them has been handled.
    while pending:
        for index, command in enumerate(list(pending)):
            logger.debug("CMD: %s.", command["name"])
            dep_paths = [project_dir / dep for dep in command.get("deps", [])]
            if not all(path.exists() for path in dep_paths):
                logger.debug(
                    "Dependency missing. Skipping %s outputs.", command["name"]
                )
                continue
            command_hash = get_command_hash("", "", dep_paths, command["script"])
            for output_path in command.get("outputs", []):
                url = storage.pull(output_path, command_hash=command_hash)
                logger.debug(
                    "URL: %s for %s with command hash %s",
                    url,
                    output_path,
                    command_hash,
                )
                yield url, output_path

            output_locs = [project_dir / out for out in command.get("outputs", [])]
            if all(location.exists() for location in output_locs):
                update_lockfile(project_dir, command)
            # This command is done. Drop it from the pending list and restart
            # the scan, since its outputs may satisfy another command's deps.
            del pending[index]
            break
        else:
            # A full pass handled nothing: every remaining command is missing
            # a dependency, so stop instead of looping forever.
            break
|