spaCy/spacy/cli/project.py

455 lines
17 KiB
Python
Raw Normal View History

2020-06-27 11:02:10 +00:00
from typing import List, Dict, Any, Optional
import typer
import srsly
from pathlib import Path
from wasabi import msg
2020-06-21 22:30:05 +00:00
import subprocess
import shlex
2020-06-21 22:15:06 +00:00
import os
import re
2020-06-22 12:53:31 +00:00
import shutil
2020-06-25 10:26:53 +00:00
import sys
2020-06-27 11:02:10 +00:00
import murmurhash
2020-06-27 12:15:41 +00:00
import hashlib
2020-06-27 11:02:10 +00:00
from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
2020-06-22 12:53:31 +00:00
from ..util import ensure_path, run_command, make_tempdir, working_dir
2020-06-21 19:35:01 +00:00
# Name of the project definition file expected inside a project directory.
CONFIG_FILE = "project.yml"
# Name of the DVC pipeline file that is generated from the project config.
DVC_CONFIG = "dvc.yaml"
# Sub-directories that are created in a freshly cloned project if missing.
DIRS = [
    "assets",
    "metas",
    "configs",
    "packages",
    "metrics",
    "scripts",
    "notebooks",
    "training",
    "corpus",
]
# Cache locations. The TORCH_HOME entry is None when that environment variable
# is not set. NOTE(review): not referenced in the visible part of this file.
CACHES = [
    Path.home() / ".torch",
    Path.home() / ".caches" / "torch",
    os.environ.get("TORCH_HOME"),
    Path.home() / ".keras",
]
# Comment block written below the hash line at the top of the generated
# DVC config file.
DVC_CONFIG_COMMENT = (
    "# This file is auto-generated by spaCy based on your project.yml. Do not edit\n"
    "# it directly and edit the project.yml instead and re-run the project."
)

# Typer sub-application that holds all "project" commands.
project_cli = typer.Typer(help="Command-line interface for spaCy projects")
2020-06-21 22:15:06 +00:00
@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
    """This runs before every project command and ensures DVC is installed and
    everything is up to date.
    """
    try:
        # check=True: a "dvc" executable that exists but exits with an error
        # is treated the same as a missing one instead of being ignored.
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL, check=True)
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing or non-executable binary, SubprocessError
        # covers a non-zero exit status.
        msg.fail(
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to fetch"),
    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
    no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
    verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
    # fmt: on
):
    """Clone a project template from a repository."""
    # The CLI entry point always passes silent=True to the underlying clone.
    clone_options = {
        "repo": repo,
        "git": git,
        "no_init": no_init,
        "verbose": verbose,
        "silent": True,
    }
    project_clone(name, dest, **clone_options)
2020-06-21 22:15:06 +00:00
2020-06-22 12:53:31 +00:00
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    git: bool = False,
    no_init: bool = False,
    silent: bool = False,
    verbose: bool = False,
) -> None:
    """Clone a project template into a new directory.

    name (str): Name (path) of the template within the repository.
    dest (Path): Directory to create. Must not exist yet, its parent must.
    repo (str): Repository passed to "dvc get".
    git (bool): Forwarded to project_init.
    no_init (bool): Skip the project_init step.
    silent (bool): Add --quiet to the DVC call and forward to project_init.
    verbose (bool): Add --verbose to the DVC call.
    """
    dest = ensure_path(dest)
    check_clone_dest(dest)
    # When cloning a subdirectory with DVC, it will create a folder of that name
    # within the destination dir, so we use a tempdir and then copy it into the
    # parent directory to create the cloned directory
    dest = dest.resolve()
    extra_flags = []
    if verbose:
        extra_flags.append("--verbose")
    if silent:
        extra_flags.append("--quiet")
    with make_tempdir() as tmp_dir:
        cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir), *extra_flags]
        print(" ".join(cmd))
        run_command(cmd)
        cloned_dir = tmp_dir / Path(name).name
        shutil.move(str(cloned_dir), str(dest))
    msg.good(f"Cloned project '{name}' from {repo}")
    # Make sure the standard project sub-directories are present
    missing_dirs = [dest / sub_dir for sub_dir in DIRS if not (dest / sub_dir).exists()]
    for dir_path in missing_dirs:
        dir_path.mkdir(parents=True)
    if not no_init:
        project_init(dest, git=git, silent=silent)
    msg.good("Your project is now ready!", dest)
    print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}")
@project_cli.command("init")
def project_init_cli(
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
):
    """Initialize a project directory with DVC and Git (optional). This should
    typically be taken care of automatically when you run the "project clone"
    command.
    """
    # The CLI entry point always initializes with silent=True.
    project_init(dest=path, git=git, silent=True)
2020-06-27 12:15:41 +00:00
2020-06-27 12:40:28 +00:00
def project_init(
    dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False
):
    """Initialize a project directory with DVC and optionally Git.

    dest (Path): The project directory.
    git (bool): Run "git init" first; otherwise DVC is initialized with --no-scm.
    silent (bool): Pass --quiet to "dvc init".
    analytics (bool): If False, "core.analytics" is set to "false" in the
        DVC config after initialization.
    """
    # Resolve before changing the working directory: a relative path would
    # otherwise point to the wrong location once we're inside the project.
    dest = Path(dest).resolve()
    with working_dir(dest):
        init_cmd = ["dvc", "init"]
        if silent:
            init_cmd.append("--quiet")
        if git:
            run_command(["git", "init"])
        else:
            init_cmd.append("--no-scm")
        run_command(init_cmd)
        if not analytics:
            # TODO: find a better solution for this?
            run_command(["dvc", "config", "core.analytics", "false"])
    config = load_project_config(dest)
    setup_check_dvc(dest, config)
2020-06-21 22:15:06 +00:00
2020-06-27 11:02:10 +00:00
@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    # fmt: on
):
    """Use Data Version Control to get the assets for the project."""
    # Thin CLI wrapper around the Python-level function.
    project_assets(project_path=path)
2020-06-21 22:15:06 +00:00
2020-06-27 12:15:41 +00:00
def project_assets(project_path: Path) -> None:
    """Fetch all assets listed in the project config.

    project_path (Path): The project directory containing the project config.
    """
    project_path = ensure_path(project_path)
    config = load_project_config(project_path)
    setup_check_dvc(project_path, config)
    assets = config.get("assets", {})
    if not assets:
        # Nothing to download: warn and exit with status 0
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    for entry in assets:
        # Both the URL and the destination may contain {variable} placeholders
        asset_url = entry["url"].format(**variables)
        asset_dest = entry["dest"].format(**variables)
        fetch_asset(project_path, asset_url, asset_dest, entry.get("checksum"))
2020-06-27 19:13:06 +00:00
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Download a single asset into the project and track it with DVC.

    The download is attempted with "dvc import-url" first. If that fails,
    "dvc get-url" is tried, and if that fails as well, curl is used. After
    either fallback the file is added with "dvc add".

    project_path (Path): The project directory.
    url (str): The URL to download from.
    dest (Path): Destination, joined onto the project directory.
    checksum (Optional[str]): MD5 checksum to compare against, if available.
    """
    check_asset(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
            print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
        except subprocess.CalledProcessError:
            # The fallbacks are nested: a second "except" clause on the same
            # "try" would never run, so a failing "dvc get-url" would escape
            # instead of falling through to curl.
            try:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
            except subprocess.CalledProcessError:
                # TODO: replace curl
                run_command(["curl", url, "--output", str(dest_path), "--progress-bar"])
            # Neither fallback tracks the file, so add it explicitly
            run_command(["dvc", "add", str(dest_path)])
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
2020-06-27 11:02:10 +00:00
@project_cli.command(
    "run-all",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run all commands. Additional arguments are passed to dvc repro."""
    if not show_help:
        # Unknown extra CLI arguments are collected in ctx.args and forwarded
        project_run_all(project_dir, *ctx.args)
        return
    print_run_help(project_dir)
2020-06-27 11:02:10 +00:00
def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Run "dvc repro" from within the project directory.

    project_dir (Path): The project directory.
    *dvc_args: Extra arguments appended to the "dvc repro" call.
    """
    project_config = load_project_config(project_dir)
    setup_check_dvc(project_dir, project_config)
    repro_cmd = ["dvc", "repro"]
    repro_cmd.extend(dvc_args)
    with working_dir(project_dir):
        run_command(repro_cmd)
2020-06-27 11:02:10 +00:00
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(None, help="Name of command defined in project config"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run scripts defined in the project."""
    wants_help = show_help or not subcommand
    if wants_help:
        # Without a subcommand there is nothing to run, so show the overview
        print_run_help(project_dir, subcommand)
        return
    project_run(project_dir, subcommand, *ctx.args)
2020-06-22 12:53:31 +00:00
2020-06-27 11:02:10 +00:00
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): Command to show help for. If not set, a table
        of all commands defined in the config is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if not subcommand:
        # Overview of everything that's available
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
        rows = [(cmd["name"], cmd.get("help", "")) for cmd in config_commands]
        msg.table(rows)
        return
    if subcommand not in commands:
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
    help_text = commands[subcommand].get("help")
    if help_text:
        msg.text(f"\n{help_text}\n")
2020-06-21 19:35:01 +00:00
2020-06-27 11:02:10 +00:00
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a named command defined in the project config.

    Commands listed under "run" in the config are executed via "dvc repro".
    All other commands have their script executed directly, after checking
    that the dependencies they declare exist.

    project_dir (Path): The project directory.
    subcommand (str): Name of the command to run.
    *dvc_args: Extra arguments appended to the "dvc repro" call.
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if subcommand not in commands:
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    if subcommand in config.get("run", []):
        # This is one of the pipeline commands tracked in DVC
        dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
        # Run from within the project directory, consistent with
        # project_run_all, so the call doesn't depend on the caller's cwd
        with working_dir(project_dir):
            run_command(dvc_cmd)
    else:
        cmd = commands[subcommand]
        # Deps in non-DVC commands aren't tracked, but if they're defined,
        # make sure they exist before running the command
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                msg.fail(err, exits=1)
        with working_dir(project_dir):
            run_commands(cmd["script"], variables)
2020-06-27 11:02:10 +00:00
@project_cli.command("exec")
def project_exec_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(..., help="Name of command defined in project config"),
    # fmt: on
):
    """Internals"""
    # This is the command that the generated DVC stages invoke
    # ("python -m ... project exec . NAME", see update_dvc_config).
    project_exec(project_dir=project_dir, subcommand=subcommand)
def project_exec(project_dir: Path, subcommand: str):
    """Execute the script of a command defined in the project config.

    project_dir (Path): The project directory.
    subcommand (str): Name of the command whose script should be run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if subcommand not in commands:
        # Same error as project_run, instead of an unhandled KeyError
        msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Update the auto-generated DVC config file based on the project config."""
    config = load_project_config(project_dir)
    updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
    if updated:
        msg.good(f"Updated DVC config from {CONFIG_FILE}")
    else:
        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
# Register the project commands on the main app under the name "project".
app.add_typer(project_cli, name="project")
2020-06-21 19:35:01 +00:00
def load_project_config(path: Path) -> Dict[str, Any]:
    """Read and validate the project config file in a project directory.

    path (Path): The project directory.
    RETURNS (Dict[str, Any]): The loaded config.
    """
    config_path = path / CONFIG_FILE
    if not config_path.exists():
        msg.fail("Can't find project config", config_path, exits=1)
    config = srsly.read_yaml(config_path)
    # The validation result is a list of error messages
    schema_errors = validate(ProjectConfigSchema, config)
    if schema_errors:
        title = f"Invalid project config in {CONFIG_FILE}"
        details = "\n".join(schema_errors)
        msg.fail(title, details, exits=1)
    return config
2020-06-27 11:02:10 +00:00
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config.

    path (Path): The project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Add --verbose to the generated "dvc run" calls.
    silent (bool): Add --quiet to the generated "dvc run" calls.
    force (bool): Regenerate even if the stored config hash is unchanged.
    RETURNS (bool): Whether the DVC config file was (re-)written.
    """
    config_hash = get_hash(config)
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    commands = []
    # We only want to include commands that are part of the main list of "run"
    # commands in project.yml and should be run in sequence
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in config.get("run", []):
        if name not in config_commands:
            msg.fail(f"Can't find command '{name}' in project config", exits=1)
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to "." as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
        deps_cmd = [arg for p in deps for arg in ("-d", p)]
        outputs_cmd = [arg for p in outputs for arg in ("-o", p)]
        outputs_nc_cmd = [arg for p in outputs_no_cache for arg in ("-O", p)]
        dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
        if verbose:
            dvc_cmd.append("--verbose")
        if silent:
            dvc_cmd.append("--quiet")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        # The string is split again with shlex in run_commands, so quote each
        # part to keep arguments containing spaces (e.g. paths) intact
        commands.append(" ".join(shlex.quote(part) for part in full_cmd))
    with working_dir(path):
        run_commands(commands, variables, silent=True)
    if not dvc_config_path.exists():
        # No file was produced (e.g. no pipeline commands with deps or outputs),
        # so there's nothing to add the header to
        return False
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        # Prepend the config hash (read back above on the next run) and the
        # auto-generated notice
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
2020-06-28 10:24:59 +00:00
def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None:
    """Check that the project has a .dvc directory and bring the generated DVC
    config up to date with the project config.

    project_path (Path): The project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    dvc_dir = project_path / ".dvc"
    if not dvc_dir.exists():
        msg.fail("Project not initialized as a DVC project", exits=1)
    with msg.loading("Updating DVC config..."):
        was_updated = update_dvc_config(project_path, config, silent=True)
    if was_updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
2020-06-27 11:02:10 +00:00
def run_commands(
    commands: List[str] = tuple(),
    variables: Optional[Dict[str, str]] = None,
    silent: bool = False,
) -> None:
    """Run a sequence of commands, one after the other.

    commands (List[str]): The command strings to run.
    variables (Optional[Dict[str, str]]): Values substituted into the command
        strings via str.format. Defaults to no variables.
    silent (bool): Don't print the commands before running them.
    """
    variables = {} if variables is None else variables
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = shlex.split(command)
        if not command:
            # Empty or whitespace-only entry: nothing to execute
            continue
        # TODO: is this needed / a good idea?
        # Use the current interpreter for "python" and "pip" calls
        if command[0] == "python":
            command[0] = sys.executable
        elif command[0] == "pip":
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(" ".join(command))
        run_command(command)
2020-06-21 22:15:06 +00:00
def check_asset(url: str) -> None:
    """Warn if an asset URL points to a regular GitHub page.

    url (str): The asset URL to check.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    # TODO: support loading from GitHub URLs? Automatically convert to raw?
    # Raw string with an escaped dot, so that only the literal host
    # "github.com" at the start of the URL matches
    if re.match(r"https?://github\.com", url):
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. If you want to "
            "download the raw file, click on 'Download' on the GitHub page "
            "and copy the raw.githubusercontent.com URL instead."
        )
        # url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/").replace("/tree/", "/")
def check_clone_dest(dest: Path) -> None:
    """Check and validate that the destination path can be used to clone.

    dest (Path): The directory the project should be cloned into.
    """
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    # Directory already exists (not allowed, clone needs to create it)
    if dest.exists():
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    # We're not creating parents, parent dir should exist
    if not dest.parent.exists():
        parent_error = f"Can't clone project, parent directory doesn't exist: {dest.parent}"
        msg.fail(parent_error, exits=1)
2020-06-27 11:02:10 +00:00
def get_hash(data) -> str:
    """Get a string hash for JSON-serializable data.

    data: The data to hash. It is dumped to JSON with sorted keys first.
    RETURNS (str): The murmurhash value of the JSON dump, as a string.
    """
    serialized = srsly.json_dumps(data, sort_keys=True)
    hash_value = murmurhash.hash(serialized)
    return str(hash_value)
2020-06-27 12:15:41 +00:00
def get_checksum(path: Path) -> str:
    """Get the MD5 checksum of a file.

    path (Path): The file to hash.
    RETURNS (str): The hex digest of the file's contents.
    """
    file_hash = hashlib.md5()
    # Read in chunks so large asset files don't have to fit into memory
    with path.open("rb") as file_:
        for chunk in iter(lambda: file_.read(1024 * 1024), b""):
            file_hash.update(chunk)
    return file_hash.hexdigest()