From a60562f208b4565f8cafc07cfcadcc719f889baa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 9 Jul 2020 23:51:18 +0200 Subject: [PATCH] Update project CLI hashes, directories, skipping (#5741) * Update project CLI hashes, directories, skipping * Improve clone success message * Remove unused context args * Move project-specific utils to project utils The hashing/checksum functions may not end up being general-purpose functions and are more designed for the projects, so they shouldn't live in spacy.util * Improve run help and add workflows * Add note re: directory checksum speed * Fix cloning from subdirectories and output messages * Remove hard-coded dirs --- spacy/cli/project/assets.py | 8 ++++-- spacy/cli/project/clone.py | 34 +++++++++-------------- spacy/cli/project/dvc.py | 6 +++-- spacy/cli/project/run.py | 54 ++++++++++++++++++++++++------------- spacy/cli/project/util.py | 40 +++++++++++++++++++++++++-- spacy/schemas.py | 7 ++--- spacy/util.py | 27 +++---------------- 7 files changed, 103 insertions(+), 73 deletions(-) diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 0ef3419f3..2270574ab 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -6,9 +6,9 @@ import tqdm import re import shutil -from ...util import ensure_path, get_checksum, working_dir +from ...util import ensure_path, working_dir from .._app import project_cli, Arg -from .util import PROJECT_FILE, load_project_config +from .util import PROJECT_FILE, load_project_config, get_checksum # TODO: find a solution for caches @@ -94,6 +94,10 @@ def fetch_asset( if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") return dest_path + # We might as well support the user here and create parent directories in + # case the asset dir isn't listed as a dir to create in the project.yml + if not dest_path.parent.exists(): + dest_path.parent.mkdir(parents=True) with working_dir(project_path): url = convert_asset_url(url) try: diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index ca85bfb22..6ce2d309e 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -3,23 +3,12 @@ from pathlib import Path from wasabi import msg import subprocess import shutil +import re from ... import about from ...util import ensure_path, run_command, make_tempdir from .._app import project_cli, Arg, Opt, COMMAND - - -DIRS = [ - "assets", - "metas", - "configs", - "packages", - "metrics", - "scripts", - "notebooks", - "training", - "corpus", -] +from .util import PROJECT_FILE @project_cli.command("clone") @@ -50,6 +39,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N dest = ensure_path(dest) check_clone(name, dest, repo) project_dir = dest.resolve() + repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" @@ -64,16 +54,16 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N run_command(["git", "-C", str(tmp_dir), "fetch"]) run_command(["git", "-C", str(tmp_dir), "checkout"]) except subprocess.CalledProcessError: - err = f"Could not clone '{name}' in the repo '{repo}'." + err = f"Could not clone '{name}' from repo '{repo_name}'" msg.fail(err) - shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) - msg.good(f"Cloned project '{name}' from {repo} into {project_dir}") - for sub_dir in DIRS: - dir_path = project_dir / sub_dir - if not dir_path.exists(): - dir_path.mkdir(parents=True) - msg.good(f"Your project is now ready!", dest) - print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") + # We need Path(name) to make sure we also support subdirectories + shutil.move(str(tmp_dir / Path(name)), str(project_dir)) + msg.good(f"Cloned '{name}' from {repo_name}", project_dir) + if not (project_dir / PROJECT_FILE).exists(): + msg.warn(f"No {PROJECT_FILE} found in directory") + else: + msg.good(f"Your project is now ready!") + print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") def check_clone(name: str, dest: Path, repo: str) -> None: diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index dce97179e..c29618820 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -5,9 +5,9 @@ import subprocess from pathlib import Path from wasabi import msg -from .util import PROJECT_FILE, load_project_config +from .util import PROJECT_FILE, load_project_config, get_hash from .._app import project_cli, Arg, Opt, NAME, COMMAND -from ...util import get_hash, working_dir, split_command, join_command, run_command +from ...util import working_dir, split_command, join_command, run_command DVC_CONFIG = "dvc.yaml" @@ -116,6 +116,8 @@ def update_dvc_config( outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] + if command.get("no_skip"): + dvc_cmd.append("--always-changed") full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] dvc_commands.append(join_command(full_cmd)) with working_dir(path): diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index db7633ade..a8cc58c01 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -1,22 +1,18 @@ from typing import Optional, List, Dict, Sequence, Any from pathlib import Path from wasabi import msg -import typer import sys import srsly -from ...util import working_dir, run_command, split_command, is_cwd, get_checksum -from ...util import get_hash, join_command +from ...util import working_dir, run_command, split_command, is_cwd, join_command from .._app import project_cli, Arg, Opt, COMMAND -from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config +from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash +from .util import get_checksum -@project_cli.command( - "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) +@project_cli.command("run") def project_run_cli( # fmt: off - ctx: typer.Context, subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), @@ -32,7 +28,7 @@ def project_run_cli( if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry) + project_run(project_dir, subcommand, force=force, dry=dry) def project_run( @@ -73,7 +69,8 @@ def project_run( else: msg.divider(subcommand) run_commands(cmd["script"], variables, dry=dry) - update_lockfile(current_dir, cmd, variables) + if not dry: + update_lockfile(current_dir, cmd, variables) def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: @@ -87,19 +84,35 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: config = load_project_config(project_dir) config_commands = config.get("commands", []) commands = {cmd["name"]: cmd for cmd in config_commands} + workflows = config.get("workflows", {}) project_loc = "" if is_cwd(project_dir) else project_dir if subcommand: - validate_subcommand(commands.keys(), subcommand) + validate_subcommand(commands.keys(), workflows.keys(), subcommand) print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") - help_text = commands[subcommand].get("help") - if help_text: - msg.text(f"\n{help_text}\n") + if subcommand in commands: + help_text = commands[subcommand].get("help") + if help_text: + print(f"\n{help_text}\n") + elif subcommand in workflows: + steps = workflows[subcommand] + print(f"\nWorkflow consisting of {len(steps)} commands:") + steps_data = [ + (f"{i + 1}. {step}", commands[step].get("help", "")) + for i, step in enumerate(steps) + ] + msg.table(steps_data) + help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help" + print(f"For command details, run: {help_cmd}") else: - print(f"\nAvailable commands in {PROJECT_FILE}") - print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") - msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") - print(f"{COMMAND} project run {project_loc}") + print("") + if config_commands: + print(f"Available commands in {PROJECT_FILE}") + print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") + msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) + if workflows: + print(f"Available workflows in {PROJECT_FILE}") + print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}") + msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()]) def run_commands( @@ -179,6 +192,9 @@ def check_rerun( if command["name"] not in data: # We don't have info about this command return True entry = data[command["name"]] + # Always run commands with no outputs (otherwise they'd always be skipped) + if not entry.get("outs", []): + return True # If the entry in the lockfile matches the lockfile entry that would be # generated from the current command, we don't rerun because it means that # all inputs/outputs, hashes and scripts are the same and nothing changed diff --git a/spacy/cli/project/util.py b/spacy/cli/project/util.py index 5f2dc59ee..1111ddc2d 100644 --- a/spacy/cli/project/util.py +++ b/spacy/cli/project/util.py @@ -1,7 +1,8 @@ -from typing import Dict, Any +from typing import Dict, Any, Union from pathlib import Path from wasabi import msg import srsly +import hashlib from ...schemas import ProjectConfigSchema, validate @@ -11,7 +12,8 @@ PROJECT_LOCK = "project.lock" def load_project_config(path: Path) -> Dict[str, Any]: - """Load the project.yml file from a directory and validate it. + """Load the project.yml file from a directory and validate it. Also make + sure that all directories defined in the config exist. path (Path): The path to the project directory. RETURNS (Dict[str, Any]): The loaded project.yml. @@ -28,6 +30,11 @@ def load_project_config(path: Path) -> Dict[str, Any]: if errors: msg.fail(invalid_err, "\n".join(errors), exits=1) validate_project_commands(config) + # Make sure directories defined in config exist + for subdir in config.get("directories", []): + dir_path = path / subdir + if not dir_path.exists(): + dir_path.mkdir(parents=True) return config @@ -55,3 +62,32 @@ def validate_project_commands(config: Dict[str, Any]) -> None: f"section of the {PROJECT_FILE}.", exits=1, ) + + +def get_hash(data) -> str: + """Get the hash for a JSON-serializable object. + + data: The data to hash. + RETURNS (str): The hash. + """ + data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") + return hashlib.md5(data_str).hexdigest() + + +def get_checksum(path: Union[Path, str]) -> str: + """Get the checksum for a file or directory given its file path. If a + directory path is provided, this uses all files in that directory. + + path (Union[Path, str]): The file or directory path. + RETURNS (str): The checksum. + """ + path = Path(path) + if path.is_file(): + return hashlib.md5(Path(path).read_bytes()).hexdigest() + if path.is_dir(): + # TODO: this is currently pretty slow + dir_checksum = hashlib.md5() + for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): + dir_checksum.update(sub_file.read_bytes()) + return dir_checksum.hexdigest() + raise ValueError(f"Can't get checksum for {path}: not a file or directory") diff --git a/spacy/schemas.py b/spacy/schemas.py index b7307b5b2..c67814dfd 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel): name: StrictStr = Field(..., title="Name of command") help: Optional[StrictStr] = Field(None, title="Command description") script: List[StrictStr] = Field([], title="List of CLI commands to run, in order") - deps: List[StrictStr] = Field([], title="Data Version Control dependencies") - outputs: List[StrictStr] = Field([], title="Data Version Control outputs") - outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)") + deps: List[StrictStr] = Field([], title="File dependencies required by this command") + outputs: List[StrictStr] = Field([], title="Outputs produced by this command") + outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)") + no_skip: bool = Field(False, title="Never skip this command, even if nothing changed") # fmt: on class Config: diff --git a/spacy/util.py b/spacy/util.py index 071d81f2f..4ed002f37 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,7 +20,6 @@ import subprocess from contextlib import contextmanager import tempfile import shutil -import hashlib import shlex try: @@ -511,25 +510,6 @@ def make_tempdir(): warnings.warn(Warnings.W091.format(dir=d, msg=e)) -def get_hash(data) -> str: - """Get the hash for a JSON-serializable object. - - data: The data to hash. - RETURNS (str): The hash. - """ - data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") - return hashlib.md5(data_str).hexdigest() - - -def get_checksum(path: Union[Path, str]) -> str: - """Get the checksum for a file given its file path. - - path (Union[Path, str]): The file path. - RETURNS (str): The checksum. - """ - return hashlib.md5(Path(path).read_bytes()).hexdigest() - - def is_cwd(path: Union[Path, str]) -> bool: """Check whether a path is the current working directory. @@ -756,12 +736,12 @@ def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False): pass else: yield subbatch - + def _batch_by_length(seqs, max_words): - """Given a list of sequences, return a batched list of indices into the + """Given a list of sequences, return a batched list of indices into the list, where the batches are grouped by length, in descending order. - + Batches may be at most max_words in size, defined as max sequence length * size. """ # Use negative index so we can get sort by position ascending. @@ -785,6 +765,7 @@ def _batch_by_length(seqs, max_words): batches.reverse() return batches + def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False): """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by