From 8cb7f9ccff5da3a5eaeb3c3ebe99214f6673d084 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 7 Jul 2020 20:51:50 +0200 Subject: [PATCH] Improve assets and DVC handling (#5719) * Improve assets and DVC handling * Remove outdated comment [ci skip] --- spacy/cli/project.py | 305 ++++++++++++++++++++++++++++--------------- spacy/schemas.py | 2 +- 2 files changed, 202 insertions(+), 105 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 200471127..33a8ff11a 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Sequence +from typing import List, Dict, Any, Optional, Sequence, Union import typer import srsly from pathlib import Path @@ -18,7 +18,7 @@ from ..util import ensure_path, run_command, make_tempdir, working_dir from ..util import get_hash, get_checksum, split_command -CONFIG_FILE = "project.yml" +PROJECT_FILE = "project.yml" DVC_CONFIG = "dvc.yaml" DVC_DIR = ".dvc" DIRS = [ @@ -38,12 +38,12 @@ CACHES = [ os.environ.get("TORCH_HOME"), Path.home() / ".keras", ] -DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit -# it directly and edit the project.yml instead and re-run the project.""" +DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit +# it directly and edit the {PROJECT_FILE} instead and re-run the project.""" CLI_HELP = f"""Command-line interface for spaCy projects and working with project templates. You'd typically start by cloning a project template to a local directory and fetching its assets like datasets etc. See the project's -{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data +{PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data Version Control) to manage input and output files and to ensure steps are only re-run if their inputs change. 
""" @@ -91,7 +91,7 @@ def project_init_cli( # fmt: off path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), - force: bool = Opt(False, "--force", "-F", help="Force initiziation"), + force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"), # fmt: on ): """Initialize a project directory with DVC and optionally Git. This should @@ -100,7 +100,7 @@ def project_init_cli( be a Git repo, it should be initialized with Git first, before initializing DVC. This allows DVC to integrate with Git. """ - project_init(path, git=git, force=force, silent=True) + project_init(path, git=git, force=force) @project_cli.command("assets") @@ -110,11 +110,11 @@ def project_assets_cli( # fmt: on ): """Use DVC (Data Version Control) to fetch project assets. Assets are - defined in the "assets" section of the project config. If possible, DVC + defined in the "assets" section of the project.yml. If possible, DVC will try to track the files so you can pull changes from upstream. It will also try and store the checksum so the assets are versioned. If the file can't be tracked or checked, it will be downloaded without DVC. If a checksum - is provided in the project config, the file is only downloaded if no local + is provided in the project.yml, the file is only downloaded if no local file with the same checksum exists. """ project_assets(project_dir) @@ -132,7 +132,7 @@ def project_run_all_cli( # fmt: on ): """Run all commands defined in the project. This command will use DVC and - the defined outputs and dependencies in the project config to determine + the defined outputs and dependencies in the project.yml to determine which steps need to be re-run and where to start. This means you're only re-generating data if the inputs have changed. 
@@ -151,12 +151,12 @@ def project_run_all_cli( def project_run_cli( # fmt: off ctx: typer.Context, - subcommand: str = Arg(None, help="Name of command defined in project config"), + subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") # fmt: on ): - """Run a named script defined in the project config. If the command is + """Run a named script defined in the project.yml. If the command is part of the default pipeline defined in the "run" section, DVC is used to determine whether the step should re-run if its inputs have changed, or whether everything is up to date. If the script is not part of the default @@ -175,13 +175,13 @@ def project_run_cli( @project_cli.command("exec", hidden=True) def project_exec_cli( # fmt: off - subcommand: str = Arg(..., help="Name of command defined in project config"), + subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), # fmt: on ): - """Execute a command defined in the project config. This CLI command is + """Execute a command defined in the project.yml. This CLI command is only called internally in auto-generated DVC pipelines, as a shortcut for - multi-step commands in the project config. You typically shouldn't have to + multi-step commands in the project.yml. You typically shouldn't have to call it yourself. To run a command, call "run" or "run-all". """ project_exec(project_dir, subcommand) @@ -196,15 +196,15 @@ def project_update_dvc_cli( # fmt: on ): """Update the auto-generated DVC config file. Uses the steps defined in the - "run" section of the project config. 
This typically happens automatically + "run" section of the project.yml. This typically happens automatically when running a command, but can also be triggered manually if needed. """ config = load_project_config(project_dir) updated = update_dvc_config(project_dir, config, verbose=verbose, force=force) if updated: - msg.good(f"Updated DVC config from {CONFIG_FILE}") + msg.good(f"Updated DVC config from {PROJECT_FILE}") else: - msg.info(f"No changes found in {CONFIG_FILE}, no update needed") + msg.info(f"No changes found in {PROJECT_FILE}, no update needed") app.add_typer(project_cli, name="project") @@ -241,7 +241,7 @@ def project_clone( cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" try: run_command(cmd) - except SystemExit: + except DVCError: err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'." msg.fail(err) with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: @@ -249,7 +249,7 @@ def project_clone( try: run_command(["git", "-C", str(tmp_dir), "fetch"]) run_command(["git", "-C", str(tmp_dir), "checkout"]) - except SystemExit: + except DVCError: err = f"Could not clone '{name}' in the repo '{repo}'." msg.fail(err) shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) @@ -282,27 +282,29 @@ def project_init( with working_dir(project_dir) as cwd: if git: run_command(["git", "init"]) - init_cmd = ["dvc", "init"] - if silent: - init_cmd.append("--quiet") - if not git: - init_cmd.append("--no-scm") - if force: - init_cmd.append("--force") - run_command(init_cmd) + flags = {"--force": force, "--quiet": silent, "--no-scm": not git} + try: + run_dvc_command(["init"], flags=flags) + except DVCError: + msg.fail( + "Failed to initialize project. This likely means that the " + "project is already initialized and has a .dvc directory. " + "To force-initialize, use the --force flag.", + exits=1, + ) # We don't want to have analytics on by default – our users should # opt-in explicitly. 
If they want it, they can always enable it. if not analytics: - run_command(["dvc", "config", "core.analytics", "false"]) - # Remove unused and confusing plot templates from .dvc directory - # TODO: maybe we shouldn't do this, but it's otherwise super confusing - # once you commit your changes via Git and it creates a bunch of files - # that have no purpose + run_dvc_command(["config", "core.analytics", "false"]) + # Remove unused and confusing plot templates from .dvc directory. + # Otherwise super confusing once you commit your changes via Git and it + # creates a bunch of files that have no purpose. plots_dir = cwd / DVC_DIR / "plots" if plots_dir.exists(): shutil.rmtree(str(plots_dir)) config = load_project_config(cwd) setup_check_dvc(cwd, config) + msg.good("Initialized project") def project_assets(project_dir: Path) -> None: @@ -315,19 +317,33 @@ def project_assets(project_dir: Path) -> None: setup_check_dvc(project_path, config) assets = config.get("assets", {}) if not assets: - msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) + msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) fetched_assets = [] for asset in assets: - url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) - fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) + url = asset.get("url") + checksum = asset.get("checksum") + if not url: + # project.yml defines asset without URL that the user has to place + if not Path(dest).exists(): + err = f"No URL provided for asset. 
You need to add this file yourself: {dest}" + msg.warn(err) + else: + if checksum == get_checksum(dest): + msg.good(f"Asset exists with matching checksum: {dest}") + fetched_assets.append((project_path / dest).resolve()) + else: + msg.fail(f"Asset available but with incorrect checksum: {dest}") + continue + url = url.format(**variables) + fetched_path = fetch_asset(project_path, url, dest, checksum) if fetched_path: fetched_assets.append(str(fetched_path)) if fetched_assets: with working_dir(project_path): - run_command(["dvc", "add", *fetched_assets, "--external"]) + run_dvc_command(["add", *fetched_assets, "--external"]) def fetch_asset( @@ -359,19 +375,17 @@ def fetch_asset( # Try with tracking the source first, then just downloading with # DVC, then a regular non-DVC download. try: - dvc_cmd = ["dvc", "import-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: - dvc_cmd = ["dvc", "get-url", url, str(dest_path)] - print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - except subprocess.CalledProcessError: + run_dvc_command(["import-url", url, str(dest_path)]) + except DVCError: + run_dvc_command(["get-url", url, str(dest_path)]) + except DVCError: try: download_file(url, dest_path) except requests.exceptions.HTTPError as e: msg.fail(f"Download failed: {dest}", e) return None if checksum and checksum != get_checksum(dest_path): - msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") + msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") msg.good(f"Fetched asset {dest}") return dest_path @@ -384,13 +398,17 @@ def project_run_all(project_dir: Path, *dvc_args) -> None: """ config = load_project_config(project_dir) setup_check_dvc(project_dir, config) - dvc_cmd = ["dvc", "repro", *dvc_args] with working_dir(project_dir): - run_command(dvc_cmd) + try: + run_dvc_command(["repro", *dvc_args]) + except DVCError: + # We could raise 
a custom error here, but the output produced by + # DVC is already pretty substantial. + sys.exit(1) def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: - """Simulate a CLI help prompt using the info available in the project config. + """Simulate a CLI help prompt using the info available in the project.yml. project_dir (Path): The project directory. subcommand (Optional[str]): The subcommand or None. If a subcommand is @@ -408,15 +426,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: if help_text: msg.text(f"\n{help_text}\n") else: - print(f"\nAvailable commands in {CONFIG_FILE}") + print(f"\nAvailable commands in {PROJECT_FILE}") print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}") msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) - msg.text("Run all commands defined in the 'run' block of the project config:") + msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:") print(f"{COMMAND} project run-all {project_dir}") def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: - """Run a named script defined in the project config. If the script is part + """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to execute the command, so it can determine whether to rerun it. It then calls into "exec" to execute it. @@ -433,9 +451,13 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: validate_subcommand(commands.keys(), subcommand) if subcommand in config.get("run", []): # This is one of the pipeline commands tracked in DVC - dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] with working_dir(project_dir): - run_command(dvc_cmd) + try: + run_dvc_command(["repro", subcommand, *dvc_args]) + except DVCError: + # We could raise a custom error here, but the output produced by + # DVC is already pretty substantial. 
+ sys.exit(1) else: cmd = commands[subcommand] # Deps in non-DVC commands aren't tracked, but if they're defined, @@ -448,8 +470,8 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: run_commands(cmd["script"], variables) -def project_exec(project_dir: Path, subcommand: str): - """Execute a command defined in the project config. +def project_exec(project_dir: Path, subcommand: str) -> None: + """Execute a command defined in the project.yml. project_dir (Path): Path to project directory. subcommand (str): Name of command to run. @@ -468,15 +490,15 @@ def project_exec(project_dir: Path, subcommand: str): def load_project_config(path: Path) -> Dict[str, Any]: - """Load the project config file from a directory and validate it. + """Load the project.yml file from a directory and validate it. path (Path): The path to the project directory. - RETURNS (Dict[str, Any]): The loaded project config. + RETURNS (Dict[str, Any]): The loaded project.yml. """ - config_path = path / CONFIG_FILE + config_path = path / PROJECT_FILE if not config_path.exists(): - msg.fail("Can't find project config", config_path, exits=1) - invalid_err = f"Invalid project config in {CONFIG_FILE}" + msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) + invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." try: config = srsly.read_yaml(config_path) except ValueError as e: @@ -500,7 +522,7 @@ def update_dvc_config( dict, so if any of the config values change, the DVC config is regenerated. path (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. + config (Dict[str, Any]): The loaded project.yml. verbose (bool): Whether to print additional info (via DVC). silent (bool): Don't output anything (via DVC). force (bool): Force update, even if hashes match. 
@@ -514,10 +536,10 @@ def update_dvc_config( with dvc_config_path.open("r", encoding="utf8") as f: ref_hash = f.readline().strip().replace("# ", "") if ref_hash == config_hash and not force: - return False # Nothing has changed in project config, don't need to update + return False # Nothing has changed in project.yml, don't need to update dvc_config_path.unlink() variables = config.get("variables", {}) - commands = [] + dvc_commands = [] # We only want to include commands that are part of the main list of "run" # commands in project.yml and should be run in sequence config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} @@ -535,15 +557,12 @@ def update_dvc_config( deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] - dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"] - if verbose: - dvc_cmd.append("--verbose") - if silent: - dvc_cmd.append("--quiet") + dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"] full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] - commands.append(" ".join(full_cmd)) + dvc_commands.append(" ".join(full_cmd)) with working_dir(path): - run_commands(commands, variables, silent=True) + dvc_flags = {"--verbose": verbose, "--quiet": silent} + run_dvc_commands(dvc_commands, variables, flags=dvc_flags) with dvc_config_path.open("r+", encoding="utf8") as f: content = f.read() f.seek(0, 0) @@ -571,7 +590,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: DVC project. project_dir (Path): The path to the project directory. - config (Dict[str, Any]): The loaded project config. + config (Dict[str, Any]): The loaded project.yml. 
""" if not project_dir.exists(): msg.fail(f"Can't find project directory: {project_dir}") @@ -586,38 +605,7 @@ def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None: with msg.loading("Updating DVC config..."): updated = update_dvc_config(project_dir, config, silent=True) if updated: - msg.good(f"Updated DVC config from changed {CONFIG_FILE}") - - -def run_commands( - commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False -) -> None: - """Run a sequence of commands in a subprocess, in order. - - commands (List[str]): The string commands. - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. - silent (bool): Don't print the commands. - """ - for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) - command = split_command(command) - # Not sure if this is needed or a good idea. Motivation: users may often - # use commands in their config that reference "python" and we want to - # make sure that it's always executing the same Python that spaCy is - # executed with and the pip in the same env, not some other Python/pip. - # Also ensures cross-compatibility if user 1 writes "python3" (because - # that's how it's set up on their system), and user 2 without the - # shortcut tries to re-run the command. - if len(command) and command[0] in ("python", "python3"): - command[0] = sys.executable - elif len(command) and command[0] in ("pip", "pip3"): - command = [sys.executable, "-m", "pip", *command[1:]] - if not silent: - print(f"Running command: {' '.join(command)}") - run_command(command) + msg.good(f"Updated DVC config from changed {PROJECT_FILE}") def convert_asset_url(url: str) -> str: @@ -627,7 +615,7 @@ def convert_asset_url(url: str) -> str: RETURNS (str): The converted URL. 
""" # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match("(http(s?)):\/\/github.com", url): + if re.match(r"(http(s?)):\/\/github.com", url): converted = url.replace("github.com", "raw.githubusercontent.com") converted = re.sub(r"/(tree|blob)/", "/", converted) msg.warn( @@ -679,7 +667,7 @@ def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: """ if subcommand not in commands: msg.fail( - f"Can't find command '{subcommand}' in {CONFIG_FILE}. " + f"Can't find command '{subcommand}' in {PROJECT_FILE}. " f"Available commands: {', '.join(commands)}", exits=1, ) @@ -706,3 +694,112 @@ def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: for data in response.iter_content(chunk_size=chunk_size): size = f.write(data) bar.update(size) + + +def run_commands( + commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False +) -> None: + """Run a sequence of commands in a subprocess, in order. + + commands (List[str]): The string commands. + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + silent (bool): Don't print the commands. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + # Not sure if this is needed or a good idea. Motivation: users may often + # use commands in their config that reference "python" and we want to + # make sure that it's always executing the same Python that spaCy is + # executed with and the pip in the same env, not some other Python/pip. + # Also ensures cross-compatibility if user 1 writes "python3" (because + # that's how it's set up on their system), and user 2 without the + # shortcut tries to re-run the command. 
+ if len(command) and command[0] in ("python", "python3"): + command[0] = sys.executable + elif len(command) and command[0] in ("pip", "pip3"): + command = [sys.executable, "-m", "pip", *command[1:]] + if not silent: + print(f"Running command: {' '.join(command)}") + run_command(command) + + +def run_dvc_commands( + commands: List[str] = tuple(), + variables: Dict[str, str] = {}, + flags: Dict[str, bool] = {}, +) -> None: + """Run a sequence of DVC commands in a subprocess, in order. + + commands (List[str]): The string commands without the leading "dvc". + variables (Dict[str, str]): Dictionary of variable names, mapped to their + values. Will be used to substitute format string variables in the + commands. + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + """ + for command in commands: + # Substitute variables, e.g. "./{NAME}.json" + command = command.format(**variables) + command = split_command(command) + run_dvc_command(command, flags=flags) + + +def run_dvc_command( + command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False +) -> None: + """Run a DVC command in a subprocess. This wrapper gives us a bit more + control over how the output and errors are presented. Raises a DVC error if + the "dvc" command returns a non-zero exit code and uses the error message + logged by DVC. + + command (Union[str, List[str]]): The command, without the leading "dvc". + flags (Dict[str, bool]): Conditional flags to be added to command. Makes it + easier to pass flags like --quiet that depend on a variable or + command-line setting while avoiding lots of nested conditionals. + silent (bool): Don't print any output. 
+ """ + if isinstance(command, str): + command = split_command(command) + dvc_command = ["dvc", *command] + # Add the flags if they are set to True + for flag, is_active in flags.items(): + if is_active: + dvc_command.append(flag) + proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + if not silent: + lines = proc.stdout.read().decode("utf8").split("\n\n") + for line in lines: + line = line.strip() + if is_relevant_dvc_output(line): + print(f"{line}\n") + _, err = proc.communicate() # Important: otherwise returncode will be None! + if proc.returncode != 0: + if isinstance(err, bytes): + err = err.decode("utf8") + raise DVCError(err) + + +def is_relevant_dvc_output(line: str) -> bool: + """Check whether the output by DVC is something we want to keep. + + line (str): A line written to stdout. + RETURNS (bool): Whether to use/print the line. + """ + # Writing them like this for readability but maybe replace with regex? + conditions = [ + not line, + line.startswith("What's next?"), + line.startswith("Having any troubles?"), + ] + return not any(conditions) + + +class DVCError(RuntimeError): + """Custom error type for anything produced by the DVC CLI.""" + + pass diff --git a/spacy/schemas.py b/spacy/schemas.py index 38e08b4cb..ca17fe50b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,7 +222,7 @@ class TrainingSchema(BaseModel): class ProjectConfigAsset(BaseModel): # fmt: off dest: StrictStr = Field(..., title="Destination of downloaded asset") - url: StrictStr = Field(..., title="URL of asset") + url: Optional[StrictStr] = Field(None, title="URL of asset") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") # fmt: on