mirror of https://github.com/explosion/spaCy.git
Tidy up, document and add custom clone logic
This commit is contained in:
parent
dc7a9be9f8
commit
2f6ee0d018
|
@ -1,62 +0,0 @@
|
||||||
import tempfile
|
|
||||||
import typer
|
|
||||||
from pathlib import Path
|
|
||||||
import subprocess
|
|
||||||
import shlex
|
|
||||||
import shutil
|
|
||||||
from contextlib import contextmanager
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
def make_tempdir():
    """Create a temporary directory and remove it again when the block exits.

    YIELDS (Path): Path to the newly created temporary directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Clean up even if the body of the "with" block raised, so a failed
        # clone or copy doesn't leave a stray directory behind.
        shutil.rmtree(str(d))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def clone_repo(repo, temp_dir):
    """Clone a Git repo into a directory without checking out any files.

    repo (str): The repository to clone, passed straight to "git clone".
    temp_dir (Path): The directory to clone into.
    """
    git_args = ["clone", repo, temp_dir]
    # Don't populate the working tree yet and only fetch the latest commit
    git_args += ["--no-checkout", "--depth", "1"]
    # Enable sparse checkout in the new clone's config
    git_args += ["--config", "core.sparseCheckout=true"]
    subprocess.check_call(["git", *git_args])
|
|
||||||
|
|
||||||
|
|
||||||
def checkout_and_fetch(temp_dir):
    """Run "git fetch" and then "git checkout" inside an existing clone.

    temp_dir (Path): The directory containing the cloned repo.
    """
    # "-C" makes git operate on temp_dir. check_call raises on the first
    # failing command, so "checkout" only runs if "fetch" succeeded.
    for git_action in ("fetch", "checkout"):
        subprocess.check_call(["git", "-C", temp_dir, git_action])
|
|
||||||
|
|
||||||
|
|
||||||
def set_sparse_checkout_dir(temp_dir, subpath):
    """Write the given subpath to the clone's sparse-checkout file.

    temp_dir (Path): The directory containing the cloned repo.
    subpath (str): The path to write to .git/info/sparse-checkout.
    """
    sparse_file = temp_dir / ".git" / "info" / "sparse-checkout"
    # Opening with "w" replaces any existing contents of the file
    with sparse_file.open("w") as handle:
        handle.write(subpath)
|
|
||||||
|
|
||||||
|
|
||||||
def main(repo: str, subpath: str, dest: Path):
    """Copy a single subdirectory of a Git repo into a destination directory,
    using a sparse checkout in a temporary directory.

    repo (str): The repository to clone.
    subpath (str): Path of the subdirectory within the repo.
    dest (Path): Directory that the subdirectory is copied into (the files
        end up in dest / subpath).
    RAISES (FileNotFoundError): If the subpath doesn't exist after checkout.
    """
    with make_tempdir() as temp_dir:
        clone_repo(repo, temp_dir)
        print("After clone", list(temp_dir.iterdir()))
        set_sparse_checkout_dir(temp_dir, subpath)
        checkout_and_fetch(temp_dir)
        print("After checkout", list(temp_dir.iterdir()))
        source_dir = temp_dir / subpath
        # Check for the path itself rather than for membership in the
        # top-level listing, so nested subpaths like "a/b" are accepted.
        # Raise explicitly instead of asserting, since asserts are stripped
        # when Python runs with -O.
        if not source_dir.exists():
            raise FileNotFoundError(
                f"Can't find '{subpath}' in {repo} after sparse checkout"
            )
        shutil.copytree(source_dir, dest / subpath, dirs_exist_ok=True)
    # The temporary directory should be gone once the "with" block has exited
    print("Exists after cleanup?", temp_dir.exists())
    print("Destination", list(dest.iterdir()))


if __name__ == "__main__":
    # Let typer build the command-line interface from main's signature
    typer.run(main)
|
|
|
@ -9,12 +9,12 @@ import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import hashlib
|
|
||||||
|
|
||||||
from ._app import app, Arg, Opt, COMMAND, NAME
|
from ._app import app, Arg, Opt, COMMAND, NAME
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
from ..util import ensure_path, run_command, make_tempdir, working_dir
|
||||||
|
from ..util import get_hash, get_checksum
|
||||||
|
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
CONFIG_FILE = "project.yml"
|
||||||
|
@ -45,19 +45,13 @@ project_cli = typer.Typer(help="Command-line interface for spaCy projects")
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
@project_cli.callback(invoke_without_command=True)
|
||||||
def callback(ctx: typer.Context):
|
def callback(ctx: typer.Context):
|
||||||
"""This runs before every project command and ensures DVC is installed and
|
"""This runs before every project command and ensures DVC is installed."""
|
||||||
everything is up to date.
|
ensure_dvc()
|
||||||
"""
|
|
||||||
try:
|
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
################
|
||||||
except Exception:
|
# CLI COMMANDS #
|
||||||
msg.fail(
|
################
|
||||||
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
|
|
||||||
"You can install the Python package from pip (pip install dvc) or "
|
|
||||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
|
||||||
"documentation: https://dvc.org/doc/install",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("clone")
|
@project_cli.command("clone")
|
||||||
|
@ -68,13 +62,144 @@ def project_clone_cli(
|
||||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Clone a project template from a repository."""
|
"""Clone a project template from a repository. Calls into "git" and will
|
||||||
project_clone(
|
only download the files from the given subdirectory. The GitHub repo
|
||||||
name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose, silent=True
|
defaults to the official spaCy template repo, but can be customized
|
||||||
)
|
(including using a private repo). Setting the --git flag will also
|
||||||
|
initialize the project directory as a Git repo. If the project is intended
|
||||||
|
to be a Git repo, it should be initialized with Git first, before
|
||||||
|
initializing DVC (Data Version Control). This allows DVC to integrate with
|
||||||
|
Git.
|
||||||
|
"""
|
||||||
|
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("init")
def project_init_cli(
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
):
    """Initialize a project directory with DVC and optionally Git. This should
    typically be taken care of automatically when you run the "project clone"
    command, but you can also run it separately. If the project is intended to
    be a Git repo, it should be initialized with Git first, before initializing
    DVC. This allows DVC to integrate with Git.
    """
    # The docstring above doubles as the CLI help text. The CLI always asks
    # for a silent init.
    init_options = {"git": git, "silent": True}
    project_init(path, **init_options)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    # fmt: on
):
    """Use DVC (Data Version Control) to fetch the assets for the project,
    defined in the "assets" section of the project config. If possible, DVC
    will try to track the files so you can pull changes from upstream. It will
    also try and store the checksum so the assets are versioned. If the file
    can't be tracked or checked, it will be downloaded using curl. If a checksum
    is provided in the project config, the file is only downloaded if no local
    file with the same checksum exists.
    """
    # The docstring above doubles as the CLI help text, so keep it user-facing
    project_assets(project_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
    "run-all",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run all commands defined in the project. This command will use DVC and
    the defined outputs and dependencies in the project config to determine
    which steps need to be re-run and where to start. This means you're only
    re-generating data if the inputs have changed.

    This command calls into "dvc repro" and all additional arguments are passed
    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
    """
    # Custom --help handling: print the project-specific help and stop
    if show_help:
        print_run_help(project_dir)
        return
    # Unknown/extra CLI arguments collected by the context are forwarded
    extra_args = ctx.args
    project_run_all(project_dir, *extra_args)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(None, help="Name of command defined in project config"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script defined in the project config. If the command is
    part of the default pipeline defined in the "run" section, DVC is used to
    determine whether the step should re-run if its inputs have changed, or
    whether everything is up to date. If the script is not part of the default
    pipeline, it will be called separately without DVC.

    If DVC is used, the command calls into "dvc repro" and all additional
    arguments are passed to the "dvc repro" command:
    https://dvc.org/doc/command-reference/repro
    """
    # Only run when a subcommand was given and help wasn't requested
    if subcommand and not show_help:
        extra_args = ctx.args
        project_run(project_dir, subcommand, *extra_args)
        return
    # Otherwise show the help, for the subcommand if one was provided
    print_run_help(project_dir, subcommand)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("exec", hidden=True)
def project_exec_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(..., help="Name of command defined in project config"),
    # fmt: on
):
    """Execute a command defined in the project config. This CLI command is
    only called internally in auto-generated DVC pipelines, as a shortcut for
    multi-step commands in the project config. You typically shouldn't have to
    call it yourself. To run a command, call "run" or "run-all".
    """
    # Registered with hidden=True; simply delegates to the implementation
    exec_args = (project_dir, subcommand)
    project_exec(*exec_args)
|
||||||
|
|
||||||
|
|
||||||
|
@project_cli.command("update-dvc")
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Update the auto-generated DVC config file. Uses the steps defined in the
    "run" section of the project config. This typically happens automatically
    when running a command, but can also be triggered manually if needed.
    """
    project_config = load_project_config(project_dir)
    # update_dvc_config reports whether the DVC config file was rewritten
    if update_dvc_config(project_dir, project_config, verbose=verbose, force=force):
        msg.good(f"Updated DVC config from {CONFIG_FILE}")
        return
    msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
||||||
|
|
||||||
|
|
||||||
|
app.add_typer(project_cli, name="project")
|
||||||
|
|
||||||
|
|
||||||
|
#################
|
||||||
|
# CLI FUNCTIONS #
|
||||||
|
#################
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
|
def project_clone(
|
||||||
|
@ -84,51 +209,55 @@ def project_clone(
|
||||||
repo: str = about.__projects__,
|
repo: str = about.__projects__,
|
||||||
git: bool = False,
|
git: bool = False,
|
||||||
no_init: bool = False,
|
no_init: bool = False,
|
||||||
silent: bool = False,
|
|
||||||
verbose: bool = False,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Clone a project template from a repository.
|
||||||
|
|
||||||
|
name (str): Name of subdirectory to clone.
|
||||||
|
dest (Path): Destination path of cloned project.
|
||||||
|
repo (str): URL of Git repo containing project templates.
|
||||||
|
git (bool): Initialize project as Git repo. Should be set to True if project
|
||||||
|
is intended as a repo, since it will allow DVC to integrate with Git.
|
||||||
|
no_init (bool): Don't initialize DVC and Git automatically. If True, the
|
||||||
|
"init" command or "git init" and "dvc init" need to be run manually.
|
||||||
|
"""
|
||||||
dest = ensure_path(dest)
|
dest = ensure_path(dest)
|
||||||
check_clone_dest(dest)
|
check_clone(name, dest, repo)
|
||||||
# When cloning a subdirectory with DVC, it will create a folder of that name
|
project_dir = dest.resolve()
|
||||||
# within the destination dir, so we use a tempdir and then copy it into the
|
# We're using Git and sparse checkout to only clone the files we need
|
||||||
# parent directory to create the cloned directory
|
|
||||||
dest = dest.resolve()
|
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir)]
|
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||||
if verbose:
|
run_command(shlex.split(cmd))
|
||||||
cmd.append("--verbose")
|
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||||
if silent:
|
f.write(name)
|
||||||
cmd.append("--quiet")
|
run_command(["git", "-C", tmp_dir, "fetch"])
|
||||||
print(" ".join(cmd))
|
run_command(["git", "-C", tmp_dir, "checkout"])
|
||||||
run_command(cmd)
|
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
|
||||||
shutil.move(str(tmp_dir / Path(name).name), str(dest))
|
|
||||||
msg.good(f"Cloned project '{name}' from {repo}")
|
msg.good(f"Cloned project '{name}' from {repo}")
|
||||||
for sub_dir in DIRS:
|
for sub_dir in DIRS:
|
||||||
dir_path = dest / sub_dir
|
dir_path = project_dir / sub_dir
|
||||||
if not dir_path.exists():
|
if not dir_path.exists():
|
||||||
dir_path.mkdir(parents=True)
|
dir_path.mkdir(parents=True)
|
||||||
if not no_init:
|
if not no_init:
|
||||||
project_init(dest, git=git, silent=silent)
|
project_init(project_dir, git=git, silent=True)
|
||||||
msg.good(f"Your project is now ready!", dest)
|
msg.good(f"Your project is now ready!", dest)
|
||||||
print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}")
|
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("init")
|
|
||||||
def project_init_cli(
|
|
||||||
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
|
||||||
):
|
|
||||||
"""Initialize a project directory with DVC and Git (optional). This should
|
|
||||||
typically be taken care of automatically when you run the "project clone"
|
|
||||||
command.
|
|
||||||
"""
|
|
||||||
project_init(path, git=git, silent=True)
|
|
||||||
|
|
||||||
|
|
||||||
def project_init(
|
def project_init(
|
||||||
dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False
|
project_dir: Path,
|
||||||
|
*,
|
||||||
|
git: bool = False,
|
||||||
|
silent: bool = False,
|
||||||
|
analytics: bool = False,
|
||||||
):
|
):
|
||||||
with working_dir(dest):
|
"""Initialize a project as a DVC and (optionally) as a Git repo.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
git (bool): Also call "git init" to initialize directory as a Git repo.
|
||||||
|
silent (bool): Don't print any output (via DVC).
|
||||||
|
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
||||||
|
"""
|
||||||
|
with working_dir(project_dir):
|
||||||
init_cmd = ["dvc", "init"]
|
init_cmd = ["dvc", "init"]
|
||||||
if silent:
|
if silent:
|
||||||
init_cmd.append("--quiet")
|
init_cmd.append("--quiet")
|
||||||
|
@ -137,25 +266,20 @@ def project_init(
|
||||||
if git:
|
if git:
|
||||||
run_command(["git", "init"])
|
run_command(["git", "init"])
|
||||||
run_command(init_cmd)
|
run_command(init_cmd)
|
||||||
|
# We don't want to have analytics on by default – our users should
|
||||||
|
# opt-in explicitly. If they want it, they can always enable it.
|
||||||
if not analytics:
|
if not analytics:
|
||||||
# TODO: find a better solution for this?
|
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
run_command(["dvc", "config", "core.analytics", "false"])
|
||||||
config = load_project_config(dest)
|
config = load_project_config(project_dir)
|
||||||
setup_check_dvc(dest, config)
|
setup_check_dvc(project_dir, config)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
def project_assets(project_dir: Path) -> None:
|
||||||
def project_assets_cli(
|
"""Fetch assets for a project using DVC if possible.
|
||||||
# fmt: off
|
|
||||||
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Use Data Version Control to get the assets for the project."""
|
|
||||||
project_assets(path)
|
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
def project_assets(project_path: Path) -> None:
|
"""
|
||||||
project_path = ensure_path(project_path)
|
project_path = ensure_path(project_dir)
|
||||||
config = load_project_config(project_path)
|
config = load_project_config(project_path)
|
||||||
setup_check_dvc(project_path, config)
|
setup_check_dvc(project_path, config)
|
||||||
assets = config.get("assets", {})
|
assets = config.get("assets", {})
|
||||||
|
@ -172,7 +296,17 @@ def project_assets(project_path: Path) -> None:
|
||||||
def fetch_asset(
|
def fetch_asset(
|
||||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
check_asset(url)
|
"""Fetch an asset from a given URL or path. Will try to import the file
|
||||||
|
using DVC's import-url if possible (fully tracked and versioned) and falls
|
||||||
|
back to get-url (versioned) and a non-DVC download if necessary. If a
|
||||||
|
checksum is provided and a local file exists, it's only re-downloaded if the
|
||||||
|
checksum doesn't match.
|
||||||
|
|
||||||
|
project_path (Path): Path to project directory.
|
||||||
|
url (str): URL or path to asset.
|
||||||
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
"""
|
||||||
|
url = convert_asset_url(url)
|
||||||
dest_path = (project_path / dest).resolve()
|
dest_path = (project_path / dest).resolve()
|
||||||
if dest_path.exists() and checksum:
|
if dest_path.exists() and checksum:
|
||||||
# If there's already a file, check for checksum
|
# If there's already a file, check for checksum
|
||||||
|
@ -185,12 +319,13 @@ def fetch_asset(
|
||||||
# If these fail, we don't want to output an error or info message.
|
# If these fail, we don't want to output an error or info message.
|
||||||
# Try with tracking the source first, then just downloading with
|
# Try with tracking the source first, then just downloading with
|
||||||
# DVC, then a regular non-DVC download.
|
# DVC, then a regular non-DVC download.
|
||||||
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
try:
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
|
||||||
except subprocess.CalledProcessError:
|
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
except subprocess.CalledProcessError:
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
||||||
run_command(["dvc", "add", str(dest_path)])
|
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
||||||
|
run_command(["dvc", "add", str(dest_path)])
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
# TODO: replace curl
|
# TODO: replace curl
|
||||||
run_command(["curl", url, "--output", str(dest_path), "--progress-bar"])
|
run_command(["curl", url, "--output", str(dest_path), "--progress-bar"])
|
||||||
|
@ -200,25 +335,12 @@ def fetch_asset(
|
||||||
msg.good(f"Fetched asset {dest}")
|
msg.good(f"Fetched asset {dest}")
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run-all",
|
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_all_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run all commands. Additional arguments are passed to dvc repro."""
|
|
||||||
if show_help:
|
|
||||||
print_run_help(project_dir)
|
|
||||||
else:
|
|
||||||
project_run_all(project_dir, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
|
"""Run all commands defined in the project using DVC.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
*dvc_args: Other arguments passed to "dvc repro".
|
||||||
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
setup_check_dvc(project_dir, config)
|
setup_check_dvc(project_dir, config)
|
||||||
dvc_cmd = ["dvc", "repro", *dvc_args]
|
dvc_cmd = ["dvc", "repro", *dvc_args]
|
||||||
|
@ -226,27 +348,16 @@ def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
run_command(dvc_cmd)
|
run_command(dvc_cmd)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
|
||||||
)
|
|
||||||
def project_run_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context,
|
|
||||||
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
|
||||||
subcommand: str = Arg(None, help="Name of command defined in project config"),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run scripts defined in the project."""
|
|
||||||
if show_help or not subcommand:
|
|
||||||
print_run_help(project_dir, subcommand)
|
|
||||||
else:
|
|
||||||
project_run(project_dir, subcommand, *ctx.args)
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
"""Simulate a CLI help prompt using the info available in the project config."""
|
"""Simulate a CLI help prompt using the info available in the project config.
|
||||||
|
|
||||||
|
project_dir (Path): The project directory.
|
||||||
|
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||||
|
provided, the subcommand help is shown. Otherwise, the top-level help
|
||||||
|
and a list of available commands is printed.
|
||||||
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
|
setup_check_dvc(project_dir, config)
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
if subcommand:
|
if subcommand:
|
||||||
|
@ -260,9 +371,20 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
print(f"\nAvailable commands in {CONFIG_FILE}")
|
print(f"\nAvailable commands in {CONFIG_FILE}")
|
||||||
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
|
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||||
|
msg.text("Run all commands defined in the 'run' block of the project config:")
|
||||||
|
print(f"{COMMAND} project run-all {project_dir}")
|
||||||
|
|
||||||
|
|
||||||
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
|
"""Run a named script defined in the project config. If the script is part
|
||||||
|
of the default pipeline (defined in the "run" section), DVC is used to
|
||||||
|
execute the command, so it can determine whether to rerun it. It then
|
||||||
|
calls into "exec" to execute it.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
subcommand (str): Name of command to run.
|
||||||
|
*dvc_args: Other arguments passed to "dvc repro".
|
||||||
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
setup_check_dvc(project_dir, config)
|
setup_check_dvc(project_dir, config)
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
|
@ -286,18 +408,12 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
run_commands(cmd["script"], variables)
|
run_commands(cmd["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("exec")
|
|
||||||
def project_exec_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
|
||||||
subcommand: str = Arg(..., help="Name of command defined in project config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Internals"""
|
|
||||||
project_exec(project_dir, subcommand)
|
|
||||||
|
|
||||||
|
|
||||||
def project_exec(project_dir: Path, subcommand: str):
|
def project_exec(project_dir: Path, subcommand: str):
|
||||||
|
"""Execute a command defined in the project config.
|
||||||
|
|
||||||
|
project_dir (Path): Path to project directory.
|
||||||
|
subcommand (str): Name of command to run.
|
||||||
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
|
@ -306,26 +422,17 @@ def project_exec(project_dir: Path, subcommand: str):
|
||||||
run_commands(commands[subcommand]["script"], variables)
|
run_commands(commands[subcommand]["script"], variables)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("update-dvc")
|
###########
|
||||||
def project_update_dvc_cli(
|
# HELPERS #
|
||||||
# fmt: off
|
###########
|
||||||
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
|
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from {CONFIG_FILE}")
|
|
||||||
else:
|
|
||||||
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
|
|
||||||
|
|
||||||
|
|
||||||
app.add_typer(project_cli, name="project")
|
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
def load_project_config(path: Path) -> Dict[str, Any]:
|
||||||
|
"""Load the project config file from a directory and validate it.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
RETURNS (Dict[str, Any]): The loaded project config.
|
||||||
|
"""
|
||||||
config_path = path / CONFIG_FILE
|
config_path = path / CONFIG_FILE
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
msg.fail("Can't find project config", config_path, exits=1)
|
||||||
|
@ -343,8 +450,17 @@ def update_dvc_config(
|
||||||
silent: bool = False,
|
silent: bool = False,
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Re-run the DVC commands in dry mode and update dvc.yml file in the
|
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||||
project directory. The file is auto-generated based on the config.
|
project directory. The file is auto-generated based on the config. The
|
||||||
|
first line of the auto-generated file specifies the hash of the config
|
||||||
|
dict, so if any of the config values change, the DVC config is regenerated.
|
||||||
|
|
||||||
|
path (Path): The path to the project directory.
|
||||||
|
config (Dict[str, Any]): The loaded project config.
|
||||||
|
verbose (bool): Whether to print additional info (via DVC).
|
||||||
|
silent (bool): Don't output anything (via DVC).
|
||||||
|
force (bool): Force update, even if hashes match.
|
||||||
|
RETURNS (bool): Whether the DVC config file was updated.
|
||||||
"""
|
"""
|
||||||
config_hash = get_hash(config)
|
config_hash = get_hash(config)
|
||||||
path = path.resolve()
|
path = path.resolve()
|
||||||
|
@ -392,11 +508,40 @@ def update_dvc_config(
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None:
|
def ensure_dvc() -> None:
|
||||||
if not (project_path / ".dvc").exists():
|
"""Ensure that the "dvc" command is available and show an error if not."""
|
||||||
msg.fail("Project not initialized as a DVC project", exits=1)
|
try:
|
||||||
|
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
||||||
|
except Exception:
|
||||||
|
msg.fail(
|
||||||
|
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
|
||||||
|
"You can install the Python package from pip (pip install dvc) or "
|
||||||
|
"conda (conda install -c conda-forge dvc). For more details, see the "
|
||||||
|
"documentation: https://dvc.org/doc/install",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
|
||||||
|
"""Check that the project is set up correctly with DVC and update its
|
||||||
|
config if needed. Will raise an error if the project is not an initialized
|
||||||
|
DVC project.
|
||||||
|
|
||||||
|
project_dir (Path): The path to the project directory.
|
||||||
|
config (Dict[str, Any]): The loaded project config.
|
||||||
|
"""
|
||||||
|
if not project_dir.exists():
|
||||||
|
msg.fail(f"Can't find project directory: {project_dir}")
|
||||||
|
if not (project_dir / ".dvc").exists():
|
||||||
|
msg.fail(
|
||||||
|
"Project not initialized as a DVC project.",
|
||||||
|
f"Make sure that the project template was cloned correctly. To "
|
||||||
|
f"initialize the project directory manually, you can run: "
|
||||||
|
f"{COMMAND} project init {project_dir}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
with msg.loading("Updating DVC config..."):
|
with msg.loading("Updating DVC config..."):
|
||||||
updated = update_dvc_config(project_path, config, silent=True)
|
updated = update_dvc_config(project_dir, config, silent=True)
|
||||||
if updated:
|
if updated:
|
||||||
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
|
||||||
|
|
||||||
|
@ -404,6 +549,14 @@ def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None:
|
||||||
def run_commands(
|
def run_commands(
|
||||||
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Run a sequence of commands in a subprocess, in order.
|
||||||
|
|
||||||
|
commands (List[str]): The split commands.
|
||||||
|
variables (Dict[str, str]): Dictionary of variable names, mapped to their
|
||||||
|
values. Will be used to substitute format string variables in the
|
||||||
|
commands.
|
||||||
|
silent (bool): Don't print the commands.
|
||||||
|
"""
|
||||||
for command in commands:
|
for command in commands:
|
||||||
# Substitute variables, e.g. "./{NAME}.json"
|
# Substitute variables, e.g. "./{NAME}.json"
|
||||||
command = command.format(**variables)
|
command = command.format(**variables)
|
||||||
|
@ -418,21 +571,44 @@ def run_commands(
|
||||||
run_command(command)
|
run_command(command)
|
||||||
|
|
||||||
|
|
||||||
def check_asset(url: str) -> None:
|
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. Regular GitHub page URLs
    are rewritten to their raw.githubusercontent.com equivalent and a warning
    is shown; all other URLs are returned unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake.
    # Raw pattern with an escaped dot and a host boundary, so look-alike hosts
    # such as "githubXcom" or "github.community" are not treated as GitHub.
    if re.match(r"https?://github\.com(?:[/?#]|$)", url):
        # Only rewrite the host and the first tree/blob segment: later
        # occurrences belong to the file path inside the repo.
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
|
||||||
|
|
||||||
|
|
||||||
def check_clone_dest(dest: Path) -> None:
|
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
"""Check and validate that the destination path can be used to clone."""
|
"""Check and validate that the destination path can be used to clone. Will
|
||||||
|
check that Git is available and that the destination path is suitable.
|
||||||
|
|
||||||
|
name (str): Name of the directory to clone from the repo.
|
||||||
|
dest (Path): Local destination of cloned directory.
|
||||||
|
repo (str): URL of the repo to clone from.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
||||||
|
except Exception:
|
||||||
|
msg.fail(
|
||||||
|
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||||
|
f"To clone a project without Git, copy the files from the '{name}' "
|
||||||
|
f"directory in the {repo} to {dest} manually and then run:",
|
||||||
|
f"{COMMAND} project init {dest}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
if not dest:
|
if not dest:
|
||||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
|
@ -444,12 +620,3 @@ def check_clone_dest(dest: Path) -> None:
|
||||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_hash(data) -> str:
|
|
||||||
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
|
|
||||||
return hashlib.md5(data_str).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_checksum(path: Path) -> str:
|
|
||||||
return hashlib.md5(path.read_bytes()).hexdigest()
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import subprocess
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -459,11 +460,35 @@ def working_dir(path: Union[str, Path]) -> None:
|
||||||
|
|
||||||
@contextmanager
def make_tempdir():
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block, even if the block raises.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Without the finally, an exception inside the with block would skip
        # the cleanup and leak the directory.
        shutil.rmtree(str(d))
|
||||||
|
|
||||||
|
|
||||||
|
def get_hash(data) -> str:
    """Compute a hash for a JSON-serializable object.

    data: The data to hash.
    RETURNS (str): The hex digest of the MD5 hash.
    """
    # Keys are sorted during serialization so the dumped string (and with it
    # the hash) doesn't depend on key order.
    serialized = srsly.json_dumps(data, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file given its file path.

    path (Union[Path, str]): The file path.
    RETURNS (str): The checksum (hex digest of the MD5 hash of the file's
        bytes).
    """
    md5 = hashlib.md5()
    # Hash in fixed-size chunks so large files are never loaded into memory
    # in one piece. The digest is identical to hashing all bytes at once.
    with Path(path).open("rb") as file_:
        for chunk in iter(lambda: file_.read(65536), b""):
            md5.update(chunk)
    return md5.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def is_in_jupyter():
|
def is_in_jupyter():
|
||||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||||
IPython kernel. Mainly used for the displaCy visualizer.
|
IPython kernel. Mainly used for the displaCy visualizer.
|
||||||
|
|
Loading…
Reference in New Issue