From c08b3f294cdfb07c4f75e88f955339111074d025 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Feb 2021 13:45:27 +1100 Subject: [PATCH] Support env vars and CLI overrides for project.yml --- spacy/cli/_util.py | 57 ++++++++++++++++++++++++---------- spacy/cli/project/run.py | 16 +++++++--- spacy/schemas.py | 1 + spacy/tests/test_cli.py | 17 ++++++++++ website/docs/usage/projects.md | 45 +++++++++++++++++++++------ 5 files changed, 105 insertions(+), 31 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index e66420024..86b3ab356 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -16,7 +16,7 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import is_compatible_version, ENV_VARS +from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS from .. import about if TYPE_CHECKING: @@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: value = "true" else: value = args.pop(0) - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? - try: - result[opt] = srsly.json_loads(value) - except ValueError: - result[opt] = str(value) + result[opt] = _parse_override(value) else: msg.fail(f"{err}: name should start with --", exits=1) return result -def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: +def _parse_override(value: Any) -> Any: + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? + try: + return srsly.json_loads(value) + except ValueError: + return str(value) + + +def load_project_config( + path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() +) -> Dict[str, Any]: """Load the project.yml file from a directory and validate it. Also make sure that all directories defined in the config exist. path (Path): The path to the project directory. interpolate (bool): Whether to substitute project variables. + overrides (Dict[str, Any]): Optional config overrides. RETURNS (Dict[str, Any]): The loaded project.yml. """ config_path = path / PROJECT_FILE @@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: if not dir_path.exists(): dir_path.mkdir(parents=True) if interpolate: - err = "project.yml validation error" + err = f"{PROJECT_FILE} validation error" with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config) + config = substitute_project_variables(config, overrides) return config -def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): - key = "vars" +def substitute_project_variables( + config: Dict[str, Any], + overrides: Dict[str, Any] = SimpleFrozenDict(), + key: str = "vars", + env_key: str = "env", +) -> Dict[str, Any]: + """Interpolate variables in the project file using the config system. + + config (Dict[str, Any]): The project config. 
+ overrides (Dict[str, Any]): Optional config overrides. + key (str): Key containing variables in project config. + env_key (str): Key containing environment variable mapping in project config. + RETURNS (Dict[str, Any]): The interpolated project config. + """ config.setdefault(key, {}) - config[key].update(overrides) + config.setdefault(env_key, {}) + # Substitute references to env vars with their values + for config_var, env_var in config[env_key].items(): + config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) # Need to put variables in the top scope again so we can have a top-level # section "project" (otherwise, a list of commands in the top scope wouldn't) # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key]}) + cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) + cfg = Config().from_str(cfg.to_str(), overrides=overrides) interpolated = cfg.interpolate() return dict(interpolated["project"]) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 17c881595..5339d2a21 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -3,19 +3,23 @@ from pathlib import Path from wasabi import msg import sys import srsly +import typer from ... import about from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS -from ...util import check_bool_env_var +from ...util import check_bool_env_var, SimpleFrozenDict from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash -from .._util import get_checksum, project_cli, Arg, Opt, COMMAND +from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides -@project_cli.command("run") +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +) def project_run_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), @@ -33,13 +37,15 @@ def project_run_cli( if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand, force=force, dry=dry) + overrides = parse_config_overrides(ctx.args) + project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry) def project_run( project_dir: Path, subcommand: str, *, + overrides: Dict[str, Any] = SimpleFrozenDict(), force: bool = False, dry: bool = False, capture: bool = False, @@ -59,7 +65,7 @@ def project_run( when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. 
""" - config = load_project_config(project_dir) + config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} workflows = config.get("workflows", {}) validate_subcommand(commands.keys(), workflows.keys(), subcommand) diff --git a/spacy/schemas.py b/spacy/schemas.py index d041845f3..2f25c785f 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") + env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names") assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index bfbee677a..a3834f31a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -325,6 +325,23 @@ def test_project_config_interpolation(): substitute_project_variables(project) +def test_project_config_interpolation_env(): + variables = {"a": 10} + env_var = "SPACY_TEST_FOO" + env_vars = {"foo": env_var} + commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}] + project = {"commands": commands, "vars": variables, "env": env_vars} + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 " + os.environ[env_var] = "123" + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 123" + + @pytest.mark.parametrize( "args,expected", [ diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 492345f2f..97b5b9f28 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud By default, the project will be cloned into the current working directory. You can specify an optional second argument to define the output directory. The -`--repo` option lets you define a custom repo to clone from if you don't want -to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You -can also use any private repo you have access to with Git. +`--repo` option lets you define a custom repo to clone from if you don't want to +use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can +also use any private repo you have access to with Git. ### 2. Fetch the project assets {#assets} @@ -221,6 +221,7 @@ pipelines. | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. 
|
+| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable that the key `name` is mapped to. |
 | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
 | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
 | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
 specify the destination paths and a checksum, and leave out the URL. When your
 teammates clone and run your project, they can place the files in the
 respective directory themselves. The [`project assets`](/api/cli#project-assets) command
-will alert you about missing files and mismatched checksums, so you can ensure that
-others are running your project with the same data.
+will alert you about missing files and mismatched checksums, so you can ensure
+that others are running your project with the same data.
 
 ### Dependencies and outputs {#deps-outputs}
 
@@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
 automatically. For instance, if you only run the command `train` that depends
 on data created by `preprocess` and those files are missing, spaCy will show an
 error – it won't just re-run `preprocess`. If you're looking for more advanced
-data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
-can also use `outputs_no_cache` instead of `outputs` to define outputs that
-won't be cached or tracked.
+data management, check out the [Data Version Control (DVC) integration](#dvc).
+If you're planning on integrating your spaCy project with DVC, you can also use
+`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
+or tracked.
 
 ### Files and directory structure {#project-files}
 
@@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
 `python scripts/custom_evaluation.py` with the function arguments. You can also
 use the `vars` section to define reusable variables that will be substituted in
 commands, paths and URLs. In this example, the batch size is defined as a
-variable will be added in place of `${vars.batch_size}` in the script.
+variable that will be added in place of `${vars.batch_size}` in the script. Just
+like in the [training config](/usage/training#config-overrides), you can also
+override settings on the command line – for example using `--vars.batch_size`.
 
 > #### Calling into Python
 >
@@ -491,6 +495,29 @@ commands:
       - 'corpus/eval.json'
 ```
 
+You can also use the `env` section to reference **environment variables** and
+make their values available to the commands. This can be useful for overriding
+settings on the command line and passing through system-level settings.
+ +> #### Usage example +> +> ```bash +> export GPU_ID=1 +> BATCH_SIZE=128 python -m spacy project run evaluate +> ``` + +```yaml +### project.yml +env: + batch_size: BATCH_SIZE + gpu_id: GPU_ID + +commands: + - name: evaluate + script: + - 'python scripts/custom_evaluation.py ${env.batch_size}' +``` + ### Documenting your project {#custom-docs} > #### Readme Example
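
To make the override and environment-variable handling above concrete, here is a minimal standalone sketch, not part of the patch itself, of the value parsing that `_parse_override` and the `env` loop in `substitute_project_variables` perform. It uses the stdlib `json` module in place of `srsly`, and the helper names and example variables (`GPU_ID`, `BATCH_SIZE`) are illustrative: values coming from `--vars.*` CLI flags or from mapped environment variables are parsed as JSON, falling back to plain strings, and unset environment variables resolve to empty strings.

```python
# Standalone sketch of the parsing behaviour described in the patch above.
# Not spaCy's actual module: json stands in for srsly, and the helper names
# are hypothetical.
import json
import os
from typing import Any, Dict


def parse_override(value: str) -> Any:
    """Parse a CLI/env value as JSON, falling back to a plain string."""
    try:
        return json.loads(value)
    except ValueError:
        return str(value)


def resolve_env_section(env_section: Dict[str, str]) -> Dict[str, Any]:
    """Map project config keys to the parsed values of the named env vars.

    Mirrors the loop in substitute_project_variables: unset environment
    variables become empty strings.
    """
    return {
        key: parse_override(os.environ.get(env_var, ""))
        for key, env_var in env_section.items()
    }


if __name__ == "__main__":
    os.environ["GPU_ID"] = "1"
    print(parse_override("128"))             # -> 128 (int)
    print(parse_override("[1, 2]"))          # -> [1, 2] (list)
    print(parse_override("en_core_web_sm"))  # -> "en_core_web_sm" (string fallback)
    # Assuming BATCH_SIZE is not set in the environment:
    print(resolve_env_section({"gpu_id": "GPU_ID", "batch_size": "BATCH_SIZE"}))
    # -> {"gpu_id": 1, "batch_size": ""}
```

This matches the behaviour exercised in `test_project_config_interpolation_env` above, where a script referencing an unset environment variable interpolates as an empty string until the variable is exported.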