spaCy/spacy/cli/init_config.py

from typing import Optional, List, Tuple
from enum import Enum
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
import srsly
import re

from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND


# Directory with the bundled quickstart templates, located next to this module
ROOT = Path(__file__).parent / "templates"
# Jinja template that init_config renders into a base training config
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
# Recommendations looked up by language code (see RECOMMENDATIONS.get(lang, {})
# in init_config). The YAML file is read once, when this module is imported.
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")


class Optimizations(str, Enum):
    """Allowed values for the --optimize option of the "init config" command."""

    # The class also subclasses str; init_config_cli unwraps members to their
    # plain string value via .value before passing them on.
    efficiency = "efficiency"
    accuracy = "accuracy"


@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    # fmt: on
):
    """
    Generate a starter config.cfg for training. Based on your requirements
    specified via the CLI arguments, this command generates a config with the
    optimal settings for your use case. This includes the choice of
    architecture, pretrained weights and related hyperparameters.
    """
    if isinstance(optimize, Optimizations):  # instance of enum from the CLI
        optimize = optimize.value
    # Split the comma-separated option into component names. Empty entries
    # (from a trailing or doubled comma, or an empty string) are dropped so
    # that no component with an empty name is handed to the template.
    stripped_names = (name.strip() for name in pipeline.split(","))
    pipeline = [name for name in stripped_names if name]
    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)


@init_cli.command("fill-config")
def init_fill_config_cli(
    # fmt: off
    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
    # fmt: on
):
    """
    Fill partial config.cfg with default values. Will add all missing settings
    from the default config and will create all objects, check the registered
    functions for their default values and update the base config. This command
    can be used with a config generated via the training quickstart widget:
    https://nightly.spacy.io/usage/training#quickstart
    """
    # Hand over to fill_config, which does the actual work. Note the argument
    # order: the CLI takes the base config first, while fill_config expects
    # the output file first.
    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)


def fill_config(
    output_file: Path, base_path: Path, *, pretraining: bool = False, diff: bool = False
) -> Tuple[Config, Config]:
    """Auto-fill a partial config and save the result.

    output_file (Path): Where to save the filled config, or "-" to print it
        to stdout. In stdout mode the printer is created with no_print=True
        and the diff is not shown.
    base_path (Path): Path of the base config to load and fill.
    pretraining (bool): Also merge the filled config with the default
        pretraining config.
    diff (bool): Print a visual diff between the base config and the filled
        config (only when not writing to stdout).
    RETURNS (Tuple[Config, Config]): The base config as loaded from base_path
        and the filled config that was saved.
    """
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
    with show_validation_error(hint_fill=False):
        config = util.load_config(base_path)
        nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)
    # Load a second time with validation to be extra sure that the produced
    # config result is a valid config
    nlp, _ = util.load_model_from_config(nlp.config)
    filled = nlp.config
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        # Merge the filled config into the default pretraining config
        pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
        filled = pretrain_config.merge(filled)
    # Compare the serialized forms to find out whether anything was filled in
    before = config.to_str()
    after = filled.to_str()
    unchanged = before == after
    if unchanged:
        msg.warn("Nothing to auto-fill: base config is already complete")
    else:
        msg.good("Auto-filled config with all values")
    if diff and not is_stdout:
        if unchanged:
            msg.warn("No diff to show: nothing was auto-filled")
        else:
            msg.divider("START CONFIG DIFF")
            print("")
            print(diff_strings(before, after))
            msg.divider("END CONFIG DIFF")
            print("")
    save_config(filled, output_file, is_stdout=is_stdout)
    # Honor the annotated return type: previously the function fell off the
    # end and returned None despite declaring a tuple of configs.
    return config, filled


def init_config(
    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
) -> None:
    """Render the quickstart template for the given settings, auto-fill the
    resulting config and save it.

    output_file (Path): Where to save the config, or "-" to print it to
        stdout. In stdout mode the printer is created with no_print=True.
    lang (str): Language code, used for the template and to look up the
        recommendations.
    pipeline (List[str]): Names of the pipeline components to include.
        "tok2vec" and "transformer" are removed, since the template adds them.
    optimize (str): Value passed to the template as "optimize".
    cpu (bool): If True, the template's "hardware" variable is "cpu",
        otherwise "gpu".
    """
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
    # jinja2 is imported lazily so that it is only needed for this command
    try:
        from jinja2 import Template
    except ImportError:
        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
    # Read the template with an explicit encoding instead of relying on the
    # platform default, which differs between systems
    with TEMPLATE_PATH.open("r", encoding="utf8") as f:
        template = Template(f.read())
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
    # Languages without an entry fall back to the schema defaults
    reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": "cpu" if cpu else "gpu",
        "transformer_data": reco["transformer"],
        "word_vectors": reco["word_vectors"],
        "has_letters": reco["has_letters"],
    }
    if variables["transformer_data"] and not has_spacy_transformers():
        msg.warn(
            "To generate a more effective transformer-based config (GPU-only), "
            "install the spacy-transformers package and re-run this command. "
            "The config generated now does not use transformers."
        )
        # Render the template without transformer settings
        variables["transformer_data"] = None
    base_template = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
    # Access variables declared in templates
    template_vars = template.make_module(variables)
    # Summary of the settings, printed before the config is filled
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
        "Transformer": template_vars.transformer.get("name", False),
    }
    msg.info("Generated template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")
    with show_validation_error(hint_fill=False):
        config = util.load_config_from_str(base_template)
        nlp, _ = util.load_model_from_config(config, auto_fill=True)
    msg.good("Auto-filled config with all values")
    save_config(nlp.config, output_file, is_stdout=is_stdout)


def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:
    """Write the config to stdout or to disk.

    config (Config): The config to output.
    output_file (Path): Target file; only used when is_stdout is False.
    is_stdout (bool): Print the serialized config instead of saving it. In
        that case nothing else is printed.
    """
    printer = Printer(no_print=is_stdout)
    if is_stdout:
        print(config.to_str())
        return
    config.to_disk(output_file, interpolate=False)
    printer.good("Saved config", output_file)
    printer.text("You can now add your data and train your model:")
    # Example command line, using only the last component of the output path
    path_overrides = " ".join(
        ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
    )
    file_name = output_file.parts[-1]
    print(f"{COMMAND} train {file_name} {path_overrides}")


def has_spacy_transformers() -> bool:
    """Check whether the spacy_transformers module can be imported.

    RETURNS (bool): True if the import succeeds, False if it raises
        ImportError.
    """
    try:
        import spacy_transformers  # noqa: F401
    except ImportError:
        return False
    return True


def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
    """Warn if the pipeline has no component named "tok2vec".

    config (Config): The config; its ["nlp"]["pipeline"] entry is checked.
    msg (Printer): Printer used to emit the warning.
    """
    pipe_names = config["nlp"]["pipeline"]
    if "tok2vec" in pipe_names:
        return
    warning = (
        "No tok2vec component found in the pipeline. If your tok2vec "
        "component has a different name, you may need to adjust the "
        "tok2vec_model reference in the [pretraining] block. If you don't "
        "have a tok2vec component, make sure to add it to your [components] "
        "and the pipeline specified in the [nlp] block, so you can pretrain "
        "weights for it."
    )
    msg.warn(warning)
Add init fill-config 2020-08-14 14:49:26 +00:00			`from typing import Optional, List, Tuple`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`from enum import Enum`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00			`from pathlib import Path`
Add init fill-config 2020-08-14 14:49:26 +00:00			`from wasabi import Printer, diff_strings`
			`from thinc.api import Config`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`import srsly`
			`import re`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00
Update Thinc and include section order 2020-08-14 12:06:22 +00:00			`from .. import util`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`from ..language import DEFAULT_CONFIG_PRETRAIN_PATH`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 11:33:15 +00:00			`from ..schemas import RecommendationSchema`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND`


Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 11:33:15 +00:00			`ROOT = Path(__file__).parent / "templates"`
			`TEMPLATE_PATH = ROOT / "quickstart_training.jinja"`
			`RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00

			`class Optimizations(str, Enum):`
			`efficiency = "efficiency"`
			`accuracy = "accuracy"`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00

			`@init_cli.command("config")`
			`def init_config_cli(`
			`# fmt: off`
Small wording adjustments [ci skip] 2020-08-21 10:06:19 +00:00			`output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),`
			`pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),`
			`optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),`
			`cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00			`# fmt: on`
			`):`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`"""`
			`Generate a starter config.cfg for training. Based on your requirements`
			`specified via the CLI arguments, this command generates a config with the`
			`optimal settings for you use case. This includes the choice of architecture,`
			`pretrained weights and related hyperparameters.`
			`"""`
			`if isinstance(optimize, Optimizations): # instance of enum from the CLI`
			`optimize = optimize.value`
			`pipeline = [p.strip() for p in pipeline.split(",")]`
			`init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00

Add init fill-config 2020-08-14 14:49:26 +00:00			`@init_cli.command("fill-config")`
			`def init_fill_config_cli(`
			`# fmt: off`
			`base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),`
			`output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),`
Add init fill-config 2020-08-14 14:49:26 +00:00			`diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")`
			`# fmt: on`
			`):`
			`"""`
			`Fill partial config.cfg with default values. Will add all missing settings`
			`from the default config and will create all objects, check the registered`
			`functions for their default values and update the base config. This command`
			`can be used with a config generated via the training quickstart widget:`
			`https://nightly.spacy.io/usage/training#quickstart`
			`"""`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`fill_config(output_file, base_path, pretraining=pretraining, diff=diff)`
Add init fill-config 2020-08-14 14:49:26 +00:00

			`def fill_config(`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`output_file: Path, base_path: Path, *, pretraining: bool = False, diff: bool = False`
Add init fill-config 2020-08-14 14:49:26 +00:00			`) -> Tuple[Config, Config]:`
			`is_stdout = str(output_file) == "-"`
			`msg = Printer(no_print=is_stdout)`
			`with show_validation_error(hint_fill=False):`
Fix CLI consistency [ci skip] 2020-08-16 13:46:29 +00:00			`config = util.load_config(base_path)`
Support removing extra values in fill-config (#5966) * Support removing extra values in fill-config * Fix test 2020-08-24 20:53:47 +00:00			`nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)`
			`# Load a second time with validation to be extra sure that the produced`
			`# config result is a valid config`
			`nlp, _ = util.load_model_from_config(nlp.config)`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`filled = nlp.config`
			`if pretraining:`
			`validate_config_for_pretrain(filled, msg)`
			`pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)`
			`filled = pretrain_config.merge(filled)`
Show warnings if there's nothing to auto-fill 2020-08-16 12:19:43 +00:00			`before = config.to_str()`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`after = filled.to_str()`
Show warnings if there's nothing to auto-fill 2020-08-16 12:19:43 +00:00			`if before == after:`
			`msg.warn("Nothing to auto-fill: base config is already complete")`
			`else:`
			`msg.good("Auto-filled config with all values")`
Add init fill-config 2020-08-14 14:49:26 +00:00			`if diff and not is_stdout:`
Show warnings if there's nothing to auto-fill 2020-08-16 12:19:43 +00:00			`if before == after:`
			`msg.warn("No diff to show: nothing was auto-filled")`
			`else:`
			`msg.divider("START CONFIG DIFF")`
			`print("")`
			`print(diff_strings(before, after))`
			`msg.divider("END CONFIG DIFF")`
			`print("")`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00			`save_config(filled, output_file, is_stdout=is_stdout)`
Add init fill-config 2020-08-14 14:49:26 +00:00

Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00			`def init_config(`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool`
			`) -> None:`
			`is_stdout = str(output_file) == "-"`
			`msg = Printer(no_print=is_stdout)`
			`try:`
			`from jinja2 import Template`
			`except ImportError:`
			`msg.fail("This command requires jinja2", "pip install jinja2", exits=1)`
			`with TEMPLATE_PATH.open("r") as f:`
			`template = Template(f.read())`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 11:33:15 +00:00			`# Filter out duplicates since tok2vec and transformer are added by template`
			`pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]`
			`reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`variables = {`
			`"lang": lang,`
			`"components": pipeline,`
			`"optimize": optimize,`
			`"hardware": "cpu" if cpu else "gpu",`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`"transformer_data": reco["transformer"],`
			`"word_vectors": reco["word_vectors"],`
Update init config and recommendations - As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies 2020-08-19 11:33:15 +00:00			`"has_letters": reco["has_letters"],`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`}`
Small wording adjustments [ci skip] 2020-08-21 10:06:19 +00:00			`if variables["transformer_data"] and not has_spacy_transformers():`
			`msg.warn(`
			`"To generate a more effective transformer-based config (GPU-only), "`
			`"install the spacy-transformers package and re-run this command. "`
			`"The config generated now does not use transformers."`
			`)`
			`variables["transformer_data"] = None`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`base_template = template.render(variables).strip()`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`# Giving up on getting the newlines right in jinja for now`
			`base_template = re.sub(r"\n\n\n+", "\n\n", base_template)`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`# Access variables declared in templates`
			`template_vars = template.make_module(variables)`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`use_case = {`
			`"Language": lang,`
			`"Pipeline": ", ".join(pipeline),`
			`"Optimize for": optimize,`
			`"Hardware": variables["hardware"].upper(),`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`"Transformer": template_vars.transformer.get("name", False),`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`}`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`msg.info("Generated template specific for your use case")`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`for label, value in use_case.items():`
			`msg.text(f"- {label}: {value}")`
Add init fill-config 2020-08-14 14:49:26 +00:00			`with show_validation_error(hint_fill=False):`
Update quickstart, template and docs 2020-08-15 12:50:29 +00:00			`config = util.load_config_from_str(base_template)`
Fix CLI consistency [ci skip] 2020-08-16 13:46:29 +00:00			`nlp, _ = util.load_model_from_config(config, auto_fill=True)`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`msg.good("Auto-filled config with all values")`
Add init fill-config 2020-08-14 14:49:26 +00:00			`save_config(nlp.config, output_file, is_stdout=is_stdout)`


Update type hints 2020-08-16 12:19:33 +00:00			`def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> None:`
Add init fill-config 2020-08-14 14:49:26 +00:00			`msg = Printer(no_print=is_stdout)`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`if is_stdout:`
Add init fill-config 2020-08-14 14:49:26 +00:00			`print(config.to_str())`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`else:`
Add init fill-config 2020-08-14 14:49:26 +00:00			`config.to_disk(output_file, interpolate=False)`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`msg.good("Saved config", output_file)`
			`msg.text("You can now add your data and train your model:")`
			`variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]`
			`print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")`
Add init CLI and init config (#5854) * Add init CLI and init config draft * Improve config validation * Auto-format * Don't export anything in debug config * Update docs 2020-08-02 13:18:30 +00:00

Small wording adjustments [ci skip] 2020-08-21 10:06:19 +00:00			`def has_spacy_transformers() -> bool:`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`try:`
			`import spacy_transformers # noqa: F401`
Small wording adjustments [ci skip] 2020-08-21 10:06:19 +00:00
			`return True`
Update for new Thinc and adjust config 2020-08-13 15:38:30 +00:00			`except ImportError:`
output_file required, spacy-transformers prefered instead of required 2020-08-18 11:38:43 +00:00			`return False`
Fix handling of optional [pretraining] block (#5954) * Fix handling of optional [pretraining] block * Remote pretraining from default config * Fix test * Add schema option for empty pretrain block 2020-08-24 13:56:03 +00:00

			`def validate_config_for_pretrain(config: Config, msg: Printer) -> None:`
			`if "tok2vec" not in config["nlp"]["pipeline"]:`
			`msg.warn(`
			`"No tok2vec component found in the pipeline. If your tok2vec "`
			`"component has a different name, you may need to adjust the "`
			`"tok2vec_model reference in the [pretraining] block. If you don't "`
			`"have a tok2vec component, make sure to add it to your [components] "`
			`"and the pipeline specified in the [nlp] block, so you can pretrain "`
			`"weights for it."`
			`)`