spaCy/spacy/cli/init_config.py

178 lines
7.1 KiB
Python
Raw Normal View History

2020-08-14 14:49:26 +00:00
from typing import Optional, List, Tuple
2020-08-13 15:38:30 +00:00
from enum import Enum
from pathlib import Path
2020-08-14 14:49:26 +00:00
from wasabi import Printer, diff_strings
from thinc.api import Config
2020-08-15 12:50:29 +00:00
from pydantic import BaseModel
2020-08-13 15:38:30 +00:00
import srsly
import re
2020-08-14 12:06:22 +00:00
from .. import util
2020-08-13 15:38:30 +00:00
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
2020-08-15 12:50:29 +00:00
# Directory with the template files, located next to this module.
TEMPLATE_ROOT = Path(__file__).parent / "templates"
# Jinja template that init_config renders into the base config string.
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
# JSON file that init_config reads and looks up by language code.
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
2020-08-13 15:38:30 +00:00
class Optimizations(str, Enum):
    """Values accepted by the --optimize option of the "config" command."""

    # The str mixin makes each member compare equal to its plain string value.
    efficiency = "efficiency"
    accuracy = "accuracy"
2020-08-15 12:50:29 +00:00
class RecommendationsTrfItem(BaseModel):
    """A single transformer entry in the recommendations data."""

    # presumably the name of the pretrained transformer to use -- TODO confirm
    name: str
    # NOTE(review): the meaning of this value is not visible in this file; it
    # is handed to the template as part of "transformer_data" -- confirm there
    size_factor: int
class RecommendationsTrf(BaseModel):
    """Transformer entries, one required for each optimization target."""

    # The field names match the string values of the Optimizations enum.
    efficiency: RecommendationsTrfItem
    accuracy: RecommendationsTrfItem
class RecommendationSchema(BaseModel):
    """Recommendations for one language; both fields default to None.

    init_config builds this from the entry for the requested language in the
    recommendations JSON, or from an empty dict if the language has no entry.
    """

    # Passed to the template as "word_vectors".
    word_vectors: Optional[str] = None
    # Passed to the template as "transformer_data".
    transformer: Optional[RecommendationsTrf] = None
# NOTE(review): the docstring below presumably doubles as the command's help
# text, so it is kept user-facing; implementation notes live in comments.
@init_cli.command("config")
def init_config_cli(
    # fmt: off
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    # fmt: on
):
    """
    Generate a starter config.cfg for training. Based on your requirements
    specified via the CLI arguments, this command generates a config with the
    optimal settings for your use case. This includes the choice of architecture,
    pretrained weights and related hyperparameters.
    """
    if isinstance(optimize, Optimizations):  # instance of enum from the CLI
        optimize = optimize.value
    # Split the comma-separated value and drop empty entries, so a trailing or
    # doubled comma (or an empty value) doesn't yield a component named "".
    names = (part.strip() for part in pipeline.split(","))
    pipeline = [name for name in names if name]
    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
2020-08-14 14:49:26 +00:00
@init_cli.command("fill-config")
def init_fill_config_cli(
    # fmt: off
    base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
    # fmt: on
):
    """
    Fill partial config.cfg with default values. Will add all missing settings
    from the default config and will create all objects, check the registered
    functions for their default values and update the base config. This command
    can be used with a config generated via the training quickstart widget:
    https://nightly.spacy.io/usage/training#quickstart
    """
    # Thin CLI entry point: fill_config does the loading, filling and saving.
    fill_config(output_file=output_file, base_path=base_path, diff=diff)
def fill_config(
    output_file: Path, base_path: Path, *, diff: bool = False
) -> Tuple[Config, Config]:
    """Auto-fill a partial config with default values and save the result.

    output_file (Path): File to write the filled config to, or "-" to print it
        to stdout (which also silences all status messages).
    base_path (Path): Path of the base config to load and fill.
    diff (bool): Print a diff of the base config against the filled config.
        Skipped when the output goes to stdout.
    RETURNS (Tuple[Config, Config]): The base config as loaded, and the
        auto-filled config.
    """
    is_stdout = str(output_file) == "-"
    # Status messages would end up in the config output, so mute them here.
    msg = Printer(no_print=is_stdout)
    with show_validation_error(hint_fill=False):
        with msg.loading("Auto-filling config..."):
            config = util.load_config(base_path)
            try:
                nlp, _ = util.load_model_from_config(config, auto_fill=True)
            except ValueError as e:
                msg.fail(str(e), exits=1)
    filled = nlp.config
    msg.good("Auto-filled config with all values")
    if diff and not is_stdout:
        msg.divider("START CONFIG DIFF")
        print("")
        print(diff_strings(config.to_str(), filled.to_str()))
        msg.divider("END CONFIG DIFF")
        print("")
    save_config(filled, output_file, is_stdout=is_stdout)
    # The signature promises both configs; previously nothing was returned.
    return config, filled
def init_config(
    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
) -> None:
    """Render the quickstart template for the given use case, auto-fill the
    resulting config and save it.

    output_file (Path): File to write the config to, or "-" to print it to
        stdout (which also silences all status messages).
    lang (str): Language code, used to look up the language defaults and the
        recommendations entry.
    pipeline (List[str]): Names of the components to include. "tok2vec" and
        "transformer" are dropped because the template adds them itself.
    optimize (str): Optimization target passed to the template.
    cpu (bool): Whether the model needs to run on CPU; selects the "cpu" or
        "gpu" hardware setting passed to the template.
    """
    is_stdout = str(output_file) == "-"
    msg = Printer(no_print=is_stdout)
    try:
        from jinja2 import Template
    except ImportError:
        msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
    recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
    lang_defaults = util.get_lang_class(lang).Defaults
    has_letters = lang_defaults.writing_system.get("has_letters", True)
    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
    # Languages without an entry validate against an empty dict, so both
    # recommendation fields end up as None.
    reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
    # Read the template as UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    template = Template(TEMPLATE_PATH.read_text(encoding="utf8"))
    variables = {
        "lang": lang,
        "components": pipeline,
        "optimize": optimize,
        "hardware": "cpu" if cpu else "gpu",
        "transformer_data": reco["transformer"],
        "word_vectors": reco["word_vectors"],
        "has_letters": has_letters,
    }
    base_template = template.render(variables).strip()
    # Giving up on getting the newlines right in jinja for now
    base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
    # Access variables declared in templates
    template_vars = template.make_module(variables)
    use_case = {
        "Language": lang,
        "Pipeline": ", ".join(pipeline),
        "Optimize for": optimize,
        "Hardware": variables["hardware"].upper(),
        "Transformer": template_vars.transformer.get("name", False),
    }
    msg.info("Generated template specific for your use case")
    for label, value in use_case.items():
        msg.text(f"- {label}: {value}")
    use_transformer = bool(template_vars.use_transformer)
    if use_transformer:
        # Fail early with a clear message before trying to build the pipeline.
        require_spacy_transformers(msg)
    with show_validation_error(hint_fill=False):
        config = util.load_config_from_str(base_template)
        try:
            nlp, _ = util.load_model_from_config(config, auto_fill=True)
        except ValueError as e:
            msg.fail(str(e), exits=1)
    if use_transformer:
        nlp.config.pop("pretraining", {})  # TODO: solve this better
    msg.good("Auto-filled config with all values")
    save_config(nlp.config, output_file, is_stdout=is_stdout)
def save_config(config: Config, output_file: Path, is_stdout: bool = False):
    """Write the config to stdout or to disk.

    config (Config): The config to output.
    output_file (Path): Destination file; only used when is_stdout is False.
    is_stdout (bool): Print the config string to stdout instead of saving it.
    """
    msg = Printer(no_print=is_stdout)
    if is_stdout:
        # Emit only the config itself so the output can be piped to a file.
        print(config.to_str())
        return
    config.to_disk(output_file, interpolate=False)
    msg.good("Saved config", output_file)
    msg.text("You can now add your data and train your model:")
    # Show a ready-to-run train command using the saved file's last path part.
    config_name = output_file.parts[-1]
    overrides = " ".join(["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"])
    print(f"{COMMAND} train {config_name} {overrides}")
2020-08-14 14:49:26 +00:00
def require_spacy_transformers(msg: Printer):
    """Check that spacy-transformers can be imported, and fail if not.

    msg (Printer): Printer used to report the failure; it is called with
        exits=1 when the import fails.
    """
    try:
        # Imported only to check that it's available; the name is unused.
        import spacy_transformers  # noqa: F401
    except ImportError:
        missing_package_error = (
            "Using a transformer-based pipeline requires spacy-transformers "
            "to be installed."
        )
        msg.fail(missing_package_error, exits=1)