spaCy/spacy/cli/init_pipeline.py

122 lines
5.4 KiB
Python
Raw Normal View History

2020-09-28 13:09:59 +00:00
from typing import Optional
2020-09-28 07:47:34 +00:00
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
2020-09-28 07:47:34 +00:00
from .. import util
2020-09-29 08:58:50 +00:00
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
2020-09-28 07:47:34 +00:00
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
2020-09-28 19:17:10 +00:00
from ._util import import_code, setup_gpu
2020-09-28 09:30:18 +00:00
2020-09-29 08:58:50 +00:00
@init_cli.command("vectors")
def init_vectors_cli(
# fmt: off
lang: str = Arg(..., help="The language of the nlp object to create"),
vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
2020-09-29 14:08:39 +00:00
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
2020-09-29 08:58:50 +00:00
# fmt: on
):
2020-09-29 14:08:39 +00:00
"""Convert word vectors for use with spaCy. Will export an nlp object that
2020-10-01 10:15:53 +00:00
you can use in the [initialize] block of your config to initialize
2020-09-29 14:08:39 +00:00
a model with vectors.
"""
2020-09-29 23:22:08 +00:00
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
2020-09-29 08:58:50 +00:00
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
update_lexemes(nlp, jsonl_loc)
2020-09-29 14:08:39 +00:00
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
2020-09-29 08:58:50 +00:00
nlp.to_disk(output_dir)
msg.good(
"Saved nlp object with vectors to output directory. You can now use the "
2020-10-23 14:11:54 +00:00
"path to it in your config as the 'vectors' setting in [initialize].",
output_dir.resolve(),
2020-09-29 08:58:50 +00:00
)
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
# Mostly used for backwards-compatibility and may be removed in the future
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
2020-09-28 07:47:34 +00:00
@init_cli.command(
2020-09-28 10:46:28 +00:00
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True,
2020-09-28 07:47:34 +00:00
)
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
2020-09-28 07:47:34 +00:00
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
2020-09-28 13:09:59 +00:00
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
2020-09-28 07:47:34 +00:00
# fmt: on
):
2020-09-29 23:22:08 +00:00
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
2020-09-28 07:47:34 +00:00
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
2020-09-28 13:09:59 +00:00
setup_gpu(use_gpu)
2020-09-28 07:47:34 +00:00
with show_validation_error(config_path):
2020-09-28 08:53:17 +00:00
config = util.load_config(config_path, overrides=overrides)
2020-09-28 13:09:59 +00:00
with show_validation_error(hint_fill=False):
2020-09-29 23:22:08 +00:00
nlp = init_nlp(config, use_gpu=use_gpu)
2020-09-28 07:47:34 +00:00
nlp.to_disk(output_path)
2020-09-28 08:53:17 +00:00
msg.good(f"Saved initialized pipeline to {output_path}")
@init_cli.command(
"labels",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
2020-10-01 15:38:17 +00:00
"""Generate JSON files for the labels in the data. This helps speed up the
2020-09-29 23:22:08 +00:00
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
_init_labels(nlp, output_path)
def _init_labels(nlp, output_path):
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
2020-09-29 23:22:08 +00:00
output_file = output_path / f"{name}.json"
srsly.write_json(output_file, component.label_data)
msg.good(f"Saving label data for component '{name}' to {output_file}")
else:
msg.info(f"No label data found for component '{name}'")