spaCy/spacy/cli/init_pipeline.py

from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly

from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu


@init_cli.command("vectors")
def init_vectors_cli(
    # fmt: off
    lang: str = Arg(..., help="The language of the nlp object to create"),
    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
    output_dir: Path = Arg(..., help="Pipeline output directory"),
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    # fmt: on
):
    """Convert word vectors for use with spaCy. Will export an nlp object that
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    # The verbose flag selects DEBUG logging; the default level is INFO.
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    else:
        util.logger.setLevel(logging.INFO)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    # Look up the language class first, then build an empty pipeline from it.
    lang_cls = util.get_lang_class(lang)
    nlp = lang_cls()
    # The hidden --lexemes-jsonl option is applied before vector conversion.
    if jsonl_loc is not None:
        update_lexemes(nlp, jsonl_loc)
    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
    nlp.to_disk(output_dir)
    saved_title = (
        "Saved nlp object with vectors to output directory. You can now use the "
        "path to it in your config as the 'vectors' setting in [initialize.vocab]."
    )
    # The resolved output path is passed as the message's second argument.
    msg.good(saved_title, output_dir.resolve())


def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Apply lexeme attributes from a JSONL file to the vocab of `nlp`.

    Mostly used for backwards-compatibility and may be removed in the future.

    nlp (Language): The nlp object whose vocab entries are updated in place.
    jsonl_loc (Path): Location of the JSONL file. Each record without a
        "settings" key must provide an "orth" key, which is used to look up
        the lexeme in the vocab.
    """
    records = srsly.read_jsonl(jsonl_loc)
    for record in records:
        # Records carrying a "settings" key are skipped entirely.
        if "settings" not in record:
            # All keys of the record (including "orth") are passed through.
            nlp.vocab[record["orth"]].set_attrs(**record)


# Hidden "init nlp" command: loads a config (with CLI overrides), initializes
# the pipeline from it and saves the result to the output path.
@init_cli.command(
    "nlp",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
    hidden=True,
)
def init_pipeline_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the prepared data"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    # The verbose flag selects DEBUG logging; the default level is INFO.
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    else:
        util.logger.setLevel(logging.INFO)
    # Extra, unrecognized CLI arguments are parsed as config overrides.
    cli_overrides = parse_config_overrides(ctx.args)
    # Import the user's code file and set up the device before the config is
    # loaded and the pipeline is initialized.
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=cli_overrides)
    with show_validation_error(hint_fill=False):
        pipeline = init_nlp(cfg, use_gpu=use_gpu)
    pipeline.to_disk(output_path)
    msg.good(f"Saved initialized pipeline to {output_path}")


@init_cli.command(
    "labels",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Path = Arg(..., help="Output directory for the labels"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
):
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Create the output directory together with any missing parent
    # directories, so a nested path such as "corpus/labels" does not fail
    # with FileNotFoundError when "corpus" doesn't exist yet.
    if not output_path.exists():
        output_path.mkdir(parents=True)
    # Extra, unrecognized CLI arguments are parsed as config overrides.
    overrides = parse_config_overrides(ctx.args)
    # Import the user's code file and set up the device before the config is
    # loaded and the pipeline is initialized.
    import_code(code_path)
    setup_gpu(use_gpu)
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
    with show_validation_error(hint_fill=False):
        nlp = init_nlp(config, use_gpu=use_gpu)
    # Write one "<component name>.json" file per component that exposes label
    # data; components without a "label_data" attribute (or with it set to
    # None) are only reported.
    for name, component in nlp.pipeline:
        if getattr(component, "label_data", None) is not None:
            output_file = output_path / f"{name}.json"
            srsly.write_json(output_file, component.label_data)
            msg.good(f"Saving {name} labels to {output_file}")
        else:
            msg.info(f"No labels found for {name}")
Refactor CLI 2020-09-28 13:09:59 +00:00			`from typing import Optional`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`import logging`
			`from pathlib import Path`
			`from wasabi import msg`
			`import typer`
Move init labels to init pipeline module 2020-09-29 16:09:33 +00:00			`import srsly`
Add init_pipeline file 2020-09-28 07:47:34 +00:00
			`from .. import util`
Add init vectors 2020-09-29 08:58:50 +00:00			`from ..training.initialize import init_nlp, convert_vectors`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 14:44:17 +00:00			`from ..language import Language`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error`
Fix typos and refactor CLI logging 2020-09-28 19:17:10 +00:00			`from ._util import import_code, setup_gpu`
Update vocab init 2020-09-28 09:30:18 +00:00

Add init vectors 2020-09-29 08:58:50 +00:00			`@init_cli.command("vectors")`
			`def init_vectors_cli(`
			`# fmt: off`
			`lang: str = Arg(..., help="The language of the nlp object to create"),`
			`vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),`
			`output_dir: Path = Arg(..., help="Pipeline output directory"),`
			`prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),`
			`truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),`
			`name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),`
Fix logging 2020-09-29 14:08:39 +00:00			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 14:44:17 +00:00			`jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),`
Add init vectors 2020-09-29 08:58:50 +00:00			`# fmt: on`
			`):`
Fix logging 2020-09-29 14:08:39 +00:00			`"""Convert word vectors for use with spaCy. Will export an nlp object that`
Update docs [ci skip] 2020-10-01 10:15:53 +00:00			`you can use in the [initialize] block of your config to initialize`
Fix logging 2020-09-29 14:08:39 +00:00			`a model with vectors.`
			`"""`
Tidy up and adjust logging [ci skip] 2020-09-29 23:22:08 +00:00			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Add init vectors 2020-09-29 08:58:50 +00:00			`msg.info(f"Creating blank nlp object for language '{lang}'")`
			`nlp = util.get_lang_class(lang)()`
Restore the 'jsonl' arg for init vectors The lexemes.jsonl file is still used in our English vectors, and it may be required by users as well. I think it's worth supporting the option. 2020-09-29 19:33:55 +00:00			`if jsonl_loc is not None:`
Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 14:44:17 +00:00			`update_lexemes(nlp, jsonl_loc)`
Fix logging 2020-09-29 14:08:39 +00:00			`convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)`
			`msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")`
Add init vectors 2020-09-29 08:58:50 +00:00			`nlp.to_disk(output_dir)`
			`msg.good(`
			`"Saved nlp object with vectors to output directory. You can now use the "`
			`"path to it in your config as the 'vectors' setting in [initialize.vocab].",`
Resolve dir for better output [ci skip] 2020-09-29 20:01:04 +00:00			`output_dir.resolve(),`
Add init vectors 2020-09-29 08:58:50 +00:00			`)`


Hide jsonl_loc on init vectors and tidy up [ci skip] 2020-10-01 14:44:17 +00:00			`def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:`
			`# Mostly used for backwards-compatibility and may be removed in the future`
			`lex_attrs = srsly.read_jsonl(jsonl_loc)`
			`for attrs in lex_attrs:`
			`if "settings" in attrs:`
			`continue`
			`lexeme = nlp.vocab[attrs["orth"]]`
			`lexeme.set_attrs(**attrs)`


Add init_pipeline file 2020-09-28 07:47:34 +00:00			`@init_cli.command(`
Don't support init path for now 2020-09-28 10:46:28 +00:00			`"nlp",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`hidden=True,`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`)`
			`def init_pipeline_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
			`config_path: Path = Arg(..., help="Path to config file", exists=True),`
			`output_path: Path = Arg(..., help="Output directory for the prepared data"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
Refactor CLI 2020-09-28 13:09:59 +00:00			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`# fmt: on`
			`):`
Tidy up and adjust logging [ci skip] 2020-09-29 23:22:08 +00:00			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
Refactor CLI 2020-09-28 13:09:59 +00:00			`setup_gpu(use_gpu)`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`with show_validation_error(config_path):`
Fix commands 2020-09-28 08:53:17 +00:00			`config = util.load_config(config_path, overrides=overrides)`
Refactor CLI 2020-09-28 13:09:59 +00:00			`with show_validation_error(hint_fill=False):`
Tidy up and adjust logging [ci skip] 2020-09-29 23:22:08 +00:00			`nlp = init_nlp(config, use_gpu=use_gpu)`
Add init_pipeline file 2020-09-28 07:47:34 +00:00			`nlp.to_disk(output_path)`
Fix commands 2020-09-28 08:53:17 +00:00			`msg.good(f"Saved initialized pipeline to {output_path}")`
Move init labels to init pipeline module 2020-09-29 16:09:33 +00:00

			`@init_cli.command(`
			`"labels",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`)`
			`def init_labels_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
			`config_path: Path = Arg(..., help="Path to config file", exists=True),`
			`output_path: Path = Arg(..., help="Output directory for the labels"),`
			`code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),`
			`verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),`
			`use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")`
			`# fmt: on`
			`):`
Update docs [ci skip] 2020-10-01 15:38:17 +00:00			`"""Generate JSON files for the labels in the data. This helps speed up the`
Tidy up and adjust logging [ci skip] 2020-09-29 23:22:08 +00:00			`training process, since spaCy won't have to preprocess the data to`
			`extract the labels."""`
			`util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)`
Move init labels to init pipeline module 2020-09-29 16:09:33 +00:00			`if not output_path.exists():`
			`output_path.mkdir()`
			`overrides = parse_config_overrides(ctx.args)`
			`import_code(code_path)`
			`setup_gpu(use_gpu)`
			`with show_validation_error(config_path):`
			`config = util.load_config(config_path, overrides=overrides)`
			`with show_validation_error(hint_fill=False):`
			`nlp = init_nlp(config, use_gpu=use_gpu)`
			`for name, component in nlp.pipeline:`
			`if getattr(component, "label_data", None) is not None:`
Tidy up and adjust logging [ci skip] 2020-09-29 23:22:08 +00:00			`output_file = output_path / f"{name}.json"`
			`srsly.write_json(output_file, component.label_data)`
			`msg.good(f"Saving {name} labels to {output_file}")`
Move init labels to init pipeline module 2020-09-29 16:09:33 +00:00			`else:`
			`msg.info(f"No labels found for {name}")`