diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index e2935184c..62d9096d9 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -7,6 +7,7 @@ import srsly from .. import util from ..training.initialize import init_nlp, convert_vectors +from ..language import Language from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, setup_gpu @@ -19,9 +20,9 @@ def init_vectors_cli( output_dir: Path = Arg(..., help="Pipeline output directory"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), - jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), # fmt: on ): """Convert word vectors for use with spaCy. Will export an nlp object that @@ -32,12 +33,7 @@ def init_vectors_cli( msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: - lex_attrs = srsly.read_jsonl(jsonl_loc) - for attrs in lex_attrs: - if "settings" in attrs: - continue - lexeme = nlp.vocab[attrs["orth"]] - lexeme.set_attrs(**attrs) + update_lexemes(nlp, jsonl_loc) convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") nlp.to_disk(output_dir) @@ -48,6 +44,16 @@ def init_vectors_cli( ) +def update_lexemes(nlp: Language, jsonl_loc: Path) -> None: + # Mostly used for backwards-compatibility and may be removed in the future + lex_attrs = srsly.read_jsonl(jsonl_loc) + for attrs in lex_attrs: + if "settings" in attrs: + continue + lexeme = nlp.vocab[attrs["orth"]] + lexeme.set_attrs(**attrs) + + @init_cli.command( "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},