mirror of https://github.com/explosion/spaCy.git
Hide jsonl_loc on init vectors and tidy up [ci skip]
This commit is contained in:
parent
27cbffff1b
commit
7f68f4bd92
|
@ -7,6 +7,7 @@ import srsly
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.initialize import init_nlp, convert_vectors
|
from ..training.initialize import init_nlp, convert_vectors
|
||||||
|
from ..language import Language
|
||||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import import_code, setup_gpu
|
||||||
|
|
||||||
|
@ -19,9 +20,9 @@ def init_vectors_cli(
|
||||||
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
output_dir: Path = Arg(..., help="Pipeline output directory"),
|
||||||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||||
jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
|
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
|
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||||
|
@ -32,12 +33,7 @@ def init_vectors_cli(
|
||||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||||
nlp = util.get_lang_class(lang)()
|
nlp = util.get_lang_class(lang)()
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
lex_attrs = srsly.read_jsonl(jsonl_loc)
|
update_lexemes(nlp, jsonl_loc)
|
||||||
for attrs in lex_attrs:
|
|
||||||
if "settings" in attrs:
|
|
||||||
continue
|
|
||||||
lexeme = nlp.vocab[attrs["orth"]]
|
|
||||||
lexeme.set_attrs(**attrs)
|
|
||||||
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
|
@ -48,6 +44,16 @@ def init_vectors_cli(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
    """Apply lexeme attributes from a JSONL file to the pipeline's vocab.

    Each record read from ``jsonl_loc`` is looked up in ``nlp.vocab`` by its
    ``"orth"`` value, and the whole record is passed to the resulting
    lexeme's ``set_attrs`` as keyword arguments. Records that contain a
    ``"settings"`` key are skipped.

    nlp: The pipeline object whose vocab entries are updated in place.
    jsonl_loc: Location of the JSONL-formatted attributes file.
    """
    # Mostly used for backwards-compatibility and may be removed in the future
    for record in srsly.read_jsonl(jsonl_loc):
        if "settings" not in record:
            # The full record (including "orth") is forwarded to set_attrs.
            nlp.vocab[record["orth"]].set_attrs(**record)
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command(
|
@init_cli.command(
|
||||||
"nlp",
|
"nlp",
|
||||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||||
|
|
Loading…
Reference in New Issue