diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index edbd5dff7..7fdd39932 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -18,6 +18,8 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, load_model, OOV_RANK +from ..lookups import Lookups + try: import ftfy @@ -49,12 +51,8 @@ DEFAULT_OOV_PROB = -20 str, ), model_name=("Optional name for the model meta", "option", "mn", str), - base_model=( - "Base model (for languages with custom tokenizers)", - "option", - "b", - str, - ), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), + base_model=("Base model (for languages with custom tokenizers)", "option", "b", str), ) def init_model( lang, @@ -67,6 +65,7 @@ def init_model( prune_vectors=-1, vectors_name=None, model_name=None, + omit_extra_lookups=False, base_model=None, ): """ @@ -100,6 +99,15 @@ def init_model( with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + msg.good("Successfully created model") if vectors_loc is not None: add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7cb2d9745..6ce095c15 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -17,6 +17,7 @@ from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu from ..gold import GoldCorpus from ..compat import path2str +from ..lookups import Lookups from .. import util from .. import about @@ -57,6 +58,7 @@ from .. import about textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -96,6 +98,7 @@ def train( textcat_arch="bow", textcat_positive_label=None, tag_map_path=None, + omit_extra_lookups=False, verbose=False, debug=False, ): @@ -247,6 +250,14 @@ def train( # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors)