diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 3311a5120..18589a954 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -18,6 +18,7 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings from ..util import ensure_path, get_lang_class, OOV_RANK +from ..lookups import Lookups try: import ftfy @@ -49,6 +50,7 @@ DEFAULT_OOV_PROB = -20 str, ), model_name=("Optional name for the model meta", "option", "mn", str), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), ) def init_model( lang, @@ -61,6 +63,7 @@ def init_model( prune_vectors=-1, vectors_name=None, model_name=None, + omit_extra_lookups=False, ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -93,6 +96,15 @@ def init_model( with msg.loading("Creating model..."): nlp = create_model(lang, lex_attrs, name=model_name) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + msg.good("Successfully created model") if vectors_loc is not None: add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7cb2d9745..6ce095c15 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -17,6 +17,7 @@ from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu from ..gold import GoldCorpus from ..compat import path2str +from ..lookups import Lookups from .. import util from .. import about @@ -57,6 +58,7 @@ from .. import about textcat_arch=("Textcat model architecture", "option", "ta", str), textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str), tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), verbose=("Display more information for debug", "flag", "VV", bool), debug=("Run data diagnostics before training", "flag", "D", bool), # fmt: on @@ -96,6 +98,7 @@ def train( textcat_arch="bow", textcat_positive_label=None, tag_map_path=None, + omit_extra_lookups=False, verbose=False, debug=False, ): @@ -247,6 +250,14 @@ def train( # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors)