diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index f1786e04b..258b8634a 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,6 +1,7 @@
 [nlp]
 lang = null
 pipeline = []
+load_vocab_data = true
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
diff --git a/spacy/language.py b/spacy/language.py
index 6d2ae3dbe..dc6167ef2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -121,15 +121,18 @@ class Language:
         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
         meta (dict): Custom meta data for the Language class. Is written to by
             models to add model meta data.
-        max_length (int) :
-            Maximum number of characters in a single text. The current models
-            may run out memory on extremely long texts, due to large internal
-            allocations. You should segment these texts into meaningful units,
-            e.g. paragraphs, subsections etc, before passing them to spaCy.
-            Default maximum length is 1,000,000 characters (1mb). As a rule of
-            thumb, if all pipeline components are enabled, spaCy's default
-            models currently requires roughly 1GB of temporary memory per
+        max_length (int): Maximum number of characters in a single text. The
+            current models may run out of memory on extremely long texts, due to
+            large internal allocations. You should segment these texts into
+            meaningful units, e.g. paragraphs, subsections etc, before passing
+            them to spaCy. Default maximum length is 1,000,000 characters (1mb). As
+            a rule of thumb, if all pipeline components are enabled, spaCy's
+            default models currently require roughly 1GB of temporary memory per
             100,000 characters in one text.
+        create_tokenizer (Callable): Function that takes the nlp object and
+            returns a tokenizer.
+        create_lemmatizer (Callable): Function that takes the nlp object and
+            returns a lemmatizer.
         RETURNS (Language): The newly constructed object.
         """
         # We're only calling this to import all factories provided via entry
@@ -150,12 +153,12 @@ class Language:
             if not create_lemmatizer:
                 lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]}
                 create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"]
-            # TODO: where does the vocab data come in?
             vocab = create_vocab(
                 self.lang,
                 self.Defaults,
                 lemmatizer=create_lemmatizer(self),
                 vectors_name=vectors_name,
+                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index e55123e14..c6bdd6e9c 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -224,21 +224,13 @@ class ConfigSchemaTraining(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaNlpWritingSystem(BaseModel):
-    direction: StrictStr = Field(..., title="The writing direction, e.g. 'rtl'")
-    has_case: StrictBool = Field(..., title="Whether the language has case")
-    has_letters: StrictBool = Field(..., title="Whether the language has letters")
-
-    class Config:
-        extra = "allow"
-
-
 class ConfigSchemaNlp(BaseModel):
     # fmt: off
     lang: StrictStr = Field(..., title="The base language to use")
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
     lemmatizer: Callable = Field(..., title="The lemmatizer to use")
+    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     # fmt: on
 
     class Config:
diff --git a/spacy/util.py b/spacy/util.py
index 18ce7e474..3b6ba0f25 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -188,8 +188,10 @@ def load_model(
     """Load a model from a package or data path.
 
     name (str): Package name or model path.
-    **overrides: Specific overrides, like pipeline components to disable.
-    RETURNS (Language): `Language` class with the loaded model.
+    disable (Iterable[str]): Names of pipeline components to disable.
+    component_cfg (Dict[str, dict]): Config overrides for pipeline components,
+        keyed by component names.
+    RETURNS (Language): The loaded nlp object.
     """
     cfg = component_cfg
     if isinstance(name, str):  # name or string path
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 56e62834a..2115789e6 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -23,10 +23,10 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_lookups_data=True):
+def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_lookups_data:
+    if load_data:
         tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
         lookups = load_lookups(lang, tables=tables, strict=False)
     else: