From 5fd0b5207e34e849ca257c92966e2c7d72431149 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 6 Jul 2021 12:43:17 +0200
Subject: [PATCH] Fix vectors check for sourced components (#8559)

* Fix vectors check for sourced components

Since vectors are not loaded when components are sourced, store a hash for
the vectors of each sourced component and compare it to the loaded vectors
after the vectors are loaded from the `[initialize]` block.

* Pop temporary info

* Remove stored hash in remove_pipe

* Add default for pop

* Add additional convert/debug/assemble CLI tests
---
 .github/azure-steps.yml      | 24 ++++++++++++++++++++++++
 spacy/language.py            | 10 +++++++++-
 spacy/training/initialize.py |  7 +++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 4255ec219..7ffdf6e61 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -62,6 +62,30 @@ steps:
 
   - script: |
       python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
       python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+    displayName: 'Test convert CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -m spacy init config -p ner -l ca ner.cfg
+      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+    displayName: 'Test debug config CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: 'Test assemble CLI'
+    condition: eq(variables['python_version'], '3.8')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: 'Test assemble CLI vectors warning'
+    condition: eq(variables['python_version'], '3.8')
diff --git a/spacy/language.py b/spacy/language.py
index 1f2dcf2b5..daee63170 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -934,6 +934,7 @@ class Language:
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
+        self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
         # Make sure name is removed from the [initialize] config
         if name in self._config["initialize"]["components"]:
             self._config["initialize"]["components"].pop(name)
@@ -1680,6 +1681,8 @@ class Language:
         # If components are loaded from a source (existing models), we cache
         # them here so they're only loaded once
         source_nlps = {}
+        source_nlp_vectors_hashes = {}
+        nlp.meta["_sourced_vectors_hashes"] = {}
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1719,7 +1722,12 @@ class Language:
                         name, source_name, pipe_cfg["replace_listeners"]
                     )
                     listeners_replaced = True
-                nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="\\[W113\\]")
+                    nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                if model not in source_nlp_vectors_hashes:
+                    source_nlp_vectors_hashes[model] = hash(source_nlps[model].vocab.vectors.to_bytes())
+                nlp.meta["_sourced_vectors_hashes"][pipe_name] = source_nlp_vectors_hashes[model]
                 # Delete from cache if listeners were replaced
                 if listeners_replaced:
                     del source_nlps[model]
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index c1fda9181..3cfd33f95 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -9,6 +9,7 @@ import gzip
 import zipfile
 import tqdm
 from itertools import islice
+import warnings
 
 from .pretrain import get_tok2vec_ref
 from ..lookups import Lookups
@@ -124,6 +125,12 @@ def init_vocab(
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
         logger.info(f"Added vectors: {vectors}")
+    # warn if source model vectors are not identical
+    sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
+    vectors_hash = hash(nlp.vocab.vectors.to_bytes())
+    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+        if vectors_hash != sourced_vectors_hash:
+            warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
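
A minimal sketch (separate from the patch itself) of the check this change introduces,
for reproducing it outside CI. It assumes ca_core_news_md is installed; the blank "ca"
pipeline and the printed message are illustrative only, not spaCy's actual warning call.

    # Sketch only: mirrors the hash comparison added in Language.from_config()
    # and init_vocab(). Assumes ca_core_news_md is installed.
    import spacy

    # from_config() now records a hash of the source pipeline's vectors for
    # each sourced component in nlp.meta["_sourced_vectors_hashes"].
    source_nlp = spacy.load("ca_core_news_md")
    source_hash = hash(source_nlp.vocab.vectors.to_bytes())

    # A pipeline that sources "ner" but never loads the same vectors
    # (nothing comparable to the md vectors is added here).
    nlp = spacy.blank("ca")
    nlp.add_pipe("ner", source=source_nlp)

    # init_vocab() hashes the vectors actually loaded from the [initialize]
    # block and warns with W113 for every sourced component whose stored
    # hash does not match.
    loaded_hash = hash(nlp.vocab.vectors.to_bytes())
    if loaded_hash != source_hash:
        print("W113 would be raised for sourced component 'ner'")

The "Test assemble CLI vectors warning" step above exercises the same path end to end
by grepping the output of spacy assemble for W113.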