From ea450d652c32f65b947a1e1a498b45f29ed4dc29 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Nov 2021 08:51:19 +0100 Subject: [PATCH] Exclude strings from v3.2+ source vector checks (#9697) Exclude strings from `Vectors.to_bytes()` comparisons for v3.2+ `Vectors` that now include the string store so that the source vector comparison is only comparing the vectors and not the strings. --- spacy/language.py | 7 +++++-- spacy/training/initialize.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index aa57989ac..204b24ecb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -701,7 +701,8 @@ class Language: if ( self.vocab.vectors.shape != source.vocab.vectors.shape or self.vocab.vectors.key2row != source.vocab.vectors.key2row - or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + or self.vocab.vectors.to_bytes(exclude=["strings"]) + != source.vocab.vectors.to_bytes(exclude=["strings"]) ): warnings.warn(Warnings.W113.format(name=source_name)) if source_name not in source.component_names: @@ -1822,7 +1823,9 @@ class Language: ) if model not in source_nlp_vectors_hashes: source_nlp_vectors_hashes[model] = hash( - source_nlps[model].vocab.vectors.to_bytes() + source_nlps[model].vocab.vectors.to_bytes( + exclude=["strings"] + ) ) if "_sourced_vectors_hashes" not in nlp.meta: nlp.meta["_sourced_vectors_hashes"] = {} diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 13ccfeb93..084204389 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -132,7 +132,7 @@ def init_vocab( logger.info(f"Added vectors: {vectors}") # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) - vectors_hash = hash(nlp.vocab.vectors.to_bytes()) + vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): if 
vectors_hash != sourced_vectors_hash: warnings.warn(Warnings.W113.format(name=sourced_component))