From 8e93fa850748c884c71505b4f26c46d0c98d3ba1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Mar 2022 09:21:25 +0100 Subject: [PATCH] Fix Vectors.n_keys for floret vectors (#10394) Fix `Vectors.n_keys` for floret vectors to match docstring description and avoid W007 warnings in similarity methods. --- spacy/tests/vocab_vectors/test_vectors.py | 4 ++++ spacy/vectors.pyx | 2 ++ website/docs/api/vectors.md | 10 +++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 0650a7487..ffd7489b2 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -535,6 +535,10 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # every word has a vector assert nlp.vocab[word * 5].has_vector + # n_keys is -1 for floret + assert nlp_plain.vocab.vectors.n_keys > 0 + assert nlp.vocab.vectors.n_keys == -1 + # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index bc4863703..2b1ea764b 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -170,6 +170,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#n_keys """ + if self.mode == Mode.floret: + return -1 return len(self.key2row) def __reduce__(self): diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index b3bee822c..a651c23b0 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -327,9 +327,9 @@ will be counted individually. In `floret` mode, the keys table is not used. > assert vectors.n_keys == 0 > ``` -| Name | Description | -| ----------- | -------------------------------------------- | -| **RETURNS** | The number of all keys in the table. ~~int~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| **RETURNS** | The number of all keys in the table. Returns `-1` for floret vectors. ~~int~~ | ## Vectors.most_similar {#most_similar tag="method"} @@ -348,7 +348,7 @@ supported for `floret` mode. > ``` | Name | Description | -| -------------- | --------------------------------------------------------------------------- | +| -------------- | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | | _keyword-only_ | | | `batch_size` | The batch size to use. Default to `1024`. ~~int~~ | @@ -385,7 +385,7 @@ Change the embedding matrix to use different Thinc ops. > ``` | Name | Description | -|-------|----------------------------------------------------------| +| ----- | -------------------------------------------------------- | | `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | ## Vectors.to_disk {#to_disk tag="method"}