# Test additions for spacy/tests/vectors/test_vectors.py, taken from the patch
# "Implement Fast-Text vectors with subword features".


@pytest.fixture
def ngrams_vectors():
    """Return (text, vector) pairs: one whole word and three character n-grams."""
    return [
        ("apple", [1, 2, 3]),
        ("app", [-0.1, -0.2, -0.3]),
        ("ppl", [-0.2, -0.3, -0.4]),
        ("pl", [0.7, 0.8, 0.9]),
    ]


@pytest.fixture
def ngrams_vocab(en_vocab, ngrams_vectors):
    """Return `en_vocab` after passing it and the n-gram vectors to
    `add_vecs_to_vocab`."""
    add_vecs_to_vocab(en_vocab, ngrams_vectors)
    return en_vocab


@pytest.mark.parametrize('text', ["apple"])
def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
    """A word that has its own vector gets exactly that vector back."""
    # Request the fixture as an argument; calling a fixture function directly
    # is rejected by newer pytest versions.
    expected = dict(ngrams_vectors)[text]
    assert list(ngrams_vocab.get_vector(text)) == list(expected)


@pytest.mark.parametrize('text', ["applpie"])
def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
    """A word without a vector gets the mean of its known n-gram vectors."""
    table = dict(ngrams_vectors)
    # "applpie" contains "app", "ppl" and "pl" but not "apple", so only these
    # three rows may contribute to the average.
    parts = [table[ngram] for ngram in ("app", "ppl", "pl")]
    expected = [sum(column) / len(parts) for column in zip(*parts)]
    actual = list(ngrams_vocab.get_vector(text, 1, 6))
    assert actual == pytest.approx(expected, abs=1e-6)
def get_vector(self, orth, minn=None, maxn=None):
    """Retrieve a vector for a word in the vocabulary. Words can be looked
    up by string or int ID. If no vectors data is loaded, ValueError is
    raised.

    If the word has no vector of its own, the result is the mean of the
    vectors of those character n-grams of the word (lengths `minn` to
    `maxn`, inclusive) that are present in the vectors table. A single
    character at the very start or very end of the word is never used as
    an n-gram. If no n-gram has a vector, a zero vector is returned.

    orth (int / unicode): The int ID of a word, or its string.
    minn (int): Minimum n-gram length. Defaults to the length of the word.
    maxn (int): Maximum n-gram length. Defaults to the length of the word.
    RETURNS (numpy.ndarray): The word's own vector, the averaged n-gram
        vector, or zeros of shape (self.vectors_length,) and dtype 'f'.
    """
    word = None
    if isinstance(orth, basestring_):
        # Keep the text around: it is needed if we fall back to n-grams.
        word = orth
        orth = self.strings.add(orth)
    if orth in self.vectors.key2row:
        return self.vectors[orth]
    if word is None:
        # Only resolve the ID back to text once we know the fast path failed.
        word = self.strings[orth]

    length = len(word)
    # With both limits left at their defaults the only candidate n-gram is
    # the whole word, which is already known to have no vector, so the
    # result is the zero vector.
    if minn is None:
        minn = length
    if maxn is None:
        maxn = length

    total = numpy.zeros((self.vectors_length,), dtype='f')
    n_found = 0
    # N-gram enumeration follows fastText's computeSubwords. Python strings
    # are indexed by character, so fastText's UTF-8 continuation-byte
    # handling is not needed here.
    shortest = max(minn, 1)
    for start in range(length):
        longest = min(maxn, length - start)
        for n in range(shortest, longest + 1):
            end = start + n
            # Skip a lone first or last character, as fastText does.
            if n == 1 and (start == 0 or end == length):
                continue
            key = self.strings[word[start:end]]
            if key in self.vectors.key2row:
                total = total + self.vectors[key]
                n_found += 1
    if n_found:
        total = total / n_found
    return total