Implement fastText vectors with subword features

Suraj Krishnan Rajan 2018-04-21 01:34:14 +05:30
parent 686225eadd
commit 69d041148f
2 changed files with 59 additions and 3 deletions

View File

@@ -23,6 +23,18 @@ def vectors():
            ('juice', [5, 5, 10]),
            ('pie', [7, 6.3, 8.9])]

@pytest.fixture
def ngrams_vectors():
    return [
        ("apple", [1, 2, 3]),
        ("app", [-0.1, -0.2, -0.3]),
        ('ppl', [-0.2, -0.3, -0.4]),
        ('pl', [0.7, 0.8, 0.9])
    ]

@pytest.fixture()
def ngrams_vocab(en_vocab, ngrams_vectors):
    add_vecs_to_vocab(en_vocab, ngrams_vectors)
    return en_vocab

@pytest.fixture
def data():
@@ -105,6 +117,18 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
    assert vectors[1] == (doc[2].text, list(doc[2].vector))

@pytest.mark.parametrize('text', ["apple"])
def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
    # A word present in the vectors table should use its own vector.
    assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors[0][1])

@pytest.mark.parametrize('text', ["applpie"])
def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
    # An out-of-vocabulary word should fall back to the mean of the vectors
    # of its character n-grams (lengths 1-6 here) that are in the table.
    truth = list(ngrams_vocab.get_vector(text, 1, 6))
    test = [(ngrams_vectors[1][1][i] + ngrams_vectors[2][1][i]
             + ngrams_vectors[3][1][i]) / 3
            for i in range(len(ngrams_vectors[1][1]))]
    eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
    for i in eps:
        assert i < 1e-6

@pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
    lex = vocab[text]
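Not part of the diff: a quick sketch of the arithmetic the subword test asserts. For the out-of-vocabulary word "applpie", only the n-grams "app", "ppl" and "pl" from the ngrams_vectors fixture are present in the vocab, so get_vector(text, 1, 6) should return their element-wise mean. Plain Python/numpy using the fixture values only; the dict name ngram_vecs is illustrative:

import numpy

ngram_vecs = {
    "app": [-0.1, -0.2, -0.3],
    "ppl": [-0.2, -0.3, -0.4],
    "pl": [0.7, 0.8, 0.9],
}

# Element-wise mean over the matched n-gram vectors.
expected = numpy.mean(list(ngram_vecs.values()), axis=0)
print(expected)  # approximately [0.1333, 0.1, 0.0667]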

View File

@@ -309,7 +309,7 @@ cdef class Vocab:
        link_vectors_to_models(self)
        return remap

    def get_vector(self, orth):
    def get_vector(self, orth, minn=None, maxn=None):
        """Retrieve a vector for a word in the vocabulary. Words can be looked
        up by string or int ID. If no vectors data is loaded, ValueError is
        raised.
@@ -320,10 +320,42 @@ cdef class Vocab:
        """
        if isinstance(orth, basestring_):
            orth = self.strings.add(orth)
        word = self[orth].orth_
        if orth in self.vectors.key2row:
            return self.vectors[orth]
        else:
            return numpy.zeros((self.vectors_length,), dtype='f')
        # minn and maxn default to the word's length, so by default only the
        # whole word is looked up and out-of-vocabulary words get a zero vector.
        if minn is None:
            minn = len(word)
        if maxn is None:
            maxn = len(word)
        vectors = numpy.zeros((self.vectors_length,), dtype='f')
        # Character n-gram computation adapted from fastText:
        # https://github.com/facebookresearch/fastText
        ngrams_size = 0
        for i in range(len(word)):
            ngram = ""
            # Skip UTF-8 continuation bytes, mirroring fastText's byte-level loop.
            if (ord(word[i]) & 0xC0) == 0x80:
                continue
            n = 1
            j = i
            while j < len(word) and n <= maxn:
                if n > maxn:
                    break
                ngram += word[j]
                j = j + 1
                while j < len(word) and (ord(word[j]) & 0xC0) == 0x80:
                    ngram += word[j]
                    j = j + 1
                # Look up n-grams of length minn..maxn, excluding single
                # characters at the start or end of the word.
                if n >= minn and not (n == 1 and (i == 0 or j == len(word))):
                    if self.strings[ngram] in self.vectors.key2row:
                        vectors = numpy.add(self.vectors[self.strings[ngram]], vectors)
                        ngrams_size += 1
                n = n + 1
        if ngrams_size > 0:
            # Average over the n-grams that were found in the vectors table.
            vectors = vectors * (1.0 / ngrams_size)
        return vectors

    def set_vector(self, orth, vector):
        """Set a vector for a word in the vocabulary. Words can be referenced