* Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility.

2015-09-14 17:49:58 +10:00 · 2015-09-14 17:49:58 +10:00 · 65dc0d1dfb
parent e13e47e9e5
commit 65dc0d1dfb
4 changed files with 74 additions and 1 deletions
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64

+# Compiler crashes on memory view coercion without this. Should report bug.
+from cython.view cimport array as cvarray
+cimport numpy as np
+np.import_array()
+
+
+
 from libc.string cimport memset

 from .orth cimport word_shape
@ -35,6 +42,26 @@ cdef class Lexeme:
    def py_check_flag(self, attr_id_t flag_id):
        return True if Lexeme.check_flag(self.c, flag_id) else False

+    def similarity(self, other):
+        """Return the similarity between this lexeme's vector and `other`'s.
+
+        The score is dot(self.vector, other.vector) divided by the product of
+        the two `vector_norm` values, i.e. the cosine of the angle between
+        the vectors when `vector_norm` is the L2 norm of `vector`.
+
+        other: Any object exposing `.vector` and `.vector_norm`.
+
+        Returns 0.0 when either norm is zero: the ratio is undefined there
+        and the division would otherwise produce nan/inf or raise.
+        """
+        self_norm = self.vector_norm
+        other_norm = other.vector_norm
+        # Guard the degenerate case before dividing.
+        if self_norm == 0 or other_norm == 0:
+            return 0.0
+        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
+
+    property vector_norm:
+        """The `l2_norm` value stored on the underlying lexeme struct.
+
+        Reading returns `self.c.l2_norm`; assigning a float overwrites it.
+        """
+        def __get__(self):
+            norm = self.c.l2_norm
+            return norm
+
+        def __set__(self, float value):
+            # Written straight through to the C struct; no validation is done.
+            self.c.l2_norm = value
+
+    property vector:
+        """The lexeme's word vector, exposed as a numpy array.
+
+        The array is built from a typed memoryview over `self.c.repvec`,
+        with its length taken from `self.vocab.repvec_length`.
+        """
+        def __get__(self):
+            # NOTE(review): needs a module-level `import numpy`; the visible
+            # import hunk only adds `cimport numpy as np` -- confirm.
+            cdef int n_dims = self.vocab.repvec_length
+            # Cast the raw float pointer to a 1-d memoryview of n_dims items.
+            vec_view = <float[:n_dims,]>self.c.repvec
+            return numpy.asarray(vec_view)
+
+    property repvec:
+        """Alias for `vector`, kept under the old name for compatibility."""
+        def __get__(self):
+            vec = self.vector
+            return vec
+        
    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t

 import numpy
+import numpy.linalg
 import struct
+cimport numpy as np

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@ -118,6 +120,22 @@ cdef class Doc:
    def __str__(self):
        return u''.join([t.string for t in self])

+    def similarity(self, other):
+        """Return the similarity between this document's vector and `other`'s.
+
+        The score is dot(self.vector, other.vector) divided by the product of
+        the two `vector_norm` values (cosine similarity, since `vector_norm`
+        here is the norm of `vector`).
+
+        other: Any object exposing `.vector` and `.vector_norm`.
+
+        Returns 0.0 when either norm is zero: the ratio is undefined there
+        and the division would otherwise produce nan/inf or raise.
+        """
+        self_norm = self.vector_norm
+        other_norm = other.vector_norm
+        # Guard the degenerate case (e.g. no non-stop tokens) before dividing.
+        if self_norm == 0 or other_norm == 0:
+            return 0.0
+        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
+
+    property repvec:
+        """Alias for `vector`, kept under the old name for compatibility."""
+        def __get__(self):
+            vec = self.vector
+            return vec
+
+    property vector:
+        """Sum of the non-stop tokens' vectors divided by the token count."""
+        def __get__(self):
+            # Stop words are left out of the sum ...
+            content_vectors = (token.vector for token in self if not token.is_stop)
+            # ... but the divisor is the total number of tokens.
+            # NOTE(review): this scales the mean down by the share of stop
+            # words, yields a scalar 0 when every token is a stop word, and
+            # divides by zero for an empty Doc -- confirm this is intended.
+            return sum(content_vectors) / len(self)
+
+
+    property vector_norm:
+        """Norm of `vector`, recomputed with numpy.linalg.norm on each access."""
+        def __get__(self):
+            doc_vector = self.vector
+            return numpy.linalg.norm(doc_vector)
+
    @property
    def string(self):
        return u''.join([t.string for t in self])
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@ -1,5 +1,8 @@
 from __future__ import unicode_literals
 from collections import defaultdict
+import numpy
+import numpy.linalg
+cimport numpy as np

 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t
@ -52,6 +55,17 @@ cdef class Span:
    def merge(self, unicode tag, unicode lemma, unicode ent_type):
        self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)

+    def similarity(self, other):
+        """Return the similarity between this span's vector and `other`'s.
+
+        The score is dot(self.vector, other.vector) divided by the product of
+        the two `vector_norm` values (cosine similarity, since `vector_norm`
+        here is the norm of `vector`).
+
+        other: Any object exposing `.vector` and `.vector_norm`.
+
+        Returns 0.0 when either norm is zero: the ratio is undefined there
+        and the division would otherwise produce nan/inf or raise.
+        """
+        self_norm = self.vector_norm
+        other_norm = other.vector_norm
+        # Guard the degenerate case (e.g. no non-stop tokens) before dividing.
+        if self_norm == 0 or other_norm == 0:
+            return 0.0
+        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
+
+    property vector:
+        """Sum of the non-stop tokens' vectors divided by the token count."""
+        def __get__(self):
+            # Stop words are left out of the sum ...
+            content_vectors = (token.vector for token in self if not token.is_stop)
+            # ... but the divisor is the total number of tokens in the span.
+            # NOTE(review): this scales the mean down by the share of stop
+            # words, yields a scalar 0 when every token is a stop word, and
+            # divides by zero for an empty Span -- confirm this is intended.
+            return sum(content_vectors) / len(self)
+
+    property vector_norm:
+        """Norm of `vector`, recomputed with numpy.linalg.norm on each access."""
+        def __get__(self):
+            span_vector = self.vector
+            return numpy.linalg.norm(span_vector)
+
    property text:
        def __get__(self):
            return u' '.join([t.text for t in self])
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -49,6 +49,9 @@ cdef class Token:
    def nbor(self, int i=1):
        return self.doc[self.i+i]

+    def similarity(self, other):
+        """Return the similarity between this token's vector and `other`'s.
+
+        The score is dot(self.vector, other.vector) divided by the product of
+        the two `vector_norm` values, i.e. the cosine of the angle between
+        the vectors when `vector_norm` is the L2 norm of `vector`.
+
+        other: Any object exposing `.vector` and `.vector_norm`.
+
+        Returns 0.0 when either norm is zero: the ratio is undefined there
+        and the division would otherwise produce nan/inf or raise.
+        """
+        self_norm = self.vector_norm
+        other_norm = other.vector_norm
+        # Guard the degenerate case before dividing.
+        if self_norm == 0 or other_norm == 0:
+            return 0.0
+        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
+
    property lex_id:
        def __get__(self):
            return self.c.lex.id
@ -125,12 +128,20 @@ cdef class Token:
        def __get__(self):
            return self.c.dep

-    property repvec:
+    property vector:
        def __get__(self):
            cdef int length = self.vocab.repvec_length
            repvec_view = <float[:length,]>self.c.lex.repvec
            return numpy.asarray(repvec_view)

+    property repvec:
+        """Alias for `vector`, kept under the old name for compatibility."""
+        def __get__(self):
+            vec = self.vector
+            return vec
+
+    property vector_norm:
+        """The `l2_norm` value stored on this token's lexeme struct."""
+        def __get__(self):
+            norm = self.c.lex.l2_norm
+            return norm
+
    property n_lefts:
        def __get__(self):
            cdef int n = 0
@ -302,6 +313,9 @@ cdef class Token:
    property is_oov:
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)

+    property is_stop:
+        """Result of checking the IS_STOP flag on this token's lexeme."""
+        def __get__(self):
+            return Lexeme.check_flag(self.c.lex, IS_STOP)
+
    property is_alpha:
        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)