* Extend word-vector support: add a .similarity() method and a vector_norm property, and rename repvec to vector. Keep the repvec name as well for now, for backwards compatibility.

This commit is contained in:
Matthew Honnibal 2015-09-14 17:49:58 +10:00
parent e13e47e9e5
commit 65dc0d1dfb
4 changed files with 74 additions and 1 deletions

View File

@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
from libc.string cimport memset
from .orth cimport word_shape
@ -35,6 +42,26 @@ cdef class Lexeme:
def py_check_flag(self, attr_id_t flag_id):
    """Return True if Lexeme.check_flag reports flag_id as set on self.c, else False."""
    return bool(Lexeme.check_flag(self.c, flag_id))
def similarity(self, other):
    """Return the similarity between this object's vector and other's.

    The result is the dot product of the two ``vector`` attributes divided
    by the product of their ``vector_norm`` attributes (cosine similarity,
    provided ``vector_norm`` holds the L2 norm of ``vector``).

    other: any object exposing ``vector`` and ``vector_norm``.

    Returns 0.0 when either norm is zero, instead of dividing by zero and
    producing nan/inf.
    """
    # Read each norm once; they are also needed for the zero check.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property vector_norm:
    """Read/write access to the ``l2_norm`` field of the underlying C struct.

    NOTE(review): presumably the L2 norm of the word vector; nothing here
    recomputes it, so the caller is responsible for keeping it in sync.
    """
    def __get__(self):
        stored_norm = self.c.l2_norm
        return stored_norm

    def __set__(self, float value):
        self.c.l2_norm = value
property vector:
    """The lexeme's vector as a numpy array of ``self.vocab.repvec_length`` floats."""
    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        # Wrap the raw C float buffer in a Cython array of n_dims entries,
        # then hand it to numpy.
        # NOTE(review): presumably this is a view onto the lexeme's own
        # memory rather than a copy; confirm before mutating the result.
        float_view = <float[:n_dims,]>self.c.repvec
        return numpy.asarray(float_view)
property repvec:
    """Old name for ``vector``; returns exactly what ``self.vector`` returns."""
    def __get__(self):
        vec = self.vector
        return vec
property orth_:
    """Look up ``self.c.orth`` in ``self.vocab.strings`` and return the result."""
    def __get__(self):
        orth_key = self.c.orth
        return self.vocab.strings[orth_key]

View File

@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
import numpy
import numpy.linalg
import struct
cimport numpy as np
from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME
@ -118,6 +120,22 @@ cdef class Doc:
def __str__(self):
    """Concatenate the ``string`` attribute of every token, with no separator."""
    pieces = (tok.string for tok in self)
    return u''.join(pieces)
def similarity(self, other):
    """Return the cosine similarity between this object's vector and other's.

    Computed as the dot product of the two ``vector`` attributes divided by
    the product of their ``vector_norm`` attributes.

    other: any object exposing ``vector`` and ``vector_norm``.

    Returns 0.0 when either norm is zero, instead of dividing by zero and
    producing nan/inf.
    """
    # Read each norm once: here vector_norm is derived from the vector on
    # every access, and the values are needed for the zero check anyway.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property repvec:
    """Old name for ``vector``; returns exactly what ``self.vector`` returns."""
    def __get__(self):
        vec = self.vector
        return vec
property vector:
    """Sum of the vectors of all non-stop tokens, divided by ``len(self)``."""
    def __get__(self):
        # NOTE(review): stop words are left out of the sum but still counted
        # in len(self), so this is not the mean of the summed vectors. An
        # empty sequence also looks like it would divide by zero, and an
        # all-stop-word sequence yields a scalar rather than an array.
        # Confirm whether this is intended.
        non_stop_total = sum(tok.vector for tok in self if not tok.is_stop)
        return non_stop_total / len(self)
property vector_norm:
    """Norm of ``self.vector`` as computed by ``numpy.linalg.norm`` on each access."""
    def __get__(self):
        vec = self.vector
        return numpy.linalg.norm(vec)
@property
def string(self):
    """Concatenate the ``string`` attribute of every token, with no separator."""
    pieces = (tok.string for tok in self)
    return u''.join(pieces)

View File

@ -1,5 +1,8 @@
from __future__ import unicode_literals
from collections import defaultdict
import numpy
import numpy.linalg
cimport numpy as np
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
@ -52,6 +55,17 @@ cdef class Span:
def merge(self, unicode tag, unicode lemma, unicode ent_type):
    """Forward a merge request for this span to ``self._seq.merge``.

    The range passed on starts at the ``idx`` of the first token and ends at
    the ``idx`` of the last token plus that token's length; ``tag``,
    ``lemma`` and ``ent_type`` are passed through unchanged.
    """
    range_start = self[0].idx
    range_end = self[-1].idx + len(self[-1])
    self._seq.merge(range_start, range_end, tag, lemma, ent_type)
def similarity(self, other):
    """Return the cosine similarity between this object's vector and other's.

    Computed as the dot product of the two ``vector`` attributes divided by
    the product of their ``vector_norm`` attributes.

    other: any object exposing ``vector`` and ``vector_norm``.

    Returns 0.0 when either norm is zero, instead of dividing by zero and
    producing nan/inf.
    """
    # Read each norm once: here vector_norm is derived from the vector on
    # every access, and the values are needed for the zero check anyway.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property vector:
    """Sum of the vectors of all non-stop tokens, divided by ``len(self)``."""
    def __get__(self):
        # NOTE(review): stop words are left out of the sum but still counted
        # in len(self), so this is not the mean of the summed vectors. An
        # empty span also looks like it would divide by zero, and an
        # all-stop-word span yields a scalar rather than an array.
        # Confirm whether this is intended.
        non_stop_total = sum(tok.vector for tok in self if not tok.is_stop)
        return non_stop_total / len(self)
property vector_norm:
    """Norm of ``self.vector`` as computed by ``numpy.linalg.norm`` on each access."""
    def __get__(self):
        vec = self.vector
        return numpy.linalg.norm(vec)
property text:
    """The ``text`` of every token in the span, joined by single spaces."""
    def __get__(self):
        words = (tok.text for tok in self)
        return u' '.join(words)

View File

@ -49,6 +49,9 @@ cdef class Token:
def nbor(self, int i=1):
    """Return the token ``i`` positions away from this one in ``self.doc``.

    i: offset added to this token's own index ``self.i`` (default 1).
    """
    neighbour_index = self.i + i
    return self.doc[neighbour_index]
def similarity(self, other):
    """Return the similarity between this object's vector and other's.

    The result is the dot product of the two ``vector`` attributes divided
    by the product of their ``vector_norm`` attributes (cosine similarity,
    provided ``vector_norm`` holds the L2 norm of ``vector``).

    other: any object exposing ``vector`` and ``vector_norm``.

    Returns 0.0 when either norm is zero, instead of dividing by zero and
    producing nan/inf.
    """
    # Read each norm once; they are also needed for the zero check.
    self_norm = self.vector_norm
    other_norm = other.vector_norm
    if self_norm == 0 or other_norm == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)
property lex_id:
    """The ``id`` field of the lexeme struct this token points to (``self.c.lex``)."""
    def __get__(self):
        lexeme_id = self.c.lex.id
        return lexeme_id
@ -125,12 +128,20 @@ cdef class Token:
def __get__(self):
return self.c.dep
property repvec:
property vector:
    """The token's lexeme vector as a numpy array of ``self.vocab.repvec_length`` floats."""
    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        # Wrap the lexeme's raw C float buffer in a Cython array of n_dims
        # entries, then hand it to numpy.
        # NOTE(review): presumably this is a view onto the lexeme's own
        # memory rather than a copy; confirm before mutating the result.
        float_view = <float[:n_dims,]>self.c.lex.repvec
        return numpy.asarray(float_view)
property repvec:
    """Old name for ``vector``; returns exactly what ``self.vector`` returns."""
    def __get__(self):
        vec = self.vector
        return vec
property vector_norm:
    """The ``l2_norm`` field stored on the token's lexeme struct (read-only here).

    NOTE(review): presumably the L2 norm of the lexeme's vector; it is read
    as stored, not recomputed.
    """
    def __get__(self):
        stored_norm = self.c.lex.l2_norm
        return stored_norm
property n_lefts:
def __get__(self):
cdef int n = 0
@ -302,6 +313,9 @@ cdef class Token:
property is_oov:
    """Result of ``Lexeme.check_flag`` for the ``IS_OOV`` flag on this token's lexeme."""
    def __get__(self):
        flag_state = Lexeme.check_flag(self.c.lex, IS_OOV)
        return flag_state
property is_stop:
    """Result of ``Lexeme.check_flag`` for the ``IS_STOP`` flag on this token's lexeme."""
    def __get__(self):
        flag_state = Lexeme.check_flag(self.c.lex, IS_STOP)
        return flag_state
property is_alpha:
    """Result of ``Lexeme.check_flag`` for the ``IS_ALPHA`` flag on this token's lexeme."""
    def __get__(self):
        flag_state = Lexeme.check_flag(self.c.lex, IS_ALPHA)
        return flag_state