* Extend word vector support with a .similarity() method and a vector_norm property, and rename repvec to vector. Keep the repvec name as well for now, for backwards compatibility.

This commit is contained in:
Matthew Honnibal 2015-09-14 17:49:58 +10:00
parent e13e47e9e5
commit 65dc0d1dfb
4 changed files with 74 additions and 1 deletion

View File

@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
from libc.string cimport memset from libc.string cimport memset
from .orth cimport word_shape from .orth cimport word_shape
@ -35,6 +42,26 @@ cdef class Lexeme:
def py_check_flag(self, attr_id_t flag_id): def py_check_flag(self, attr_id_t flag_id):
return True if Lexeme.check_flag(self.c, flag_id) else False return True if Lexeme.check_flag(self.c, flag_id) else False
def similarity(self, other):
    """Return the cosine similarity between this lexeme's vector and ``other``'s.

    The result is ``dot(self.vector, other.vector)`` divided by the product
    of the two ``vector_norm`` values.

    Args:
        other: Any object exposing ``vector`` and ``vector_norm``.

    Returns:
        The cosine similarity, or ``0.0`` when either norm is zero.
    """
    norm_product = self.vector_norm * other.vector_norm
    # A zero norm would make the division produce NaN (with a numpy
    # RuntimeWarning); report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
property vector_norm:
    """The norm value stored for this lexeme in ``self.c.l2_norm``.

    Reading returns the stored field; assigning a float overwrites it.
    The value is not recomputed from the vector here.
    """
    def __set__(self, float value):
        self.c.l2_norm = value

    def __get__(self):
        return self.c.l2_norm
property vector:
    """The lexeme's word vector, returned as a numpy array.

    The array wraps ``self.c.repvec`` and has ``self.vocab.repvec_length``
    entries.
    NOTE(review): presumably the array shares memory with the underlying
    buffer rather than copying it -- confirm before mutating the result.
    """
    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        # View the C float buffer as a 1-d typed memoryview so numpy can wrap it.
        values = <float[:n_dims,]>self.c.repvec
        return numpy.asarray(values)
property repvec:
    """Older name for ``vector``, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
property orth_: property orth_:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]

View File

@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
import numpy import numpy
import numpy.linalg
import struct import struct
cimport numpy as np
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
@ -118,6 +120,22 @@ cdef class Doc:
def __str__(self): def __str__(self):
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])
def similarity(self, other):
    """Return the cosine similarity between this document's vector and ``other``'s.

    The result is ``dot(self.vector, other.vector)`` divided by the product
    of the two ``vector_norm`` values.

    Args:
        other: Any object exposing ``vector`` and ``vector_norm``.

    Returns:
        The cosine similarity, or ``0.0`` when either norm is zero.
    """
    norm_product = self.vector_norm * other.vector_norm
    # A zero norm would make the division produce NaN (with a numpy
    # RuntimeWarning); report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
property repvec:
    """Older name for ``vector``, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
property vector:
    """Sum of the non-stop tokens' vectors, divided by ``len(self)``.

    Stop words are left out of the sum but are still counted in the
    divisor. When the document has tokens but all of them are stop words,
    a zero vector shaped like a token vector is returned, so the result is
    always an array rather than the scalar ``0.0``.

    Raises:
        ZeroDivisionError: if the document contains no tokens.
    """
    def __get__(self):
        content_vectors = [t.vector for t in self if not t.is_stop]
        if not content_vectors and len(self) > 0:
            # sum() of an empty sequence is the int 0, which would turn the
            # result into a scalar; build a zero vector from a token instead.
            return numpy.zeros_like(self[0].vector)
        return sum(content_vectors) / len(self)
property vector_norm:
    """``numpy.linalg.norm`` of ``self.vector``, recomputed on every access."""
    def __get__(self):
        averaged = self.vector
        return numpy.linalg.norm(averaged)
@property @property
def string(self): def string(self):
return u''.join([t.string for t in self]) return u''.join([t.string for t in self])

View File

@ -1,5 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
import numpy
import numpy.linalg
cimport numpy as np
from ..structs cimport TokenC, LexemeC from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t from ..typedefs cimport flags_t, attr_t
@ -52,6 +55,17 @@ cdef class Span:
def merge(self, unicode tag, unicode lemma, unicode ent_type): def merge(self, unicode tag, unicode lemma, unicode ent_type):
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
def similarity(self, other):
    """Return the cosine similarity between this span's vector and ``other``'s.

    The result is ``dot(self.vector, other.vector)`` divided by the product
    of the two ``vector_norm`` values.

    Args:
        other: Any object exposing ``vector`` and ``vector_norm``.

    Returns:
        The cosine similarity, or ``0.0`` when either norm is zero.
    """
    norm_product = self.vector_norm * other.vector_norm
    # A zero norm would make the division produce NaN (with a numpy
    # RuntimeWarning); report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
property vector:
    """Sum of the non-stop tokens' vectors, divided by ``len(self)``.

    Stop words are left out of the sum but are still counted in the
    divisor. When the span has tokens but all of them are stop words, a
    zero vector shaped like a token vector is returned, so the result is
    always an array rather than the scalar ``0.0``.

    Raises:
        ZeroDivisionError: if the span contains no tokens.
    """
    def __get__(self):
        content_vectors = [t.vector for t in self if not t.is_stop]
        if not content_vectors and len(self) > 0:
            # sum() of an empty sequence is the int 0, which would turn the
            # result into a scalar; build a zero vector from a token instead.
            return numpy.zeros_like(self[0].vector)
        return sum(content_vectors) / len(self)
property vector_norm:
    """``numpy.linalg.norm`` of ``self.vector``, recomputed on every access."""
    def __get__(self):
        averaged = self.vector
        return numpy.linalg.norm(averaged)
property text: property text:
def __get__(self): def __get__(self):
return u' '.join([t.text for t in self]) return u' '.join([t.text for t in self])

View File

@ -49,6 +49,9 @@ cdef class Token:
def nbor(self, int i=1): def nbor(self, int i=1):
return self.doc[self.i+i] return self.doc[self.i+i]
def similarity(self, other):
    """Return the cosine similarity between this token's vector and ``other``'s.

    The result is ``dot(self.vector, other.vector)`` divided by the product
    of the two ``vector_norm`` values.

    Args:
        other: Any object exposing ``vector`` and ``vector_norm``.

    Returns:
        The cosine similarity, or ``0.0`` when either norm is zero.
    """
    norm_product = self.vector_norm * other.vector_norm
    # A zero norm would make the division produce NaN (with a numpy
    # RuntimeWarning); report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
property lex_id: property lex_id:
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
@ -125,12 +128,20 @@ cdef class Token:
def __get__(self): def __get__(self):
return self.c.dep return self.c.dep
property repvec: property vector:
def __get__(self): def __get__(self):
cdef int length = self.vocab.repvec_length cdef int length = self.vocab.repvec_length
repvec_view = <float[:length,]>self.c.lex.repvec repvec_view = <float[:length,]>self.c.lex.repvec
return numpy.asarray(repvec_view) return numpy.asarray(repvec_view)
property repvec:
    """Older name for ``vector``, kept for backwards compatibility."""
    def __get__(self):
        # Delegate so both names always return the same value.
        return self.vector
property vector_norm:
    """The norm value stored on this token's lexeme (``self.c.lex.l2_norm``).

    Read-only here; the value is returned as stored, not recomputed.
    """
    def __get__(self):
        stored_norm = self.c.lex.l2_norm
        return stored_norm
property n_lefts: property n_lefts:
def __get__(self): def __get__(self):
cdef int n = 0 cdef int n = 0
@ -302,6 +313,9 @@ cdef class Token:
property is_oov: property is_oov:
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
property is_stop:
    """Result of checking the ``IS_STOP`` flag on this token's lexeme."""
    def __get__(self):
        return Lexeme.check_flag(self.c.lex, IS_STOP)
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)