mirror of https://github.com/explosion/spaCy.git
* Extend word-vector support: add a .similarity() method and a vector_norm property, and rename repvec to vector. The repvec name is kept for now for backwards compatibility.
This commit is contained in:
parent
e13e47e9e5
commit
65dc0d1dfb
|
@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
|
from cython.view cimport array as cvarray
|
||||||
|
cimport numpy as np
|
||||||
|
np.import_array()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
|
@ -35,6 +42,26 @@ cdef class Lexeme:
|
||||||
def py_check_flag(self, attr_id_t flag_id):
|
def py_check_flag(self, attr_id_t flag_id):
|
||||||
return True if Lexeme.check_flag(self.c, flag_id) else False
|
return True if Lexeme.check_flag(self.c, flag_id) else False
|
||||||
|
|
||||||
|
def similarity(self, other):
    """Return the cosine similarity between this object and `other`.

    Both objects must expose a `vector` attribute and a `vector_norm`
    attribute. The result is dot(self.vector, other.vector) divided by
    the product of the two norms.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector, and dividing by zero would otherwise yield NaN.
    """
    norm_product = self.vector_norm * other.vector_norm
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
|
||||||
|
|
||||||
|
property vector_norm:
    """The `l2_norm` value stored on the underlying lexeme struct."""

    def __get__(self):
        # Read straight from the C struct.
        return self.c.l2_norm

    def __set__(self, float value):
        # Overwrite the stored norm; nothing is recomputed here.
        self.c.l2_norm = value
|
||||||
|
|
||||||
|
property vector:
    """The lexeme's vector, exposed as a numpy array.

    The array is built from a memoryview over `self.c.repvec`, whose
    length is taken from `self.vocab.repvec_length`.
    """

    def __get__(self):
        cdef int n_dims = self.vocab.repvec_length
        # View the C float pointer as a 1-d memoryview, then hand it to numpy.
        vector_view = <float[:n_dims,]>self.c.repvec
        return numpy.asarray(vector_view)
|
||||||
|
|
||||||
|
property repvec:
    """Alias for `vector`, kept under the old name for backwards compatibility."""

    def __get__(self):
        vector = self.vector
        return vector
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.orth]
|
return self.vocab.strings[self.c.orth]
|
||||||
|
|
|
@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import numpy.linalg
|
||||||
import struct
|
import struct
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
|
@ -118,6 +120,22 @@ cdef class Doc:
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return u''.join([t.string for t in self])
|
return u''.join([t.string for t in self])
|
||||||
|
|
||||||
|
def similarity(self, other):
    """Return the cosine similarity between this document and `other`.

    `other` may be any object exposing `vector` and `vector_norm`.
    The result is dot(self.vector, other.vector) divided by the product
    of the two norms.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector, and dividing by zero would otherwise yield NaN.
    """
    norm_product = self.vector_norm * other.vector_norm
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
|
||||||
|
|
||||||
|
property repvec:
    """Alias for `vector`, kept under the old name for backwards compatibility."""

    def __get__(self):
        vector = self.vector
        return vector
|
||||||
|
|
||||||
|
property vector:
    """A single vector for the document, built from its tokens' vectors.

    Sums the vectors of all tokens that are not stop words and divides
    by the total number of tokens.

    An empty document yields a zero-length array instead of raising
    ZeroDivisionError. A document made up only of stop words yields a
    zero vector shaped like a token vector instead of a bare scalar.
    """

    def __get__(self):
        tokens = list(self)
        if not tokens:
            # No token to take a dimensionality from.
            # NOTE(review): float32 is assumed to match the token vectors' dtype; confirm.
            return numpy.zeros((0,), dtype=numpy.float32)
        content_vectors = [t.vector for t in tokens if not t.is_stop]
        if not content_vectors:
            return numpy.zeros_like(tokens[0].vector)
        # NOTE(review): stop words are left out of the sum but still counted
        # in the denominator; kept as-is, confirm this is intended.
        return sum(content_vectors) / len(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
property vector_norm:
    """The norm of `self.vector`, as computed by `numpy.linalg.norm`."""

    def __get__(self):
        doc_vector = self.vector
        return numpy.linalg.norm(doc_vector)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def string(self):
|
def string(self):
|
||||||
return u''.join([t.string for t in self])
|
return u''.join([t.string for t in self])
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
import numpy
|
||||||
|
import numpy.linalg
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
from ..structs cimport TokenC, LexemeC
|
from ..structs cimport TokenC, LexemeC
|
||||||
from ..typedefs cimport flags_t, attr_t
|
from ..typedefs cimport flags_t, attr_t
|
||||||
|
@ -52,6 +55,17 @@ cdef class Span:
|
||||||
def merge(self, unicode tag, unicode lemma, unicode ent_type):
|
def merge(self, unicode tag, unicode lemma, unicode ent_type):
|
||||||
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
|
self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type)
|
||||||
|
|
||||||
|
def similarity(self, other):
    """Return the cosine similarity between this span and `other`.

    `other` may be any object exposing `vector` and `vector_norm`.
    The result is dot(self.vector, other.vector) divided by the product
    of the two norms.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector, and dividing by zero would otherwise yield NaN.
    """
    norm_product = self.vector_norm * other.vector_norm
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
|
||||||
|
|
||||||
|
property vector:
    """A single vector for the span, built from its tokens' vectors.

    Sums the vectors of all tokens that are not stop words and divides
    by the total number of tokens in the span.

    An empty span yields a zero-length array instead of raising
    ZeroDivisionError. A span made up only of stop words yields a zero
    vector shaped like a token vector instead of a bare scalar.
    """

    def __get__(self):
        tokens = list(self)
        if not tokens:
            # No token to take a dimensionality from.
            # NOTE(review): float32 is assumed to match the token vectors' dtype; confirm.
            return numpy.zeros((0,), dtype=numpy.float32)
        content_vectors = [t.vector for t in tokens if not t.is_stop]
        if not content_vectors:
            return numpy.zeros_like(tokens[0].vector)
        # NOTE(review): stop words are left out of the sum but still counted
        # in the denominator; kept as-is, confirm this is intended.
        return sum(content_vectors) / len(tokens)
|
||||||
|
|
||||||
|
property vector_norm:
    """The norm of `self.vector`, as computed by `numpy.linalg.norm`."""

    def __get__(self):
        span_vector = self.vector
        return numpy.linalg.norm(span_vector)
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return u' '.join([t.text for t in self])
|
return u' '.join([t.text for t in self])
|
||||||
|
|
|
@ -49,6 +49,9 @@ cdef class Token:
|
||||||
def nbor(self, int i=1):
|
def nbor(self, int i=1):
|
||||||
return self.doc[self.i+i]
|
return self.doc[self.i+i]
|
||||||
|
|
||||||
|
def similarity(self, other):
    """Return the cosine similarity between this token and `other`.

    `other` may be any object exposing `vector` and `vector_norm`.
    The result is dot(self.vector, other.vector) divided by the product
    of the two norms.

    Returns 0.0 when either norm is zero: the cosine is undefined for a
    zero vector, and dividing by zero would otherwise yield NaN.
    """
    norm_product = self.vector_norm * other.vector_norm
    if norm_product == 0:
        return 0.0
    return numpy.dot(self.vector, other.vector) / norm_product
|
||||||
|
|
||||||
property lex_id:
|
property lex_id:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.id
|
return self.c.lex.id
|
||||||
|
@ -125,12 +128,20 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.dep
|
return self.c.dep
|
||||||
|
|
||||||
property repvec:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int length = self.vocab.repvec_length
|
cdef int length = self.vocab.repvec_length
|
||||||
repvec_view = <float[:length,]>self.c.lex.repvec
|
repvec_view = <float[:length,]>self.c.lex.repvec
|
||||||
return numpy.asarray(repvec_view)
|
return numpy.asarray(repvec_view)
|
||||||
|
|
||||||
|
property repvec:
    """Alias for `vector`, kept under the old name for backwards compatibility."""

    def __get__(self):
        vector = self.vector
        return vector
|
||||||
|
|
||||||
|
property vector_norm:
    """The `l2_norm` value stored on this token's lexeme struct (read-only)."""

    def __get__(self):
        # The norm lives on the lexeme, so it is looked up rather than computed.
        return self.c.lex.l2_norm
|
||||||
|
|
||||||
property n_lefts:
|
property n_lefts:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
|
@ -302,6 +313,9 @@ cdef class Token:
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
|
||||||
|
|
||||||
|
property is_stop:
    """Whether the IS_STOP flag is set on this token's lexeme."""

    def __get__(self):
        return Lexeme.check_flag(self.c.lex, IS_STOP)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue