From 65dc0d1dfb4ed6634de61242ecb40343778bfe48 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 14 Sep 2015 17:49:58 +1000 Subject: [PATCH] * Extend word vectors support, with .similarity() function, vector_norm property, and rename repvec to vector. Keep repvec name as well for now for backwards compatibility. --- spacy/lexeme.pyx | 27 +++++++++++++++++++++++++++ spacy/tokens/doc.pyx | 18 ++++++++++++++++++ spacy/tokens/spans.pyx | 14 ++++++++++++++ spacy/tokens/token.pyx | 16 +++++++++++++++- 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index e0fa854cb..44c31f834 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -3,6 +3,13 @@ from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 +# Compiler crashes on memory view coercion without this. Should report bug. +from cython.view cimport array as cvarray +cimport numpy as np +np.import_array() + + + from libc.string cimport memset from .orth cimport word_shape @@ -35,6 +42,26 @@ cdef class Lexeme: def py_check_flag(self, attr_id_t flag_id): return True if Lexeme.check_flag(self.c, flag_id) else False + def similarity(self, other): + return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + + property vector_norm: + def __get__(self): + return self.c.l2_norm + + def __set__(self, float value): + self.c.l2_norm = value + + property vector: + def __get__(self): + cdef int length = self.vocab.repvec_length + repvec_view = self.c.repvec + return numpy.asarray(repvec_view) + + property repvec: + def __get__(self): + return self.vector + property orth_: def __get__(self): return self.vocab.strings[self.c.orth] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f9552b6eb..6878793ab 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -3,7 +3,9 @@ from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t import numpy +import numpy.linalg import struct +cimport numpy as np from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME @@ -118,6 +120,22 @@ cdef class Doc: def __str__(self): return u''.join([t.string for t in self]) + def similarity(self, other): + return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + + property repvec: + def __get__(self): + return self.vector + + property vector: + def __get__(self): + return sum(t.vector for t in self if not t.is_stop) / len(self) + + + property vector_norm: + def __get__(self): + return numpy.linalg.norm(self.vector) + @property def string(self): return u''.join([t.string for t in self]) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index d9e4fbf0e..12ad6e425 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,5 +1,8 @@ from __future__ import unicode_literals from collections import defaultdict +import numpy +import numpy.linalg +cimport numpy as np from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t @@ -52,6 +55,17 @@ cdef class Span: def merge(self, unicode tag, unicode lemma, unicode ent_type): self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) + def similarity(self, other): + return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + + property vector: + def __get__(self): + return sum(t.vector for t in self if not t.is_stop) / len(self) + + property vector_norm: + def __get__(self): + return numpy.linalg.norm(self.vector) + property text: def __get__(self): return u' '.join([t.text for t in self]) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index e3e78838f..5b5d84887 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -49,6 +49,9 @@ cdef class Token: def nbor(self, int i=1): return self.doc[self.i+i] + def similarity(self, other): + return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + property lex_id: def __get__(self): return self.c.lex.id @@ -125,12 +128,20 @@ cdef class Token: def __get__(self): return self.c.dep - property repvec: + property vector: def __get__(self): cdef int length = self.vocab.repvec_length repvec_view = self.c.lex.repvec return numpy.asarray(repvec_view) + property repvec: + def __get__(self): + return self.vector + + property vector_norm: + def __get__(self): + return self.c.lex.l2_norm + property n_lefts: def __get__(self): cdef int n = 0 @@ -302,6 +313,9 @@ cdef class Token: property is_oov: def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) + property is_stop: + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_STOP) + property is_alpha: def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)