diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 66867a648..26acff407 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -43,6 +43,8 @@ cdef class Lexeme: return True if Lexeme.c_check_flag(self.c, flag_id) else False def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property has_vector: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 19cff3a90..536b3582b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -127,6 +127,8 @@ cdef class Doc: return u''.join([t.string for t in self]) def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property repvec: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 38b9ebcca..cca24cb5b 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -60,6 +60,8 @@ cdef class Span: self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) def similarity(self, other): + if self.vector_norm == 0.0 or other.vector_norm == 0.0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property vector: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 4f8effa4e..25db3f47e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -50,6 +50,8 @@ cdef class Token: return self.doc[self.i+i] def similarity(self, other): + if self.vector_norm == 0 or other.vector_norm == 0: + return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property lex_id: diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bb256b02e..e3ac67bf7 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,6 +1,5 @@ from __future__ import unicode_literals - from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset from libc.stdint cimport int32_t @@ -117,16 +116,14 @@ cdef class Vocab: cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef hash_t key - #cdef bint is_oov = mem is not self.mem - # TODO - is_oov = False - mem = self.mem + cdef bint is_oov = mem is not self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) lex.orth = self.strings[string] lex.length = len(string) lex.id = self.length + lex.repvec = mem.alloc(self.vectors_length, sizeof(float)) if self.get_lex_attr is not None: for attr, func in self.get_lex_attr.items(): value = func(string) @@ -283,7 +280,7 @@ cdef class Vocab: vec_len, len(pieces)) orth = self.strings[word_str] lexeme = self.get_by_orth(self.mem, orth) - lexeme.repvec = self.mem.alloc(len(pieces), sizeof(float)) + lexeme.repvec = self.mem.alloc(self.vectors_length, sizeof(float)) for i, val_str in enumerate(pieces): lexeme.repvec[i] = float(val_str)