From ba4e5637015c0f744855610b8f3b25a756860c90 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 21 Sep 2015 18:03:08 +1000
Subject: [PATCH] * Ensure vectors are same length, and return vector length in load_vectors_bz2

---
 spacy/vocab.pyx | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 7d51c6c6d..d0df37317 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -270,16 +270,25 @@ cdef class Vocab:
     def load_vectors_bz2(self, loc):
         cdef LexemeC* lexeme
         cdef attr_t orth
+        cdef int32_t vec_len = -1
         with bz2.BZ2File(loc, 'r') as file_:
-            for line in file_:
+            for line_num, line in enumerate(file_):
                 pieces = line.split()
                 word_str = pieces.pop(0)
+                if vec_len == -1:
+                    vec_len = len(pieces)
+                elif vec_len != len(pieces):
+                    raise IOError(
+                        "Error loading word vectors: all vectors must be same "
+                        "length. Previous vector was length %d, vector on line %d "
+                        "was length %d." % (vec_len, line_num, len(pieces)))
                 orth = self.strings[word_str]
                 lexeme = self.get_by_orth(self.mem, orth)
                 lexeme.repvec = self.mem.alloc(len(pieces), sizeof(float))
 
                 for i, val_str in enumerate(pieces):
                     lexeme.repvec[i] = float(val_str)
+        return vec_len
 
     def load_vectors_bin(self, loc):
         cdef CFile file_ = CFile(loc, b'rb')