From d6945bf880eff371fae0e02132ff30c691c2e644 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 17 Sep 2015 12:58:23 +1000
Subject: [PATCH] * Add way to load vectors from bz2 file to vocab

---
 spacy/vocab.pyx | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 6c7c4d321..7d51c6c6d 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -262,6 +262,43 @@ cdef class Vocab:
         fp.close()
 
     def load_vectors(self, loc):
+        """Load word vectors from loc, choosing the reader by file suffix.
+
+        Paths ending in 'bz2' are read by load_vectors_bz2; every other path
+        is passed to load_vectors_bin.
+        """
+        if loc.endswith('bz2'):
+            self.load_vectors_bz2(loc)
+        else:
+            self.load_vectors_bin(loc)
+
+    def load_vectors_bz2(self, loc):
+        """Load word vectors from a bz2-compressed text file.
+
+        Each non-blank line is split on whitespace: the first field is the
+        word and each remaining field is parsed with float() and stored in
+        the word's lexeme.repvec.
+        """
+        cdef LexemeC* lexeme
+        cdef attr_t orth
+        with bz2.BZ2File(loc, 'r') as file_:
+            for line in file_:
+                pieces = line.split()
+                # A blank line has no word field; skip it rather than let
+                # pieces.pop(0) raise IndexError.
+                if not pieces:
+                    continue
+                word_str = pieces.pop(0)
+                orth = self.strings[word_str]
+                lexeme = self.get_by_orth(self.mem, orth)
+                # Explicit cast: Cython does not convert void* to float*
+                # implicitly (assumes mem.alloc returns void* -- confirm).
+                lexeme.repvec = <float*>self.mem.alloc(len(pieces), sizeof(float))
+
+                for i, val_str in enumerate(pieces):
+                    lexeme.repvec[i] = float(val_str)
+
+    def load_vectors_bin(self, loc):
         cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len