From 90da3a695df03e247263a8b2d8d45229891e176d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 10 Sep 2015 14:49:10 +0200
Subject: [PATCH] * Load lemmatizer from disk in Vocab.from_dir

---
 spacy/vocab.pyx | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index de0557c95..5307f0fe8 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -38,19 +38,6 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
-        if tag_map is None:
-            tag_map = {}
-        self.mem = Pool()
-        self._by_hash = PreshMap()
-        self._by_orth = PreshMap()
-        self.strings = StringStore()
-        self.get_lex_attr = get_lex_attr
-        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
-
-        self.length = 1
-        self._serializer = None
-
     @classmethod
     def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
         if not path.exists(data_dir):
@@ -59,13 +46,31 @@ cdef class Vocab:
             raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
 
         tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
+        lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..'))
+
+        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map,
+                              lemmatizer=lemmatizer)
 
         self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
         if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
             self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
         return self
 
+    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None, lemmatizer=None):
+        if tag_map is None:
+            tag_map = {}
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer({}, {}, {})
+        self.mem = Pool()
+        self._by_hash = PreshMap()
+        self._by_orth = PreshMap()
+        self.strings = StringStore()
+        self.get_lex_attr = get_lex_attr
+        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
+
+        self.length = 1
+        self._serializer = None
+
     property serializer:
         def __get__(self):
             if self._serializer is None:
@@ -199,7 +204,7 @@ cdef class Vocab:
             lexeme = <LexemeC*>addr
             fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1)
-            fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1)
+            fp.write_from(&lexeme.id, sizeof(lexeme.id), 1)
             fp.write_from(&lexeme.length, sizeof(lexeme.length), 1)
             fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1)
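
A minimal usage sketch of the change, not part of the patch itself: Vocab.from_dir now reads the lemmatizer data from one directory above the vocab directory (via Lemmatizer.from_dir) and threads it through the new lemmatizer keyword on __init__ into Morphology, while constructing a Vocab directly still falls back to an empty Lemmatizer. The data path below is an assumed example layout, not something the patch defines.

    # Sketch under assumed paths; only calls shown in the patch are used.
    from spacy.vocab import Vocab

    data_dir = 'data/en/vocab'   # hypothetical vocab directory
    # from_dir() loads tag_map.json, strings.txt and lexemes.bin from data_dir,
    # and now also Lemmatizer.from_dir(path.join(data_dir, '..')).
    vocab = Vocab.from_dir(data_dir)

    # Direct construction without a lemmatizer keeps working: __init__ falls
    # back to an empty Lemmatizer({}, {}, {}) when lemmatizer is None.
    empty_vocab = Vocab(tag_map={})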