From 90da3a695df03e247263a8b2d8d45229891e176d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 10 Sep 2015 14:49:10 +0200 Subject: [PATCH 1/3] * Load lemmatizer from disk in Vocab.from_dir --- spacy/vocab.pyx | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index de0557c95..5307f0fe8 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -38,19 +38,6 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): - if tag_map is None: - tag_map = {} - self.mem = Pool() - self._by_hash = PreshMap() - self._by_orth = PreshMap() - self.strings = StringStore() - self.get_lex_attr = get_lex_attr - self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})) - - self.length = 1 - self._serializer = None - @classmethod def from_dir(cls, data_dir, get_lex_attr=None, vectors=None): if not path.exists(data_dir): @@ -59,13 +46,31 @@ cdef class Vocab: raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..')) + + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map, + lemmatizer=lemmatizer) self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) return self + def __init__(self, get_lex_attr=None, tag_map=None, vectors=None, lemmatizer=None): + if tag_map is None: + tag_map = {} + if lemmatizer is None: + lemmatizer = Lemmatizer({}, {}, {}) + self.mem = Pool() + self._by_hash = PreshMap() + self._by_orth = PreshMap() + self.strings = StringStore() + self.get_lex_attr = get_lex_attr + self.morphology = Morphology(self.strings, tag_map, lemmatizer) + + self.length = 1 + self._serializer = None + property serializer: def __get__(self): if self._serializer is None: @@ -199,7 +204,7 @@ cdef class Vocab: lexeme = addr fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1) - fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1) + fp.write_from(&lexeme.id, sizeof(lexeme.id), 1) fp.write_from(&lexeme.length, sizeof(lexeme.length), 1) fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1) From b9e31dc24532564a8ee7e789daf2dc8050819a5b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 10 Sep 2015 14:50:44 +0200 Subject: [PATCH 2/3] * Bug fix to gazetteer.json --- lang_data/en/gazetteer.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index dce2e1f2a..0f02e22d1 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -13,10 +13,6 @@ [ {"orth": "9/11"} ], - [ - {"lower": "septmber"}, - {"lower": "eleven"} - ], [ {"lower": "september"}, {"orth": "11"} From c3f773cd63fec194193e52d1d13cf764aee74a89 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 10 Sep 2015 14:51:05 +0200 Subject: [PATCH 3/3] * Fix Lexeme.check_flag --- spacy/lexeme.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 63280155c..75cac871c 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,7 +72,8 @@ cdef class Lexeme: @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) + cdef flags_t one = 1 + return lexeme.flags & (one << flag_id) @staticmethod cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: