From b4faf551f545c7ef47f73d0f9efaad8374fa0f65 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 26 Aug 2015 19:19:21 +0200
Subject: [PATCH] * Refactor language-independent tagger class

---
Review notes (free text between the "---" line and the first diff is not
part of the commit message and is skipped when the patch is applied):

* The line structure of this patch was restored from a copy whose newlines
  had been collapsed. Indentation, continuation-line alignment and the
  position of blank lines were reconstructed from the hunk line counts and
  from Cython syntax -- verify against the commit before applying. In
  particular, any text inside angle brackets (C casts) may have been lost
  in the collapsed copy, e.g. on the removed "self.tags = ..." line.

* Fix in Tagger.from_dir: the submitted line called json.loads() on an open
  file object. json.loads() expects a string, so loading templates.json
  would raise TypeError. It now calls json.load(), as Tagger.read_config
  already does. The line counts are unchanged, but the tagger.pyx
  post-image no longer corresponds to the blob id on the "index" line.

* Not changed here, suggested follow-ups:
  - read_config() and from_dir() open files without closing them.
  - The new __init__ no longer assigns self.strings, self.n_tags,
    self.lemmatizer or self._morph_cache, although tagger.pxd still
    declares them.
  - __call__ passes an int class id to morphology.assign_tag() while
    tag_from_strings passes a tag string; confirm assign_tag accepts both.
  - predict()/update() read tokens[i-2] .. tokens[i+2]; this presumably
    relies on the token array being padded at both ends -- confirm.
  - The class docstring still says "for English".

 spacy/tagger.pxd |  11 ++-
 spacy/tagger.pyx | 223 +++++++++++++++++++++++++++++++----------------
 2 files changed, 151 insertions(+), 83 deletions(-)

diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index 4aa9acc43..213781047 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -4,24 +4,23 @@ from cymem.cymem cimport Pool
 
 from ._ml cimport Model
 from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC, Morphology, PosTag
+from .structs cimport TokenC, LexemeC
 from .parts_of_speech cimport univ_pos_t
+from .vocab cimport Vocab
 
 
 cdef class Tagger:
     cdef readonly Pool mem
     cdef readonly StringStore strings
     cdef readonly Model model
+    cdef readonly Vocab vocab
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache
     cdef public dict freqs
 
-    cdef PosTag* tags
-    cdef readonly object tag_names
-    cdef readonly object tag_map
     cdef readonly int n_tags
 
     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
+    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
+    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index ccb40fd22..5d015b6cc 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -6,50 +6,129 @@ from thinc.typedefs cimport atom_t, weight_t
 
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
-from .morphology cimport set_morph_from_dict
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
 
+from .attrs cimport *
+from ._ml cimport arg_max
+
 
-cdef struct _CachedMorph:
-    Morphology morph
-    int lemma
+cpdef enum:
+    P2_orth
+    P2_cluster
+    P2_shape
+    P2_prefix
+    P2_suffix
+    P2_pos
+    P2_lemma
+    P2_flags
+
+    P1_orth
+    P1_cluster
+    P1_shape
+    P1_prefix
+    P1_suffix
+    P1_pos
+    P1_lemma
+    P1_flags
+
+    W_orth
+    W_cluster
+    W_shape
+    W_prefix
+    W_suffix
+    W_pos
+    W_lemma
+    W_flags
+
+    N1_orth
+    N1_cluster
+    N1_shape
+    N1_prefix
+    N1_suffix
+    N1_pos
+    N1_lemma
+    N1_flags
+
+    N2_orth
+    N2_cluster
+    N2_shape
+    N2_prefix
+    N2_suffix
+    N2_pos
+    N2_lemma
+    N2_flags
+
+    N_CONTEXT_FIELDS
 
 
 cdef class Tagger:
     """A part-of-speech tagger for English"""
+    @classmethod
+    def read_config(cls, data_dir):
+        return json.load(open(path.join(data_dir, 'pos', 'config.json')))
+
+    @classmethod
+    def default_templates(cls):
+        return (
+            (W_orth,),
+            (P1_lemma, P1_pos),
+            (P2_lemma, P2_pos),
+            (N1_orth,),
+            (N2_orth,),
+
+            (W_suffix,),
+            (W_prefix,),
+
+            (P1_pos,),
+            (P2_pos,),
+            (P1_pos, P2_pos),
+            (P1_pos, W_orth),
+            (P1_suffix,),
+            (N1_suffix,),
+
+            (W_shape,),
+            (W_cluster,),
+            (N1_cluster,),
+            (N2_cluster,),
+            (P1_cluster,),
+            (P2_cluster,),
+
+            (W_flags,),
+            (N1_flags,),
+            (N2_flags,),
+            (P1_flags,),
+            (P2_flags,),
+        )
+
     def make_lemmatizer(self):
         return None
 
-    def __init__(self, StringStore strings, data_dir):
+    def __init__(self, Vocab vocab, templates):
         self.mem = Pool()
-        model_dir = path.join(data_dir, 'pos')
-        self.strings = strings
-        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
-        self.tag_names = sorted(cfg['tag_names'])
-        assert self.tag_names
-        self.n_tags = len(self.tag_names)
-        self.tag_map = cfg['tag_map']
-        cdef int n_tags = len(self.tag_names) + 1
+        self.vocab = vocab
+
+        cdef int n_tags = self.vocab.morphology.n_tags + 1
 
-        self.model = Model(n_tags, cfg['templates'], model_dir)
-        self._morph_cache = PreshMapArray(n_tags)
-        self.tags = self.mem.alloc(n_tags, sizeof(PosTag))
-        for i, tag in enumerate(sorted(self.tag_names)):
-            pos, props = self.tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            set_morph_from_dict(&self.tags[i].morph, props)
-        if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
-            self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
-                                                 'morphs.json'))))
-        self.lemmatizer = self.make_lemmatizer(data_dir)
+        self.model = Model(n_tags, templates)
         self.freqs = {TAG: defaultdict(int)}
         for tag in self.tag_names:
-            self.freqs[TAG][self.strings[tag]] = 1
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
         self.freqs[TAG][0] = 1
 
+    @property
+    def tag_names(self):
+        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
+
+    @classmethod
+    def from_dir(cls, data_dir, vocab):
+        if path.exists(path.join(data_dir, 'templates.json')):
+            templates = json.load(open(path.join(data_dir, 'templates.json')))
+        else:
+            templates = cls.default_templates()
+        return cls(vocab, templates)
+
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
 
@@ -63,18 +142,14 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                tokens.data[i].tag = self.strings[self.tag_names[guess]]
-                self.set_morph(i, &self.tags[guess], tokens.data)
-
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            tokens.data[i].tag = self.strings[tag_strs[i]]
-            self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
-                           tokens.data)
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
@@ -88,57 +163,51 @@ cdef class Tagger:
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            tokens.data[i].tag = self.strings[self.tag_names[guess]]
-            self.set_morph(i, &self.tags[guess], tokens.data)
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
 
     cdef int predict(self, int i, const TokenC* tokens) except -1:
-        raise NotImplementedError
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        return arg_max(scores, self.model.n_classes)
 
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        raise NotImplementedError
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        guess = arg_max(scores, self.model.n_classes)
+        loss = guess != gold if gold != -1 else 0
+        self.model.update(context, guess, gold, loss)
+        return guess
 
-    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
-        tokens[i].pos = tag.pos
-        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-        if cached is NULL:
-            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-            cached.morph = tag.morph
-            self._morph_cache.set(tag.id, tokens[i].lex.orth, cached)
-        tokens[i].lemma = cached.lemma
-        tokens[i].morph = cached.morph
 
-    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.orth
-        cdef unicode py_string = self.strings[lex.orth]
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.orth
-        cdef set lemma_strings
-        cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string, pos)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings[lemma_string]
-        return lemma
-
-    def load_morph_exceptions(self, dict exc):
-        cdef unicode pos_str
-        cdef unicode form_str
-        cdef unicode lemma_str
-        cdef dict entries
-        cdef dict props
-        cdef int lemma
-        cdef attr_t orth
-        cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
-            for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
-                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-                cached.lemma = self.strings[lemma_str]
-                set_morph_from_dict(&cached.morph, props)
-                self._morph_cache.set(pos, orth, cached)
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.lower
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.tag
+    context[6] = t.lemma
+    if t.lex.flags & (1 << IS_ALPHA):
+        context[7] = 1
+    elif t.lex.flags & (1 << IS_PUNCT):
+        context[7] = 2
+    elif t.lex.flags & (1 << LIKE_URL):
+        context[7] = 3
+    elif t.lex.flags & (1 << LIKE_NUM):
+        context[7] = 4
+    else:
+        context[7] = 0