From 567388e38dd8491e6743929a592c7628bbcf63bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 15 Mar 2015 17:01:58 -0400 Subject: [PATCH] * Use values encoded by StringStore in POS tagging, rather than indices into a list of tags --- spacy/en/pos.pxd | 2 +- spacy/en/pos.pyx | 21 +++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 7ec88a7d5..22d65cde2 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -20,6 +20,6 @@ cdef class EnPosTagger: cdef readonly object tag_map cdef readonly int n_tags - cdef int set_morph(self, const int i, TokenC* tokens) except -1 + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 544f8de4b..c5d7e126d 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -275,21 +275,19 @@ cdef class EnPosTagger: if tokens.data[i].pos == 0: fill_context(context, i, tokens.data) scores = self.model.score(context) - tokens.data[i].tag = arg_max(scores, self.model.n_classes) - self.set_morph(i, tokens.data) + guess = arg_max(scores, self.model.n_classes) + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) - # TODO: Clean this up. - tokens._tag_strings = tuple(self.tag_names) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Tokens tokens, object tag_strs): cdef int i for i in range(tokens.length): - tokens.data[i].tag = self.tag_names.index(tag_strs[i]) - self.set_morph(i, tokens.data) - # TODO: Clean this up. - tokens._tag_strings = tuple(self.tag_names) + tokens.data[i].tag = self.strings[tag_strs[i]] + self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], + tokens.data) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -307,13 +305,12 @@ cdef class EnPosTagger: guess = arg_max(scores, self.model.n_classes) loss = guess != golds[i] if golds[i] != -1 else 0 self.model.update(context, guess, golds[i], loss) - tokens.data[i].tag = guess - self.set_morph(i, tokens.data) + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) correct += loss == 0 return correct - cdef int set_morph(self, const int i, TokenC* tokens) except -1: - cdef const PosTag* tag = &self.tags[tokens[i].tag] + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: tokens[i].pos = tag.pos cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) if cached is NULL: