* Update for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags.

This commit is contained in:
Matthew Honnibal 2014-10-23 03:20:02 +11:00
parent e5e951ae67
commit 96b835a3d4
1 changed file with 61 additions and 60 deletions

View File

@ -13,9 +13,8 @@ from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat from thinc.features import ConjFeat
from .en import EN from .en import EN
from .lexeme cimport LexStr_shape, LexStr_suff, LexStr_pre, LexStr_norm
from .lexeme cimport LexDist_upper, LexDist_title from .lexeme cimport *
from .lexeme cimport LexDist_upper, LexInt_cluster, LexInt_id
NULL_TAG = 0 NULL_TAG = 0
@ -37,7 +36,9 @@ cdef class Tagger:
self.model.load(file_) self.model.load(file_)
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0: cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
get_atoms(self._atoms, i, tokens, prev, prev_prev) assert i >= 0
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
self.extractor.extract(self._feats, self._values, self._atoms, NULL) self.extractor.extract(self._feats, self._values, self._atoms, NULL)
assert self._feats[self.extractor.n] == 0 assert self._feats[self.extractor.n] == 0
self._guess = self.model.score(self._scores, self._feats, self._values) self._guess = self.model.score(self._scores, self._feats, self._values)
@ -62,76 +63,77 @@ cdef class Tagger:
cpdef enum: cpdef enum:
P2i P2i
P1i
N0i
N1i
N2i
P2c P2c
P1c
N0c
N1c
N2c
P2shape
P1shape
N0shape
N1shape
N2shape
P2suff
P1suff
N0suff
N1suff
N2suff
P2pref
P1pref
N0pref
N1pref
N2pref
P2w P2w
P1w P2shape
N0w P2pref
N1w P2suff
N2w
P2oft_title P2oft_title
P1oft_title
N0oft_title
N1oft_title
N2oft_title
P2oft_upper P2oft_upper
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1oft_title
P1oft_upper P1oft_upper
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0oft_title
N0oft_upper N0oft_upper
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1oft_title
N1oft_upper N1oft_upper
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2oft_title
N2oft_upper N2oft_upper
P1t
P2t P2t
P1t
CONTEXT_SIZE CONTEXT_SIZE
cdef int get_atoms(atom_t* context, int i, Tokens tokens, class_t prev_tag, cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
class_t prev_prev_tag) except -1: LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
cdef int j _fill_token(&atoms[P2i], p2)
for j in range(CONTEXT_SIZE): _fill_token(&atoms[P1i], p1)
context[j] = 0 _fill_token(&atoms[N0i], n0)
cdef int* indices = [i-2, i-1, i, i+1, i+2] _fill_token(&atoms[N1i], n1)
_fill_token(&atoms[N2i], n2)
atoms[P1t] = prev_tag
atoms[P2t] = prev_prev_tag
cdef int* int_feats = [<int>LexInt_id, <int>LexInt_cluster]
cdef int* string_feats = [<int>LexStr_shape, <int>LexStr_suff, <int>LexStr_pre,
<int>LexStr_norm]
cdef int* bool_feats = [<int>LexDist_title, <int>LexDist_upper]
cdef int c = 0 cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
c = tokens.int_array(context, c, indices, 5, int_feats, 2) atoms[0] = lex.ints[<int>LexInt_id]
c = tokens.string_array(context, c, indices, 5, string_feats, 4) atoms[1] = lex.ints[<int>LexInt_cluster]
c = tokens.bool_array(context, c, indices, 5, bool_feats, 2) atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
context[P1t] = prev_tag atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
context[P2t] = prev_prev_tag atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
atoms[6] = lex.dist_flags & (1 << LexDist_title)
atoms[7] = lex.dist_flags & (1 << LexDist_upper)
TEMPLATES = ( TEMPLATES = (
@ -159,4 +161,3 @@ TEMPLATES = (
(N0oft_upper,), (N0oft_upper,),
(N0oft_title,), (N0oft_title,),
) )