* Update for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags.

2014-10-23 03:20:02 +11:00 · 2014-10-23 03:20:02 +11:00 · 96b835a3d4
parent e5e951ae67
commit 96b835a3d4
1 changed file with 61 additions and 60 deletions
--- a/spacy/pos.pyx
+++ b/spacy/pos.pyx
@ -13,9 +13,8 @@ from thinc.features import NonZeroConjFeat
 from thinc.features import ConjFeat
 from .en import EN
-from .lexeme cimport LexStr_shape, LexStr_suff, LexStr_pre, LexStr_norm
+
-from .lexeme cimport LexDist_upper, LexDist_title
+from .lexeme cimport *
 from .lexeme cimport LexDist_upper, LexInt_cluster, LexInt_id
 NULL_TAG = 0
@ -37,7 +36,9 @@ cdef class Tagger:
                self.model.load(file_)
    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
-        get_atoms(self._atoms, i, tokens, prev, prev_prev)
+        assert i >= 0
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
        assert self._feats[self.extractor.n] == 0
        self._guess = self.model.score(self._scores, self._feats, self._values)
@ -62,76 +63,77 @@ cdef class Tagger:
 cpdef enum:
    P2i
    P1i
    N0i
    N1i
    N2i
    P2c
    P1c
    N0c
    N1c
    N2c
    P2shape
    P1shape
    N0shape
    N1shape
    N2shape
    P2suff
    P1suff
    N0suff
    N1suff
    N2suff
    P2pref
    P1pref
    N0pref
    N1pref
    N2pref
    P2w
-    P1w
+    P2shape
-    N0w
+    P2pref
-    N1w
+    P2suff
    N2w
    P2oft_title
    P1oft_title
    N0oft_title
    N1oft_title
    N2oft_title
    P2oft_upper
    P1i
    P1c
    P1w
    P1shape
    P1pre
    P1suff
    P1oft_title
    P1oft_upper
    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0oft_title
    N0oft_upper
    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1oft_title
    N1oft_upper
    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2oft_title
    N2oft_upper
    P1t
    P2t
    P1t
    CONTEXT_SIZE
-cdef int get_atoms(atom_t* context, int i, Tokens tokens, class_t prev_tag,
+cdef int get_atoms(atom_t* atoms, LexemeC* p2, LexemeC* p1, LexemeC* n0, LexemeC* n1,
-                   class_t prev_prev_tag) except -1:
+                   LexemeC* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
-    cdef int j
+    _fill_token(&atoms[P2i], p2)
-    for j in range(CONTEXT_SIZE):
+    _fill_token(&atoms[P1i], p1)
-        context[j] = 0
+    _fill_token(&atoms[N0i], n0)
-    cdef int* indices = [i-2, i-1, i, i+1, i+2]
+    _fill_token(&atoms[N1i], n1)
    _fill_token(&atoms[N2i], n2)
    atoms[P1t] = prev_tag
    atoms[P2t] = prev_prev_tag
    cdef int* int_feats = [<int>LexInt_id, <int>LexInt_cluster]
    cdef int* string_feats = [<int>LexStr_shape, <int>LexStr_suff, <int>LexStr_pre,
                              <int>LexStr_norm]
    cdef int* bool_feats = [<int>LexDist_title, <int>LexDist_upper]
-    cdef int c = 0
+cdef inline void _fill_token(atom_t* atoms, LexemeC* lex) nogil:
-    c = tokens.int_array(context, c, indices, 5, int_feats, 2)
+    atoms[0] = lex.ints[<int>LexInt_id]
-    c = tokens.string_array(context, c, indices, 5, string_feats, 4)
+    atoms[1] = lex.ints[<int>LexInt_cluster]
-    c = tokens.bool_array(context, c, indices, 5, bool_feats, 2)
+    atoms[2] = <atom_t>lex.strings[<int>LexStr_norm]
-    context[P1t] = prev_tag
+    atoms[3] = <atom_t>lex.strings[<int>LexStr_shape]
-    context[P2t] = prev_prev_tag
+    atoms[4] = <atom_t>lex.strings[<int>LexStr_pre]
    atoms[5] = <atom_t>lex.strings[<int>LexStr_suff]
    atoms[6] = lex.dist_flags & (1 << LexDist_title)
    atoms[7] = lex.dist_flags & (1 << LexDist_upper)
 TEMPLATES = (
@ -159,4 +161,3 @@ TEMPLATES = (
    (N0oft_upper,),
    (N0oft_title,),
 )