spaCy/spacy/pos.pyx

# cython: profile=True
from os import path
import os
import shutil
import ujson
import random
import codecs
import gzip
import cython

from libc.stdint cimport uint32_t


from thinc.weights cimport arg_max
from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat

from .lexeme cimport *
from .lang cimport Lexicon


NULL_TAG = 0


cdef class Tagger:
    tags = {'NULL': NULL_TAG}
    def __init__(self, model_dir):
        self.mem = Pool()
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        self.model = LinearModel(len(self.tags), self.extractor.n)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG

    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
        self._guess = self.model.score(self._scores, self._feats, self._values)
        return self._guess

    cpdef bint tell_answer(self, class_t gold) except *:
        cdef class_t guess = self._guess
        if gold == guess or gold == NULL_TAG:
            self.model.update({})
            return 0
        counts = {guess: {}, gold: {}}
        self.extractor.count(counts[gold], self._feats, 1)
        self.extractor.count(counts[guess], self._feats, -1)
        self.model.update(counts)

    @classmethod
    def encode_pos(cls, tag):
        if tag not in cls.tags:
            cls.tags[tag] = len(cls.tags)
        return cls.tags[tag]


@cython.boundscheck(False)
def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
    cdef class_t prev_prev, prev, tag
    prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
    cdef int i
    cdef id_t token
    for i in range(tokens.length):
        tag = tagger.predict(i, tokens, prev, prev_prev)
        prev_prev = prev
        prev = tag
        token = tokens.lex[i].id
        if token < tag_counts.shape[0]:
            tag_counts[token, tag] += 1


cpdef enum:
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2oft_title
    P2oft_upper
    P2pos
    P2url
    P2num

    P1i
    P1c
    P1w
    P1shape
    P1pre
    P1suff
    P1oft_title
    P1oft_upper
    P1pos
    P1url
    P1num

    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0oft_title
    N0oft_upper
    N0pos
    N0url
    N0num

    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1oft_title
    N1oft_upper
    N1pos
    N1url
    N1num

    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2oft_title
    N2oft_upper
    N2pos
    N2url
    N2num

    P2t
    P1t

    CONTEXT_SIZE


cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
    _fill_token(&atoms[P2i], p2)
    _fill_token(&atoms[P1i], p1)
    _fill_token(&atoms[N0i], n0)
    _fill_token(&atoms[N1i], n1)
    _fill_token(&atoms[N2i], n2)
    atoms[P1t] = prev_tag
    atoms[P2t] = prev_prev_tag


cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.sic
    atoms[1] = lex.cluster
    atoms[2] = lex.norm
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix

    atoms[6] = lex.flags & (1 << OFT_TITLE)
    atoms[7] = lex.flags & (1 << OFT_UPPER)
    atoms[8] = lex.postype
    atoms[9] = lex.flags & (1 << LIKE_URL)
    atoms[10] = lex.flags & (1 << LIKE_NUMBER)


TEMPLATES = (
    (N0i,),
    (N0w,),
    (N0suff,),
    (N0pref,),
    (P1t,),
    (P2t,),
    (P1t, P2t),
    (P1t, N0w),
    (P1w,),
    (P1suff,),
    (P2w,),
    (N1w,),
    (N1suff,),
    (N2w,),

    (N0shape,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c,),
    (P2c,),
    (N0oft_upper,),
    (N0oft_title,),

    (P1t, N1w),
    (P1t, P2t, N1w),
    (P1w, P2w, N1w),
    (P2w, N1w, N2w),

    (N0pos,),
    (N0w, N1pos),
    (N0w, N1pos, N2pos),
    (P1t, N0pos),
    (P2t, P1t, N0pos),

    (P2url,),
    (P1url,),
    (N0url,),
    (N1url,),
    (N2url,),

    (P2num,),
    (P1num,),
    (N0num,),
    (N1num,),
    (N2num,),
)
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`# cython: profile=True`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`from os import path`
			`import os`
			`import shutil`
			`import ujson`
			`import random`
			`import codecs`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`import gzip`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`import cython`

			`from libc.stdint cimport uint32_t`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00

			`from thinc.weights cimport arg_max`
			`from thinc.features import NonZeroConjFeat`
			`from thinc.features import ConjFeat`

* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`from .lexeme cimport *`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`from .lang cimport Lexicon`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00

			`NULL_TAG = 0`


			`cdef class Tagger:`
			`tags = {'NULL': NULL_TAG}`
			`def __init__(self, model_dir):`
			`self.mem = Pool()`
* Fix POS tagger, so that it loads correctly. Lexemes are being read in. 2014-10-30 02:38:55 +00:00			`tags_loc = path.join(model_dir, 'postags.json')`
			`if path.exists(tags_loc):`
			`with open(tags_loc) as file_:`
			`Tagger.tags.update(ujson.load(file_))`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`self.model = LinearModel(len(self.tags), self.extractor.n)`
* Fix POS tagger, so that it loads correctly. Lexemes are being read in. 2014-10-30 02:38:55 +00:00			`if path.exists(path.join(model_dir, 'model')):`
			`self.model.load(path.join(model_dir, 'model'))`
			`self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))`
			`self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))`
			`self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))`
			`self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))`
			`self._guess = NULL_TAG`
* Whitespace 2014-11-02 03:22:32 +00:00
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],`
			`tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`self.extractor.extract(self._feats, self._values, self._atoms, NULL)`
			`self._guess = self.model.score(self._scores, self._feats, self._values)`
			`return self._guess`

			`cpdef bint tell_answer(self, class_t gold) except *:`
			`cdef class_t guess = self._guess`
			`if gold == guess or gold == NULL_TAG:`
			`self.model.update({})`
			`return 0`
			`counts = {guess: {}, gold: {}}`
			`self.extractor.count(counts[gold], self._feats, 1)`
			`self.extractor.count(counts[guess], self._feats, -1)`
			`self.model.update(counts)`

			`@classmethod`
			`def encode_pos(cls, tag):`
			`if tag not in cls.tags:`
			`cls.tags[tag] = len(cls.tags)`
			`return cls.tags[tag]`


* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`@cython.boundscheck(False)`
			`def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):`
			`cdef class_t prev_prev, prev, tag`
			`prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']`
			`cdef int i`
			`cdef id_t token`
			`for i in range(tokens.length):`
			`tag = tagger.predict(i, tokens, prev, prev_prev)`
			`prev_prev = prev`
			`prev = tag`
			`token = tokens.lex[i].id`
			`if token < tag_counts.shape[0]:`
			`tag_counts[token, tag] += 1`


* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`cpdef enum:`
			`P2i`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`P2c`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`P2w`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`P2shape`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`P2pref`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`P2suff`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`P2oft_title`
			`P2oft_upper`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`P2pos`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`P2url`
			`P2num`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00
			`P1i`
			`P1c`
			`P1w`
			`P1shape`
			`P1pre`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`P1suff`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`P1oft_title`
			`P1oft_upper`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`P1pos`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`P1url`
			`P1num`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`N0i`
			`N0c`
			`N0w`
			`N0shape`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`N0pref`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`N0suff`
			`N0oft_title`
			`N0oft_upper`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`N0pos`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`N0url`
			`N0num`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`N1i`
			`N1c`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`N1w`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`N1shape`
			`N1pref`
			`N1suff`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`N1oft_title`
			`N1oft_upper`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`N1pos`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`N1url`
			`N1num`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00
			`N2i`
			`N2c`
			`N2w`
			`N2shape`
			`N2pref`
			`N2suff`
			`N2oft_title`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`N2oft_upper`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`N2pos`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`N2url`
			`N2num`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00
			`P2t`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`P1t`

* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`CONTEXT_SIZE`


* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,`
			`Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00			`_fill_token(&atoms[P2i], p2)`
			`_fill_token(&atoms[P1i], p1)`
			`_fill_token(&atoms[N0i], n0)`
			`_fill_token(&atoms[N1i], n1)`
			`_fill_token(&atoms[N2i], n2)`
			`atoms[P1t] = prev_tag`
			`atoms[P2t] = prev_prev_tag`
* Refactoring get_atoms, improving tokens API 2014-10-22 02:10:56 +00:00
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:`
* Remove lexemes vector from Lexicon, and the id and hash attributes from Lexeme 2014-10-30 04:21:38 +00:00			`atoms[0] = lex.sic`
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`atoms[1] = lex.cluster`
			`atoms[2] = lex.norm`
			`atoms[3] = lex.shape`
			`atoms[4] = lex.prefix`
			`atoms[5] = lex.suffix`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 16:20:02 +00:00
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`atoms[6] = lex.flags & (1 << OFT_TITLE)`
			`atoms[7] = lex.flags & (1 << OFT_UPPER)`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00			`atoms[8] = lex.postype`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`atoms[9] = lex.flags & (1 << LIKE_URL)`
			`atoms[10] = lex.flags & (1 << LIKE_NUMBER)`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00

			`TEMPLATES = (`
			`(N0i,),`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`(N0w,),`
			`(N0suff,),`
			`(N0pref,),`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`(P1t,),`
			`(P2t,),`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 01:57:06 +00:00			`(P1t, P2t),`
			`(P1t, N0w),`
			`(P1w,),`
			`(P1suff,),`
			`(P2w,),`
			`(N1w,),`
			`(N1suff,),`
			`(N2w,),`

			`(N0shape,),`
			`(N0c,),`
			`(N1c,),`
			`(N2c,),`
			`(P1c,),`
			`(P2c,),`
			`(N0oft_upper,),`
			`(N0oft_title,),`
* Add count_tags functionto pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web. 2014-10-31 06:42:04 +00:00
			`(P1t, N1w),`
			`(P1t, P2t, N1w),`
			`(P1w, P2w, N1w),`
			`(P2w, N1w, N2w),`

			`(N0pos,),`
			`(N0w, N1pos),`
			`(N0w, N1pos, N2pos),`
			`(P1t, N0pos),`
* Use LIKE_URL and LIKE_NUMBER flag features. Seems to improve accuracy on onto web 2014-11-02 02:19:54 +00:00			`(P2t, P1t, N0pos),`

			`(P2url,),`
			`(P1url,),`
			`(N0url,),`
			`(N1url,),`
			`(N2url,),`

			`(P2num,),`
			`(P1num,),`
			`(N0num,),`
			`(N1num,),`
			`(N2num,),`
* Add greedy pos tagger 2014-10-21 23:17:26 +00:00			`)`