# cython: profile=True
from os import path
import os
import shutil
import ujson
import random
import codecs
import gzip

import cython

from libc.stdint cimport uint32_t

from thinc.weights cimport arg_max
from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat

from .lexeme cimport *
from .lang cimport Lexicon
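
# NOTE (assumption): Pool, Extractor, LinearModel, Tokens and the typedefs
# atom_t, feat_t, weight_t, class_t and id_t used below are expected to be
# cimported in the matching .pxd file; they are not imported here directly.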


NULL_TAG = 0
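

# Greedy linear-model POS tagger.  Each token is scored with features built
# from a five-token window plus the two previously predicted tags, and the
# highest-scoring tag is taken (no search over tag sequences).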
cdef class Tagger:
    # Tag-name to tag-id mapping, shared across instances as a class attribute.
    tags = {'NULL': NULL_TAG}

    def __init__(self, model_dir):
        self.mem = Pool()
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        # The extractor must be built before the model, which needs extractor.n.
        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
        self.model = LinearModel(len(self.tags), self.extractor.n)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        # Pre-allocated working buffers, reused across calls to predict().
        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG
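
    # Score token i given the two previously predicted tags and return the
    # argmax tag id.  The guess is cached so tell_answer() can update against it.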
    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
        self._guess = self.model.score(self._scores, self._feats, self._values)
        return self._guess
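
    # Perceptron-style update: if the last guess was wrong, add weight to the
    # gold tag's features and subtract it from the guessed tag's features.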
    cpdef bint tell_answer(self, class_t gold) except *:
        cdef class_t guess = self._guess
        if gold == guess or gold == NULL_TAG:
            self.model.update({})
            return 0
        counts = {guess: {}, gold: {}}
        self.extractor.count(counts[gold], self._feats, 1)
        self.extractor.count(counts[guess], self._feats, -1)
        self.model.update(counts)

    @classmethod
    def encode_pos(cls, tag):
        if tag not in cls.tags:
            cls.tags[tag] = len(cls.tags)
        return cls.tags[tag]
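

# Tag a whole Tokens sequence greedily and accumulate per-lexeme tag counts
# into tag_counts (rows indexed by lexeme id, columns by tag id); lexemes
# whose id falls outside the matrix are skipped.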
@cython.boundscheck(False)
def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
    cdef class_t prev_prev, prev, tag
    prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
    cdef int i
    cdef id_t token
    for i in range(tokens.length):
        tag = tagger.predict(i, tokens, prev, prev_prev)
        prev_prev = prev
        prev = tag
        token = tokens.lex[i].id
        if token < tag_counts.shape[0]:
            tag_counts[token, tag] += 1
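

# Slot layout of the atomic context array.  Each of the five window tokens
# (P2, P1, N0, N1, N2) gets 13 consecutive slots filled by _fill_token(), and
# the two trailing slots hold the previously predicted tags.  CONTEXT_SIZE is
# the total number of slots.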
cpdef enum:
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2title
    P2upper
    P2oft_title
    P2oft_upper
    P2pos
    P2url
    P2num

    P1i
    P1c
    P1w
    P1shape
    P1pre
    P1suff
    P1title
    P1upper
    P1oft_title
    P1oft_upper
    P1pos
    P1url
    P1num

    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0title
    N0upper
    N0oft_title
    N0oft_upper
    N0pos
    N0url
    N0num

    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1title
    N1upper
    N1oft_title
    N1oft_upper
    N1pos
    N1url
    N1num

    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2title
    N2upper
    N2oft_title
    N2oft_upper
    N2pos
    N2url
    N2num

    P2t
    P1t

    CONTEXT_SIZE
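

# Fill the atoms array for one prediction: one 13-slot block per window token,
# plus the two previously predicted tags.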
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
    _fill_token(&atoms[P2i], p2)
    _fill_token(&atoms[P1i], p1)
    _fill_token(&atoms[N0i], n0)
    _fill_token(&atoms[N1i], n1)
    _fill_token(&atoms[N2i], n2)
    atoms[P1t] = prev_tag
    atoms[P2t] = prev_prev_tag


cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.sic
    atoms[1] = lex.cluster
    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix

    atoms[6] = lex.flags & (1 << IS_TITLE)
    atoms[7] = lex.flags & (1 << IS_UPPER)
    atoms[8] = lex.flags & (1 << OFT_TITLE)
    atoms[9] = lex.flags & (1 << OFT_UPPER)
    atoms[10] = lex.postype
    atoms[11] = lex.flags & (1 << LIKE_URL)
    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
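

# Feature templates passed to the Extractor.  Each tuple of atom slots is
# treated as one conjoined feature (ConjFeat), e.g. (P1t, N0w) fires on the
# previous predicted tag combined with the current word.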
TEMPLATES = (
    (N0i,),
    (N0w,),
    (N0suff,),
    (N0pref,),
    (P1t,),
    (P2t,),
    (P1t, P2t),
    (P1t, N0w),
    (P1w,),
    (P1suff,),
    (P2w,),
    (N1w,),
    (N1suff,),
    (N2w,),

    (N0shape,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c,),
    (P2c,),
    (P1c, N0c),
    (N0c, N1c),
    (P1c, P1t),
    (P1c, P1t, N0c),
    (P1t, N0c),
    (N0oft_upper,),
    (N0oft_title,),

    (P1w, N0w),
    (N0w, N1w),

    (N0pos,),
    (P1t, N0pos, N1pos),
    (P1t, N1pos),

    (N0url,),
    (N0num,),
    (P1url,),
    (N1num,),
    (N1url,),
)