* Begin revising tagger, focussing on POS tagging

This commit is contained in:
Matthew Honnibal 2014-12-07 15:29:04 +11:00
parent f5c4f2eb52
commit 0c7aeb9de7
2 changed files with 50 additions and 61 deletions

View File

@ -5,20 +5,17 @@ from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .typedefs cimport hash_t
from .context cimport Slots
from .tokens cimport Tokens
cpdef enum TagType:
POS
ENTITY
SENSE
cdef class Tagger:
cpdef int set_tags(self, Tokens tokens) except -1
cpdef class_t predict(self, int i, Tokens tokens) except 0
cpdef int tell_answer(self, list gold) except -1
cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except 0
cpdef readonly Pool mem
cpdef readonly Extractor extractor
@ -26,9 +23,3 @@ cdef class Tagger:
cpdef readonly TagType tag_type
cpdef readonly list tag_names
cdef class_t _guess
cdef atom_t* _context
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@ -1,8 +1,10 @@
# cython: profile=True
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from .context cimport fill_context
from .context cimport N_FIELDS
from os import path
import os
import shutil
@ -10,11 +12,7 @@ import random
import json
import cython
from .context cimport fill_context
from .context cimport N_FIELDS
from thinc.features cimport ConjFeat
from thinc.features cimport Feature, count_feats
NULL_TAG = 0
@ -35,7 +33,8 @@ def setup_model_dir(tag_type, tag_names, templates, model_dir):
def train(train_sents, model_dir, nr_iter=10):
cdef Tokens tokens
tagger = Tagger(model_dir)
cdef Tagger tagger = Tagger(model_dir)
cdef int i
for _ in range(nr_iter):
n_corr = 0
total = 0
@ -43,9 +42,10 @@ def train(train_sents, model_dir, nr_iter=10):
assert len(tokens) == len(golds), [t.string for t in tokens]
for i in range(tokens.length):
if tagger.tag_type == POS:
gold = _get_gold_pos(i, golds, tokens.pos)
elif tagger.tag_type == ENTITY:
gold = _get_gold_ner(i, golds, tokens.ner)
gold = _get_gold_pos(i, golds)
else:
raise StandardError
guess = tagger.predict(i, tokens)
tokens.set_tag(i, tagger.tag_type, guess)
if gold is not None:
@ -59,7 +59,7 @@ def train(train_sents, model_dir, nr_iter=10):
tagger.model.dump(path.join(model_dir, 'model'))
cdef object _get_gold_pos(i, golds, int* pred):
cdef object _get_gold_pos(i, golds):
if golds[i] == 0:
return None
else:
@ -96,17 +96,11 @@ cdef class Tagger:
templates = cfg['templates']
self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type']
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.extractor = Extractor(templates)
self.model = LinearModel(len(self.tag_names))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
self._guess = NULL_TAG
cpdef int set_tags(self, Tokens tokens) except -1:
"""Assign tags to a Tokens object.
@ -119,7 +113,7 @@ cdef class Tagger:
for i in range(tokens.length):
tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
cpdef class_t predict(self, int i, Tokens tokens) except 0:
cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except 0:
"""Predict the tag of tokens[i]. The tagger remembers the features and
prediction, in case you later call tell_answer.
@ -127,38 +121,20 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
fill_context(self._context, i, tokens)
self.extractor.extract(self._feats, self._values, self._context, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess
cpdef int tell_answer(self, list golds) except -1:
"""Provide the correct tag for the word the tagger was last asked to predict.
During Tagger.predict, the tagger remembers the features and prediction
for the example. These are used to calculate a weight update given the
correct label.
>>> tokens = EN.tokenize('An example sentence.')
>>> guess = EN.pos_tagger.predict(1, tokens)
>>> JJ = EN.pos_tagger.tag_id('JJ')
>>> JJ
7
>>> EN.pos_tagger.tell_answer(JJ)
"""
cdef class_t guess = self._guess
if guess in golds:
self.model.update({})
return 0
best_gold = golds[0]
best_score = self._scores[best_gold-1]
for gold in golds[1:]:
if self._scores[gold-1] > best_gold:
best_score = self._scores[best_gold-1]
best_gold = gold
counts = {guess: {}, best_gold: {}}
self.extractor.count(counts[best_gold], self._feats, 1)
self.extractor.count(counts[guess], self._feats, -1)
cdef int n_feats
cdef atom_t[N_FIELDS] context
print sizeof(context)
fill_context(context, i, tokens.data)
cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
cdef weight_t* scores = self.model.get_scores(feats, n_feats)
cdef class_t guess = _arg_max(scores, self.nr_class)
if golds is not None and guess not in golds:
best = _arg_max_among(scores, golds)
counts = {}
count_feats(counts[guess], feats, n_feats, -1)
count_feats(counts[best], feats, n_feats, 1)
self.model.update(counts)
return guess
def tag_id(self, object tag_name):
"""Encode tag_name into a tag ID integer."""
@ -167,3 +143,25 @@ cdef class Tagger:
tag_id = len(self.tag_names)
self.tag_names.append(tag_name)
return tag_id
cdef class_t _arg_max(weight_t* scores, int n_classes):
cdef int best = 0
cdef weight_t score = scores[best]
cdef int i
for i in range(1, n_classes):
if scores[i] > score:
score = scores[i]
best = i
return best
cdef class_t _arg_max_among(weight_t* scores, list classes):
cdef int best = classes[0]
cdef weight_t score = scores[best]
cdef class_t clas
for clas in classes:
if scores[clas] > score:
score = scores[clas]
best = clas
return best