From abbe3e44b0dd860c23a2d691a0658b2a751bc944 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 5 Nov 2014 00:37:59 +1100
Subject: [PATCH] * Move spacy.pos tagger to spacy.tagger, and generalize it so
 that it can take on other tagging tasks, given a different set of feature
 templates.

---
 spacy/tagger.pxd |  32 +++++++++++++++++
 spacy/tagger.pyx | 113 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 spacy/tagger.pxd
 create mode 100644 spacy/tagger.pyx

diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
new file mode 100644
index 000000000..514e9693e
--- /dev/null
+++ b/spacy/tagger.pxd
@@ -0,0 +1,32 @@
+from cymem.cymem cimport Pool
+
+from thinc.learner cimport LinearModel
+from thinc.features cimport Extractor
+from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
+
+from .tokens cimport Tokens
+
+
+cdef enum TagType:
+    POS
+    ENTITY
+    SENSE
+
+
+cdef class Tagger:
+    cpdef int set_tags(self, Tokens tokens) except -1
+    cpdef class_t predict(self, int i, Tokens tokens) except 0
+    cpdef int tell_answer(self, class_t gold) except -1
+
+    cpdef readonly Pool mem
+    cpdef readonly Extractor extractor
+    cpdef readonly LinearModel model
+
+    cpdef readonly TagType tag_type
+    cpdef readonly list tag_names
+
+    cdef class_t _guess
+    cdef atom_t* _context
+    cdef feat_t* _feats
+    cdef weight_t* _values
+    cdef weight_t* _scores
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
new file mode 100644
index 000000000..55f46fa67
--- /dev/null
+++ b/spacy/tagger.pyx
@@ -0,0 +1,113 @@
+# cython: profile=True
+from os import path
+import os
+import shutil
+import random
+import codecs
+import gzip
+import json
+import cython
+
+
+from thinc.features cimport ConjFeat
+
+NULL_TAG = 0
+
+
+cdef class Tagger:
+    """Assign part-of-speech, named entity or supersense tags, using greedy
+    decoding.  The tagger reads its model and configuration from disk.
+    """
+    def __init__(self, model_dir):
+        """Load the tagger configuration and, if present, its saved model.
+
+        model_dir is a directory containing 'config.json', which supplies the
+        'templates', 'tag_names' and 'tag_type' entries, and optionally a
+        'model' file that is loaded into the linear model when it exists.
+        """
+        self.mem = Pool()
+        # json.load reads from an open file object, not from a path string.
+        with open(path.join(model_dir, 'config.json')) as file_:
+            cfg = json.load(file_)
+        templates = cfg['templates']
+        self.tag_names = cfg['tag_names']
+        self.tag_type = cfg['tag_type']
+        self.model = LinearModel(len(self.tag_names))
+        if path.exists(path.join(model_dir, 'model')):
+            self.model.load(path.join(model_dir, 'model'))
+        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
+
+        # Pool.alloc returns void*, so each buffer needs an explicit cast.
+        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
+        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
+        # One score slot per entry in tag_names.
+        self._scores = <weight_t*>self.mem.alloc(len(self.tag_names), sizeof(weight_t))
+        # NOTE(review): self._context is not allocated here, yet predict hands
+        # it to the extractor -- confirm where it is meant to be set up.
+        self._guess = NULL_TAG
+
+    cpdef int set_tags(self, Tokens tokens) except -1:
+        """Assign tags to a Tokens object.
+
+        >>> tokens = EN.tokenize(u'An example sentence.')
+        >>> assert tokens[0].pos == 'NO_TAG'
+        >>> EN.pos_tagger.set_tags(tokens)
+        >>> assert tokens[0].pos == 'DT'
+        """
+        cdef int i
+        for i in range(tokens.length):
+            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
+
+    cpdef class_t predict(self, int i, Tokens tokens) except 0:
+        """Predict the tag of tokens[i].  The tagger remembers the features and
+        prediction, in case you later call tell_answer.
+
+        >>> tokens = EN.tokenize(u'An example sentence.')
+        >>> tag = EN.pos_tagger.predict(0, tokens)
+        >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
+        """
+        # NOTE(review): context filling is still disabled, so self._context is
+        # not populated from tokens before the features are extracted.
+        #if self.tag_type == POS:
+        #    _pos_feats.fill_context(self._context, i, tokens)
+        self.extractor.extract(self._feats, self._values, self._context, NULL)
+        self._guess = self.model.score(self._scores, self._feats, self._values)
+        return self._guess
+
+    cpdef int tell_answer(self, class_t gold) except -1:
+        """Provide the correct tag for the word the tagger was last asked to predict.
+        During Tagger.predict, the tagger remembers the features and prediction
+        for the example. These are used to calculate a weight update given the
+        correct label.
+
+        >>> tokens = EN.tokenize('An example sentence.')
+        >>> guess = EN.pos_tagger.predict(1, tokens)
+        >>> JJ = EN.pos_tagger.tag_id('JJ')
+        >>> JJ
+        7
+        >>> EN.pos_tagger.tell_answer(JJ)
+        """
+        cdef class_t guess = self._guess
+        # A correct guess, or gold == NULL_TAG, produces an empty update.
+        if gold == guess or gold == NULL_TAG:
+            self.model.update({})
+            return 0
+        # The remembered features are counted with +1 for gold, -1 for guess.
+        counts = {guess: {}, gold: {}}
+        self.extractor.count(counts[gold], self._feats, 1)
+        self.extractor.count(counts[guess], self._feats, -1)
+        self.model.update(counts)
+
+    def tag_id(self, object tag_name):
+        """Encode tag_name into a tag ID integer.
+
+        A tag name that is not yet in tag_names is appended and receives the
+        next free ID.
+        """
+        try:
+            return self.tag_names.index(tag_name)
+        except ValueError:
+            # list.index raises ValueError for a missing item; it never
+            # returns -1.
+            self.tag_names.append(tag_name)
+            return len(self.tag_names) - 1