diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
new file mode 100644
index 000000000..8e2432b92
--- /dev/null
+++ b/spacy/_ml.pxd
@@ -0,0 +1,35 @@
+# Declarations for the linear-model wrapper shared by the taggers.
+from libc.stdint cimport uint8_t
+
+from cymem.cymem cimport Pool
+
+from thinc.learner cimport LinearModel
+from thinc.features cimport Extractor
+from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
+
+from preshed.maps cimport PreshMapArray
+
+from .typedefs cimport hash_t, id_t
+from .tokens cimport Tokens
+
+
+cdef class Model:
+    cdef class_t predict(self, atom_t* context) except *
+    cdef class_t predict_among(self, atom_t* context, const bint* valid) except *
+    cdef class_t predict_and_update(self, atom_t* context, const bint* valid,
+                                    const int* costs) except *
+
+    cdef object model_loc
+    cdef Extractor _extractor
+    cdef LinearModel _model
+
+
+"""
+cdef class HastyModel:
+    cdef class_t predict(self, const atom_t* context, object golds=*) except *
+
+    cdef Model _model1
+    cdef Model _model2
+
+    c
+"""
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
new file mode 100644
index 000000000..515453dff
--- /dev/null
+++ b/spacy/_ml.pyx
@@ -0,0 +1,147 @@
+# cython: profile=True
+from __future__ import unicode_literals
+from __future__ import division
+
+from os import path
+import os
+from collections import defaultdict
+import shutil
+import random
+import json
+import cython
+
+from thinc.features cimport Feature, count_feats
+
+
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
+    """Create a fresh model directory and write its config.json."""
+    if path.exists(model_dir):
+        shutil.rmtree(model_dir)
+    os.mkdir(model_dir)
+    config = {
+        'templates': templates,
+        'tag_names': tag_names,
+        'tag_map': tag_map
+    }
+    with open(path.join(model_dir, 'config.json'), 'w') as file_:
+        json.dump(config, file_)
+
+
+cdef class Model:
+    """Thin wrapper tying a thinc Extractor to a thinc LinearModel."""
+    def __init__(self, n_classes, templates, model_dir=None):
+        self._extractor = Extractor(templates)
+        self._model = LinearModel(n_classes, self._extractor.n_templ)
+        self.model_loc = path.join(model_dir, 'model') if model_dir else None
+        if self.model_loc and path.exists(self.model_loc):
+            self._model.load(self.model_loc, freq_thresh=0)
+
+    cdef class_t predict(self, atom_t* context) except *:
+        """Return the highest-scoring class for this context."""
+        cdef int n_feats
+        cdef const Feature* feats = self._extractor.get_feats(context, &n_feats)
+        cdef const weight_t* scores = self._model.get_scores(feats, n_feats)
+        guess = _arg_max(scores, self._model.nr_class)
+        return guess
+
+    cdef class_t predict_among(self, atom_t* context, const bint* valid) except *:
+        """Return the best class among those flagged valid."""
+        cdef int n_feats
+        cdef const Feature* feats = self._extractor.get_feats(context, &n_feats)
+        cdef const weight_t* scores = self._model.get_scores(feats, n_feats)
+        return _arg_max_among(scores, valid, self._model.nr_class)
+
+    cdef class_t predict_and_update(self, atom_t* context, const bint* valid,
+                                    const int* costs) except *:
+        """Predict among valid classes, then update towards the best zero-cost class."""
+        cdef:
+            int n_feats
+            const Feature* feats
+            const weight_t* scores
+
+            int guess
+            int best
+            int cost
+            int i
+            weight_t score
+
+        feats = self._extractor.get_feats(context, &n_feats)
+        scores = self._model.get_scores(feats, n_feats)
+        guess = _arg_max_among(scores, valid, self._model.nr_class)
+        cost = costs[guess]
+        if cost == 0:
+            # Zero-cost guess: empty update, no feature weights change.
+            self._model.update({})
+            return guess
+
+        guess_counts = defaultdict(int)
+        best_counts = defaultdict(int)
+        for i in range(n_feats):
+            feat = (feats[i].i, feats[i].key)
+            upd = feats[i].value * cost
+            best_counts[feat] += upd
+            guess_counts[feat] -= upd
+        best = -1
+        score = 0
+        for i in range(self._model.nr_class):
+            if valid[i] and costs[i] == 0 and (best == -1 or scores[i] > score):
+                best = i
+                score = scores[i]
+        self._model.update({guess: guess_counts, best: best_counts})
+        return guess
+
+    def end_training(self):
+        """Finalise training and dump the weights to model_loc."""
+        self._model.end_training()
+        self._model.dump(self.model_loc, freq_thresh=0)
+
+
+"""
+cdef class HastyModel:
+    def __init__(self, model_dir):
+        cfg = json.load(open(path.join(model_dir, 'config.json')))
+        templates = cfg['templates']
+        univ_counts = {}
+        cdef unicode tag
+        cdef unicode univ_tag
+        tag_names = cfg['tag_names']
+        self.extractor = Extractor(templates)
+        self.model = LinearModel(len(tag_names) + 1, self.extractor.n_templ+2) # TODO
+        if path.exists(path.join(model_dir, 'model')):
+            self.model.load(path.join(model_dir, 'model'))
+
+    cdef class_t predict(self, atom_t* context) except *:
+        pass
+
+    cdef class_t predict_among(self, atom_t* context, bint* valid) except *:
+        pass
+
+    cdef class_t predict_and_update(self, atom_t* context, int* costs) except *:
+        pass
+
+    def dump(self, model_dir):
+        pass
+"""
+
+cdef int _arg_max(const weight_t* scores, int n_classes) except -1:
+    # Ties go to the later class (>=), unlike _arg_max_among.
+    cdef int best = 0
+    cdef weight_t score = scores[best]
+    cdef int i
+    for i in range(1, n_classes):
+        if scores[i] >= score:
+            score = scores[i]
+            best = i
+    return best
+
+
+cdef int _arg_max_among(const weight_t* scores, const bint* valid, int n_classes) except -1:
+    # Returns -1 if no class is valid; ties go to the earlier class (>).
+    cdef int clas
+    cdef weight_t score = 0
+    cdef int best = -1
+    for clas in range(n_classes):
+        if valid[clas] and (best == -1 or scores[clas] > score):
+            score = scores[clas]
+            best = clas
+    return best
diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd
index d7673f293..223b7aef3 100644
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@@ -1,20 +1,24 @@
 from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
 
-from ..tagger cimport Tagger
+from .._ml cimport Model
 from ..strings cimport StringStore
 from ..structs cimport TokenC, Lexeme, Morphology, PosTag
 from ..typedefs cimport univ_tag_t
 from .lemmatizer import Lemmatizer
 
 
-cdef class EnPosTagger(Tagger):
+cdef class EnPosTagger:
+    cdef readonly Pool mem
     cdef readonly StringStore strings
+    cdef readonly Model model
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache
 
     cdef PosTag* tags
    cdef readonly object tag_names
     cdef readonly object tag_map
+    cdef readonly int n_tags
 
     cdef int set_morph(self, const int i, TokenC* tokens) except -1
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index d973490ee..33804a3fd 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -2,6 +2,9 @@
 from os import path
 import json
 
+from libc.string cimport memset
+
+from cymem.cymem cimport Address
 from thinc.typedefs cimport atom_t
 
 from ..typedefs cimport univ_tag_t
@@ -203,16 +206,20 @@ cdef struct _CachedMorph:
     int lemma
 
 
-cdef class EnPosTagger(Tagger):
+cdef class EnPosTagger:
     """A part-of-speech tagger for English"""
     def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
         model_dir = path.join(data_dir, 'pos')
-        Tagger.__init__(self, path.join(model_dir))
         self.strings = strings
         cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
         self.tag_names = sorted(cfg['tag_names'])
+        self.n_tags = len(self.tag_names)
         self.tag_map = cfg['tag_map']
         cdef int n_tags = len(self.tag_names) + 1
+
+        self.model = Model(n_tags, cfg['templates'], model_dir=model_dir)
+
         self._morph_cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
         for i, tag in enumerate(sorted(self.tag_names)):
@@ -235,20 +242,27 @@ cdef class EnPosTagger(Tagger):
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].fine_pos = self.predict(context)
+            t[i].fine_pos = self.model.predict(context)
             self.set_morph(i, t)
 
-    def train(self, Tokens tokens, golds):
+    def train(self, Tokens tokens, py_golds):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
-        c = 0
+        cdef Address costs_mem = Address(self.n_tags, sizeof(int))
+        cdef Address valid_mem = Address(self.n_tags, sizeof(bint))
+        cdef int* costs = <int*>costs_mem.ptr
+        cdef bint* valid = <bint*>valid_mem.ptr
+        memset(valid, 1, sizeof(bint) * self.n_tags)  # only truthiness is tested, so byte-fill is fine
+        correct = 0
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].fine_pos = self.predict(context, [golds[i]])
+            for j in range(self.n_tags): costs[j] = 1  # memset would set each int to 0x01010101, not 1
+            costs[py_golds[i]] = 0
+            t[i].fine_pos = self.model.predict_and_update(context, valid, costs)
             self.set_morph(i, t)
-            c += t[i].fine_pos == golds[i]
-        return c
+            correct += costs[t[i].fine_pos] == 0
+        return correct
 
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].fine_pos]