diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index 4b111217e..7024e88fc 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -3,7 +3,7 @@ from libc.stdint cimport uint8_t from cymem.cymem cimport Pool from thinc.learner cimport LinearModel -from thinc.features cimport Extractor +from thinc.features cimport Extractor, Feature from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray @@ -17,6 +17,8 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil cdef class Model: cdef int n_classes + + cdef int regularize(self, Feature* feats, int n, int a=*) except -1 cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 @@ -24,21 +26,10 @@ cdef class Model: cdef Extractor _extractor cdef LinearModel _model - cdef inline const weight_t* score(self, atom_t* context) except NULL: + cdef inline const weight_t* score(self, atom_t* context, bint regularize) except NULL: cdef int n_feats feats = self._extractor.get_feats(context, &n_feats) + if regularize: + self.regularize(feats, n_feats, 3) return self._model.get_scores(feats, n_feats) - -cdef class HastyModel: - cdef Pool mem - cdef weight_t* _scores - - cdef const weight_t* score(self, atom_t* context) except NULL - cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 - - cdef int n_classes - cdef Model _hasty - cdef Model _full - cdef readonly int hasty_cnt - cdef readonly int full_cnt diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 026129a51..02db80a2d 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -4,9 +4,9 @@ from __future__ import division from os import path import os import shutil -import random import json import cython +import numpy.random from thinc.features cimport Feature, count_feats @@ -44,70 +44,11 @@ cdef class Model: count_feats(counts[guess], feats, n_feats, -cost) self._model.update(counts) + cdef int regularize(self, Feature* feats, int n, int a=3) except -1: + zipfs = numpy.random.zipf(a, n) + for i in range(n): + feats[i].value *= 1.0 / zipfs[i] + def end_training(self): self._model.end_training() self._model.dump(self.model_loc, freq_thresh=0) - - -cdef class HastyModel: - def __init__(self, n_classes, hasty_templates, full_templates, model_dir): - full_templates = tuple([t for t in full_templates if t not in hasty_templates]) - self.mem = Pool() - self.n_classes = n_classes - self._scores = self.mem.alloc(self.n_classes, sizeof(weight_t)) - assert path.exists(model_dir) - assert path.isdir(model_dir) - self._hasty = Model(n_classes, hasty_templates, path.join(model_dir, 'hasty_model')) - self._full = Model(n_classes, full_templates, path.join(model_dir, 'full_model')) - self.hasty_cnt = 0 - self.full_cnt = 0 - - cdef const weight_t* score(self, atom_t* context) except NULL: - cdef int i - hasty_scores = self._hasty.score(context) - if will_use_hasty(hasty_scores, self._hasty.n_classes): - self.hasty_cnt += 1 - return hasty_scores - else: - self.full_cnt += 1 - full_scores = self._full.score(context) - for i in range(self.n_classes): - self._scores[i] = full_scores[i] + hasty_scores[i] - return self._scores - - cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1: - self._hasty.update(context, guess, gold, cost) - self._full.update(context, guess, gold, cost) - - def end_training(self): - self._hasty.end_training() - self._full.end_training() - - -@cython.cdivision(True) -cdef bint will_use_hasty(const weight_t* scores, int n_classes) nogil: - cdef: - weight_t best_score, second_score - int best, second - - if scores[0] >= scores[1]: - best = 0 - best_score = scores[0] - second = 1 - second_score = scores[1] - else: - best = 1 - best_score = scores[1] - second = 0 - second_score = scores[0] - cdef int i - for i in range(2, n_classes): - if scores[i] > best_score: - second_score = best_score - second = best - best = i - best_score = scores[i] - elif scores[i] > second_score: - second_score = scores[i] - second = i - return best_score > 0 and second_score < (best_score / 2)