From a231b56d404f41a73b6ee7157ec786631faa54b1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 20 Jul 2017 00:18:15 +0200 Subject: [PATCH] Add text-classification hook to pipeline --- spacy/pipeline.pyx | 230 ++++++++++++++++++++++++++++----------------- 1 file changed, 145 insertions(+), 85 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 9137dc58e..556d97e5a 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,10 +42,89 @@ from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats +from ._ml import build_text_classifier from .parts_of_speech import X -class TokenVectorEncoder(object): +class BaseThincComponent(object): + name = None + + @classmethod + def Model(cls, *shape, **kwargs): + raise NotImplementedError + + def __init__(self, vocab, model=True, **cfg): + raise NotImplementedError + + def __call__(self, doc): + scores = self.predict([doc]) + self.set_annotations([doc], scores) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + + def predict(self, docs): + raise NotImplementedError + + def set_annotations(self, docs, scores): + raise NotImplementedError + + def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): + raise NotImplementedError + + def get_loss(self, docs, golds, scores): + raise NotImplementedError + + def begin_training(self, gold_tuples, pipeline=None): + token_vector_width = pipeline[0].model.nO + if self.model is True: + self.model = self.Model(1, token_vector_width) + + def use_params(self, params): + with self.model.use_params(params): + yield + + def to_bytes(self, **exclude): + serialize = OrderedDict(( + ('model', lambda: self.model.to_bytes()), + ('vocab', lambda: self.vocab.to_bytes()) + )) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, **exclude): + if self.model is True: + self.model = self.Model() + deserialize = OrderedDict(( + ('model', lambda b: self.model.from_bytes(b)), + ('vocab', lambda b: self.vocab.from_bytes(b)) + )) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, **exclude): + serialize = OrderedDict(( + ('model', lambda p: p.open('wb').write(self.model.to_bytes())), + ('vocab', lambda p: self.vocab.to_disk(p)) + )) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, **exclude): + if self.model is True: + self.model = self.Model() + deserialize = OrderedDict(( + ('model', lambda p: self.model.from_bytes(p.open('rb').read())), + ('vocab', lambda p: self.vocab.from_disk(p)) + )) + util.from_disk(path, deserialize, exclude) + return self + + +class TokenVectorEncoder(BaseThincComponent): """Assign position-sensitive vectors to tokens, using a CNN or RNN.""" name = 'tensorizer' @@ -155,51 +234,8 @@ class TokenVectorEncoder(object): if self.model is True: self.model = self.Model() - def use_params(self, params): - """Replace weights of models in the pipeline with those provided in the - params dictionary. - params (dict): A dictionary of parameters keyed by model ID. - """ - with self.model.use_params(params): - yield - - def to_bytes(self, **exclude): - serialize = OrderedDict(( - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()) - )) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, **exclude): - if self.model is True: - self.model = self.Model() - deserialize = OrderedDict(( - ('model', lambda b: self.model.from_bytes(b)), - ('vocab', lambda b: self.vocab.from_bytes(b)) - )) - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, **exclude): - serialize = OrderedDict(( - ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('vocab', lambda p: self.vocab.to_disk(p)) - )) - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, **exclude): - if self.model is True: - self.model = self.Model() - deserialize = OrderedDict(( - ('model', lambda p: self.model.from_bytes(p.open('rb').read())), - ('vocab', lambda p: self.vocab.from_disk(p)) - )) - util.from_disk(path, deserialize, exclude) - return self - - -class NeuralTagger(object): +class NeuralTagger(BaseThincComponent): name = 'tagger' def __init__(self, vocab, model=True): self.vocab = vocab @@ -252,7 +288,6 @@ class NeuralTagger(object): loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) - return d_tokvecs def get_loss(self, docs, golds, scores): @@ -423,7 +458,7 @@ class NeuralLabeller(NeuralTagger): return float(loss), d_scores -class SimilarityHook(object): +class SimilarityHook(BaseThincComponent): """ Experimental @@ -477,48 +512,65 @@ class SimilarityHook(object): if self.model is True: self.model = self.Model(pipeline[0].model.nO) - def use_params(self, params): - """Replace weights of models in the pipeline with those provided in the - params dictionary. - params (dict): A dictionary of parameters keyed by model ID. - """ - with self.model.use_params(params): - yield +class TextClassifier(BaseThincComponent): + name = 'text-classifier' - def to_bytes(self, **exclude): - serialize = OrderedDict(( - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()) - )) - return util.to_bytes(serialize, exclude) + @classmethod + def Model(cls, nr_class, width=64, **cfg): + return build_text_classifier(nr_class, width, **cfg) - def from_bytes(self, bytes_data, **exclude): + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self.labels = cfg.get('labels', ['LABEL']) + + def __call__(self, doc): + scores = self.predict([doc]) + self.set_annotations([doc], scores) + return doc + + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) + scores = self.predict(docs) + self.set_annotations(docs, scores) + yield from docs + + def predict(self, docs): + scores = self.model(docs) + scores = self.model.ops.asarray(scores) + return scores + + def set_annotations(self, docs, scores): + for i, doc in enumerate(docs): + for j, label in self.labels: + doc.cats[label] = float(scores[i, j]) + + def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): + docs, tensors = docs_tensors + scores, bp_scores = self.model.begin_update(docs, drop=drop) + loss, d_scores = self.get_loss(docs, golds, scores) + d_tensors = bp_scores(d_scores, sgd=sgd) + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + return d_tensors + + def get_loss(self, docs, golds, scores): + truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') + for i, gold in enumerate(golds): + for j, label in enumerate(self.labels): + truths[i, j] = label in gold.cats + truths = self.model.ops.asarray(truths) + d_scores = (scores-truths) / scores.shape[0] + mean_square_error = ((scores-truths)**2).sum(axis=1).mean() + return mean_square_error, d_scores + + def begin_training(self, gold_tuples, pipeline=None): + token_vector_width = pipeline[0].model.nO if self.model is True: - self.model = self.Model() - deserialize = OrderedDict(( - ('model', lambda b: self.model.from_bytes(b)), - ('vocab', lambda b: self.vocab.from_bytes(b)) - )) - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, **exclude): - serialize = OrderedDict(( - ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('vocab', lambda p: self.vocab.to_disk(p)) - )) - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, **exclude): - if self.model is True: - self.model = self.Model() - deserialize = OrderedDict(( - ('model', lambda p: self.model.from_bytes(p.open('rb').read())), - ('vocab', lambda p: self.vocab.from_disk(p)) - )) - util.from_disk(path, deserialize, exclude) - return self + self.model = self.Model(len(self.labels), token_vector_width) cdef class EntityRecognizer(LinearParser): @@ -569,6 +621,14 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 + def predict_confidences(self, docs): + tensors = [d.tensor for d in docs] + samples = [] + for i in range(10): + states = self.parse_batch(docs, tensors, drop=0.3) + for state in states: + samples.append(self._get_entities(state)) + def __reduce__(self): return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)