From bf917225ab123f354ead66f9685558cd52129fff Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 26 Sep 2017 05:42:52 -0500
Subject: [PATCH] Allow multi-task objectives during training

---
 spacy/pipeline.pyx         | 108 ++++++++++++++++++++++++++++---------
 spacy/syntax/nn_parser.pxd |   1 +
 spacy/syntax/nn_parser.pyx |  16 +++++-
 3 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index b91ddcc9d..17e9a15de 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -291,7 +291,7 @@ class TokenVectorEncoder(BaseThincComponent):
         if self.model is True:
             self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
-            link_vectors_to_models(self.vocab)
+        link_vectors_to_models(self.vocab)
 
 
 class NeuralTagger(BaseThincComponent):
@@ -395,7 +395,7 @@ class NeuralTagger(BaseThincComponent):
         if self.model is True:
             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
-            link_vectors_to_models(self.vocab)
+        link_vectors_to_models(self.vocab)
 
     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -477,9 +477,25 @@ class NeuralTagger(BaseThincComponent):
 
 class NeuralLabeller(NeuralTagger):
     name = 'nn_labeller'
-    def __init__(self, vocab, model=True, **cfg):
+    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
+        if target == 'dep':
+            self.make_label = self.make_dep
+        elif target == 'tag':
+            self.make_label = self.make_tag
+        elif target == 'ent':
+            self.make_label = self.make_ent
+        elif target == 'dep_tag_offset':
+            self.make_label = self.make_dep_tag_offset
+        elif target == 'ent_tag':
+            self.make_label = self.make_ent_tag
+        elif hasattr(target, '__call__'):
+            self.make_label = target
+        else:
+            raise ValueError(
+                "NeuralLabeller target should be function or one of "
+                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
@@ -495,43 +511,77 @@ class NeuralLabeller(NeuralTagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
-                for dep in deps:
-                    if dep not in self.labels:
-                        self.labels[dep] = len(self.labels)
-        token_vector_width = pipeline[0].model.nO
+                for i in range(len(ids)):
+                    label = self.make_label(i, words, tags, heads, deps, ents)
+                    if label is not None and label not in self.labels:
+                        self.labels[label] = len(self.labels)
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-            self.model = self.Model(len(self.labels), **self.cfg)
-        link_vectors_to_models(self.vocab)
+            self.model = chain(
+                tok2vec,
+                Softmax(len(self.labels), 128)
+            )
+        link_vectors_to_models(self.vocab)
 
     @classmethod
-    def Model(cls, n_tags, **cfg):
-        return build_tagger_model(n_tags, **cfg)
+    def Model(cls, n_tags, tok2vec=None, **cfg):
+        return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
 
     def get_loss(self, docs, golds, scores):
-        scores = self.model.ops.flatten(scores)
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
         for gold in golds:
-            for tag in gold.labels:
-                if tag is None or tag not in self.labels:
+            for i in range(len(gold.labels)):
+                label = self.make_label(i, gold.words, gold.tags, gold.heads,
+                                        gold.labels, gold.ents)
+                if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
-                    correct[idx] = self.labels[tag]
+                    correct[idx] = self.labels[label]
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         d_scores /= d_scores.shape[0]
         loss = (d_scores**2).sum()
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
+    @staticmethod
+    def make_dep(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        return deps[i]
+
+    @staticmethod
+    def make_tag(i, words, tags, heads, deps, ents):
+        return tags[i]
+
+    @staticmethod
+    def make_ent(i, words, tags, heads, deps, ents):
+        if ents is None:
+            return None
+        return ents[i]
+
+    @staticmethod
+    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        offset = heads[i] - i
+        offset = min(offset, 2)
+        offset = max(offset, -2)
+        return '%s-%s:%d' % (deps[i], tags[i], offset)
+
+    @staticmethod
+    def make_ent_tag(i, words, tags, heads, deps, ents):
+        if ents is None or ents[i] is None:
+            return None
+        else:
+            return '%s-%s' % (tags[i], ents[i])
 
 
 class SimilarityHook(BaseThincComponent):
     """
@@ -695,6 +746,14 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager
 
+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in ['dep']:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)
+
     def __reduce__(self):
         return (NeuralDependencyParser, (self.vocab, self.moves, self.model),
                 None, None)
@@ -705,13 +764,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
 
     nr_feature = 6
 
-    def predict_confidences(self, docs):
-        tensors = [d.tensor for d in docs]
-        samples = []
-        for i in range(10):
-            states = self.parse_batch(docs, tensors, drop=0.3)
-            for state in states:
-                samples.append(self._get_entities(state))
+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in []:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)
 
     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model),
                 None, None)
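Note on the `target` argument: besides the five named objectives, `__init__` accepts any callable with the same signature as the `make_*` static methods above, returning a label string, or None to skip a token. A minimal sketch of a custom objective; the `coarse_dep` helper and its truncation rule are illustrative, not part of this patch:

    def coarse_dep(i, words, tags, heads, deps, ents):
        # Mirror make_dep: skip tokens with no gold head or label.
        if deps[i] is None or heads[i] is None:
            return None
        # Collapse label subtypes, e.g. 'nmod:poss' -> 'nmod'.
        return deps[i].split(':')[0]

    labeller = NeuralLabeller(vocab, target=coarse_dep)

Because unknown labels fall back to the model's own guess in get_loss, a custom target only needs to be consistent, not exhaustive.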
diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd
index 524718965..b0b7693b7 100644
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@@ -13,6 +13,7 @@ cdef class Parser:
     cdef public object model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
+    cdef public object _multitasks
 
     cdef void _parse_step(self, StateC* state,
             const float* feat_weights,
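The new `_multitasks` field is declared `cdef public`, so the list stays readable and writable from Python once `init_multitask_objectives` has filled it. A hypothetical inspection snippet, assuming `begin_training` has already run on a parser instance:

    for labeller in parser._multitasks:
        # Each entry is a NeuralLabeller; its labels mapping is grown
        # during begin_training, as in the pipeline.pyx hunk above.
        print(labeller.name, len(labeller.labels))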
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 9d9eda882..988c092af 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -318,6 +318,7 @@ cdef class Parser:
         for label in labels:
            self.moves.add_action(action, label)
         self.model = model
+        self._multitasks = []
 
     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)
@@ -419,7 +420,7 @@ cdef class Parser:
         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
         while not next_step.empty():
             if not has_hidden:
-                for i in range(
+                for i in cython.parallel.prange(
                         next_step.size(), num_threads=6, nogil=True):
                     self._parse_step(next_step[i], feat_weights,
                         nr_class, nr_feat, nr_piece)
@@ -745,7 +746,7 @@ cdef class Parser:
                # order, or the model goes out of synch
                self.cfg.setdefault('extra_labels', []).append(label)
 
-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@@ -756,9 +757,20 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
             link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)
 
+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        '''Set up models for secondary objectives, to benefit from multi-task
+        learning. This method is intended to be overridden by subclasses.
+
+        For instance, the dependency parser can benefit from sharing
+        an input representation with a label prediction model. These auxiliary
+        models are discarded after training.
+        '''
+        pass
+
     def preprocess_gold(self, docs_golds):
         for doc, gold in docs_golds:
             yield doc, gold
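Taken together: `begin_training` now calls `init_multitask_objectives` once the parser's model exists, and each subclass decides which auxiliary targets to train against the shared tok2vec layer (`self.model[0]`). A sketch of wiring extra objectives in via a subclass; `MultiTaskParser` and its target list are illustrative, not part of this patch:

    class MultiTaskParser(NeuralDependencyParser):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Predict 'dep_tag_offset' and 'tag' alongside parsing, sharing
            # the parser's tok2vec layer. Appending each labeller to the
            # pipeline lets it receive updates during training; appending it
            # to self._multitasks lets the parser keep track of it.
            for target in ['dep_tag_offset', 'tag']:
                labeller = NeuralLabeller(self.vocab, target=target)
                labeller.begin_training(gold_tuples, pipeline=pipeline,
                                        tok2vec=self.model[0])
                pipeline.append(labeller)
                self._multitasks.append(labeller)

Since the base implementation is a no-op and the entity recognizer iterates an empty target list, only the dependency parser trains an auxiliary objective ('dep') by default.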