Allow multi-task objectives during training

This commit is contained in:
Matthew Honnibal 2017-09-26 05:42:52 -05:00
parent 4ae9ea7684
commit bf917225ab
3 changed files with 99 additions and 27 deletions

View File

@ -291,7 +291,7 @@ class TokenVectorEncoder(BaseThincComponent):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
link_vectors_to_models(self.vocab)
class NeuralTagger(BaseThincComponent):
@ -395,7 +395,7 @@ class NeuralTagger(BaseThincComponent):
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab)
link_vectors_to_models(self.vocab)
@classmethod
def Model(cls, n_tags, **cfg):
@ -477,9 +477,25 @@ class NeuralTagger(BaseThincComponent):
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg):
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
if target == 'dep':
self.make_label = self.make_dep
elif target == 'tag':
self.make_label = self.make_tag
elif target == 'ent':
self.make_label = self.make_ent
elif target == 'dep_tag_offset':
self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag':
self.make_label = self.make_ent_tag
elif hasattr(target, '__call__'):
self.make_label = target
else:
raise ValueError(
"NeuralLabeller target should be function or one of "
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
@ -495,43 +511,78 @@ class NeuralLabeller(NeuralTagger):
def set_annotations(self, docs, dep_ids):
    """No-op: nothing is written back to `docs`.

    docs: the documents that were processed (ignored).
    dep_ids: the predicted label ids (ignored).
    """
    return None
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
    """Collect the label inventory from the gold data and build the model.

    gold_tuples: iterable of (raw_text, annots_brackets) pairs. Each entry
        of annots_brackets is an (annots, brackets) pair, and annots
        unpacks to (ids, words, tags, heads, deps, ents).
    pipeline: accepted for signature compatibility; not used by this method.
    tok2vec: optional encoder layer to share. When given, the model is
        `chain(tok2vec, Softmax(n_labels, 128))`. When omitted, the model
        is built through `self.Model` instead of chaining onto None.
    """
    gold_tuples = nonproj.preprocess_training_data(gold_tuples)
    for _raw_text, annots_brackets in gold_tuples:
        for annots, _brackets in annots_brackets:
            ids, words, tags, heads, deps, ents = annots
            for i in range(len(ids)):
                label = self.make_label(i, words, tags, heads, deps, ents)
                # Each distinct label gets the next free integer id.
                if label is not None and label not in self.labels:
                    self.labels[label] = len(self.labels)
    if self.model is True:
        if tok2vec is None:
            # No shared encoder supplied: build a standalone model.
            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
            self.model = self.Model(len(self.labels), **self.cfg)
        else:
            # NOTE(review): 128 is hard-coded as the Softmax input width;
            # presumably it must match tok2vec's output width -- confirm.
            self.model = chain(
                tok2vec,
                Softmax(len(self.labels), 128)
            )
    # NOTE(review): the original indentation is not recoverable from this
    # view; this call is placed outside the `if` -- confirm.
    link_vectors_to_models(self.vocab)
@classmethod
def Model(cls, n_tags, tok2vec=None, **cfg):
    """Build the network via `build_tagger_model`.

    n_tags: number of output classes.
    tok2vec: optional encoder layer, forwarded as the `tok2vec` keyword.
    **cfg: remaining settings, forwarded unchanged.

    Returns whatever `build_tagger_model` returns.
    """
    # A single definition under the decorator: a second bare `def Model`
    # would rebind the name to an undecorated function.
    return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
def get_loss(self, docs, golds, scores):
    """Compute the loss and the gradient of the scores for a batch.

    docs: the batch of documents; their lengths are used to split the
        flat gradient back into per-document pieces.
    golds: gold annotations, one per doc. Each must expose `words`,
        `tags`, `heads`, `labels` and `ents`.
    scores: per-document score arrays as produced by the model.

    Returns (loss, d_scores): the loss as a float, and the gradient in the
    same per-document layout as `scores`.
    """
    scores = self.model.ops.flatten(scores)
    cdef int idx = 0
    correct = numpy.zeros((scores.shape[0],), dtype='i')
    guesses = scores.argmax(axis=1)
    for gold in golds:
        for i in range(len(gold.labels)):
            label = self.make_label(i, gold.words, gold.tags, gold.heads,
                                    gold.labels, gold.ents)
            if label is None or label not in self.labels:
                # No usable target for this token: use the model's own
                # guess as the target.
                correct[idx] = guesses[idx]
            else:
                correct[idx] = self.labels[label]
            idx += 1
    correct = self.model.ops.xp.array(correct, dtype='i')
    d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
    # Average the gradient over the number of rows in the batch.
    d_scores /= d_scores.shape[0]
    loss = (d_scores**2).sum()
    d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
    return float(loss), d_scores
@staticmethod
def make_dep(i, words, tags, heads, deps, ents):
    """Return the dependency label of token `i`.

    Returns None when either the token's label or its head is None.
    """
    dep_label = deps[i]
    if dep_label is not None and heads[i] is not None:
        return dep_label
    return None
@staticmethod
def make_tag(i, words, tags, heads, deps, ents):
    """Return the tag of token `i`; the other annotations are ignored."""
    tag = tags[i]
    return tag
@staticmethod
def make_ent(i, words, tags, heads, deps, ents):
    """Return the entity annotation of token `i`.

    Returns None when no entity annotations are available (`ents` is None).
    """
    return None if ents is None else ents[i]
@staticmethod
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
    """Return a combined label '<dep>-<tag>:<offset>' for token `i`.

    `offset` is `heads[i] - i`, clipped to the range [-2, 2].
    Returns None when either the token's label or its head is None.
    """
    if deps[i] is None:
        return None
    head = heads[i]
    if head is None:
        return None
    clipped = max(min(head - i, 2), -2)
    return '%s-%s:%d' % (deps[i], tags[i], clipped)
@staticmethod
def make_ent_tag(i, words, tags, heads, deps, ents):
    """Return a combined label '<tag>-<ent>' for token `i`.

    Returns None when there are no entity annotations at all, or when the
    entity annotation of this token is None.
    """
    if ents is not None and ents[i] is not None:
        return '%s-%s' % (tags[i], ents[i])
    return None
class SimilarityHook(BaseThincComponent):
"""
@ -695,6 +746,14 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Create the auxiliary labellers trained alongside the parser.

    For each target (currently only 'dep') a NeuralLabeller is created,
    initialised on `gold_tuples` with the parser's first layer
    (`self.model[0]`) as its `tok2vec`, and recorded in `self._multitasks`.

    gold_tuples: training data, passed on to the labeller.
    pipeline: list the labeller is appended to. May be None, in which case
        the labeller is only recorded in `self._multitasks`.
    **cfg: accepted for signature compatibility; not used here.
    """
    for target in ['dep']:
        labeller = NeuralLabeller(self.vocab, target=target)
        tok2vec = self.model[0]
        labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
        # `pipeline` defaults to None in Parser.begin_training, so guard
        # the append instead of raising AttributeError.
        if pipeline is not None:
            pipeline.append(labeller)
        self._multitasks.append(labeller)
def __reduce__(self):
    """Pickle support: rebuild as NeuralDependencyParser(vocab, moves, model)."""
    init_args = (self.vocab, self.moves, self.model)
    return (NeuralDependencyParser, init_args, None, None)
@ -705,13 +764,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Create the auxiliary labellers trained alongside the recognizer.

    The target list is currently empty, so no labeller is created. For each
    target that is added, a NeuralLabeller is initialised on `gold_tuples`
    with `self.model[0]` as its `tok2vec` and recorded in
    `self._multitasks`.

    gold_tuples: training data, passed on to the labeller.
    pipeline: list the labeller is appended to. May be None, in which case
        the labeller is only recorded in `self._multitasks`.
    **cfg: accepted for signature compatibility; not used here.
    """
    for target in []:
        labeller = NeuralLabeller(self.vocab, target=target)
        tok2vec = self.model[0]
        labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
        # `pipeline` defaults to None in Parser.begin_training, so guard
        # the append instead of raising AttributeError.
        if pipeline is not None:
            pipeline.append(labeller)
        self._multitasks.append(labeller)
def __reduce__(self):
    """Pickle support: rebuild as NeuralEntityRecognizer(vocab, moves, model)."""
    init_args = (self.vocab, self.moves, self.model)
    return (NeuralEntityRecognizer, init_args, None, None)

View File

@ -13,6 +13,7 @@ cdef class Parser:
cdef public object model
cdef readonly TransitionSystem moves
cdef readonly object cfg
cdef public object _multitasks
cdef void _parse_step(self, StateC* state,
const float* feat_weights,

View File

@ -318,6 +318,7 @@ cdef class Parser:
for label in labels:
self.moves.add_action(action, label)
self.model = model
self._multitasks = []
def __reduce__(self):
    """Pickle support: rebuild as Parser(vocab, moves, model)."""
    init_args = (self.vocab, self.moves, self.model)
    return (Parser, init_args, None, None)
@ -419,7 +420,7 @@ cdef class Parser:
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty():
if not has_hidden:
for i in range(
for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True):
self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece)
@ -745,7 +746,7 @@ cdef class Parser:
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
def begin_training(self, gold_tuples, **cfg):
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@ -756,9 +757,20 @@ cdef class Parser:
if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
link_vectors_to_models(self.vocab)
self.cfg.update(cfg)
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    '''Hook for setting up secondary (multi-task) objectives.

    The base implementation does nothing; subclasses are expected to
    override it. For example, the dependency parser can benefit from
    sharing its input representation with a label prediction model.
    Models created here are auxiliary and are discarded after training.
    '''
    return None
def preprocess_gold(self, docs_golds):
    """Yield each (doc, gold) pair from `docs_golds` unchanged, in order."""
    for pair in docs_golds:
        doc, gold = pair
        yield doc, gold