From a7626bd7fd2c5bd1d445e75ad36575fbf27079c0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 15 Apr 2017 15:43:14 +0200
Subject: [PATCH] Tmp commit to example

---
 examples/train_ner_standalone.py | 78 ++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/examples/train_ner_standalone.py b/examples/train_ner_standalone.py
index 612e7bec3..abc6a0152 100644
--- a/examples/train_ner_standalone.py
+++ b/examples/train_ner_standalone.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 '''Example of training a named entity recognition system from scratch using spaCy
 
 This example is written to be self-contained and reasonably transparent.
@@ -31,6 +32,8 @@ from spacy.gold import GoldParse
 from spacy.gold import _iob_to_biluo as iob_to_biluo
 from spacy.scorer import Scorer
 
+from deepsense import neptune
+
 try:
     unicode
 except NameError:
@@ -81,7 +84,7 @@ def load_vocab(path):
 def init_ner_model(vocab, features=None):
     if features is None:
         features = tuple(EntityRecognizer.feature_templates)
-    return BeamEntityRecognizer(vocab, features=features)
+    return EntityRecognizer(vocab, features=features)
 
 
 def save_ner_model(model, path):
@@ -99,7 +102,7 @@ def save_ner_model(model, path):
 
 
 def load_ner_model(vocab, path):
-    return BeamEntityRecognizer.load(path, vocab)
+    return EntityRecognizer.load(path, vocab)
 
 
 class Pipeline(object):
@@ -110,18 +113,21 @@ class Pipeline(object):
             raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
         if not path.is_dir():
             raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path / 'vocab')
+        vocab = load_vocab(path)
         tokenizer = Tokenizer(vocab, {}, None, None, None)
         ner_model = load_ner_model(vocab, path / 'ner')
         return cls(vocab, tokenizer, ner_model)
 
-    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
-            self.vocab = init_vocab()
+            vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if ner_model is None:
-            self.entity = init_ner_model(self.vocab)
+        if entity is None:
+            entity = init_ner_model(vocab)  # self.vocab is only assigned below
+        self.vocab = vocab
+        self.tokenizer = tokenizer
+        self.entity = entity
         self.pipeline = [self.entity]
 
     def __call__(self, input_):
@@ -173,7 +179,25 @@ class Pipeline(object):
         save_ner_model(self.entity, path / 'ner')
 
 
-def train(nlp, train_examples, dev_examples, nr_epoch=5):
+def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
+    channels = {}
+    channels['loss'] = ctx.job.create_channel(
+        name='loss',
+        channel_type=neptune.ChannelType.NUMERIC)
+
+    channels['f'] = ctx.job.create_channel(
+        name='F-Measure',
+        channel_type=neptune.ChannelType.NUMERIC)
+    channels['p'] = ctx.job.create_channel(
+        name='Precision',
+        channel_type=neptune.ChannelType.NUMERIC)
+    channels['r'] = ctx.job.create_channel(
+        name='Recall',
+        channel_type=neptune.ChannelType.NUMERIC)
+    channels['log'] = ctx.job.create_channel(
+        name='logs',
+        channel_type=neptune.ChannelType.TEXT)
+
     next_epoch = train_examples
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
@@ -186,14 +210,25 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
                 next_epoch.append((input_, annot))
         random.shuffle(next_epoch)
         scores = nlp.evaluate(dev_examples)
-        precision = '%.2f' % scores['ents_p']
-        recall = '%.2f' % scores['ents_r']
-        f_measure = '%.2f' % scores['ents_f']
-        print(i, int(loss), precision, recall, f_measure)
+        report_scores(channels, i, loss, scores)
     nlp.average_weights()
     scores = nlp.evaluate(dev_examples)
-    print("After averaging")
-    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
+    report_scores(channels, i+1, loss, scores)
+
+
+def report_scores(channels, i, loss, scores):
+    precision = '%.2f' % scores['ents_p']
+    recall = '%.2f' % scores['ents_r']
+    f_measure = '%.2f' % scores['ents_f']
+    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
+    channels['log'].send(x=i, y='%d %s %s %s' % (int(loss), precision, recall,
+                                                 f_measure))
+    channels['f'].send(x=i, y=scores['ents_f'])
+    channels['p'].send(x=i, y=scores['ents_p'])
+    channels['r'].send(x=i, y=scores['ents_r'])
+    channels['loss'].send(x=i, y=loss)
+
+
 
 
 def read_examples(path):
@@ -221,15 +256,22 @@ def read_examples(path):
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir, train_loc, dev_loc, nr_epoch=10):
+def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
+        train_loc=None, dev_loc=None, nr_epoch=30):
+    ctx = neptune.Context()
+
+    train_loc = Path(ctx.params.train_loc)
+    dev_loc = Path(ctx.params.dev_loc)
+    model_dir = model_dir.resolve()
+
     train_examples = read_examples(train_loc)
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline()
+    nlp = Pipeline.load(model_dir)
 
-    train(nlp, train_examples, list(dev_examples), nr_epoch)
+    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
 
     nlp.save(model_dir)
 
 
 if __name__ == '__main__':
-    plac.call(main)
+    main()