From 4cd9ec0f00788d08c053b885b5591fe134666a65 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 10 Oct 2018 01:40:29 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Update=20training=20examples=20a?=
 =?UTF-8?q?nd=20use=20minibatching=20(#2830)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

Update the training examples in `/examples/training` to show usage of spaCy's
`minibatch` and `compounding` helpers
([see here](https://spacy.io/usage/training#tips-batch-size) for details). The
lack of batching in the examples has caused some confusion in the past,
especially for beginners who would copy-paste the examples, update them with
large training sets and then experience slow and unsatisfying results.

### Types of change

enhancement

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 examples/training/train_intent_parser.py   | 17 +++++++++++------
 examples/training/train_ner.py             | 12 ++++++++----
 examples/training/train_new_entity_type.py | 12 ++++++++----
 examples/training/train_parser.py          | 10 +++++++---
 examples/training/train_tagger.py          | 10 +++++++---
 5 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index 763c1471d..7c337baff 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
 
 import plac
 import random
-import spacy
 from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data: texts, heads and dependency labels
@@ -63,7 +64,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=5):
+def main(model=None, output_dir=None, n_iter=15):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_model(nlp)
@@ -135,7 +139,8 @@ if __name__ == '__main__':
     # [
     #   ('find', 'ROOT', 'find'),
     #   ('cheapest', 'QUALITY', 'gym'),
-    #   ('gym', 'PLACE', 'find')
+    #   ('gym', 'PLACE', 'find'),
+    #   ('near', 'ATTRIBUTE', 'gym'),
     #   ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 895ee4a3d..a05d552ea 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -15,6 +15,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
             nlp.update(
-                [text],  # batch of texts
-                [annotations],  # batch of annotations
+                texts,  # batch of texts
+                annotations,  # batch of annotations
                 drop=0.5,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
                 losses=losses)
-        print(losses)
+        print('Losses', losses)
 
     # test the trained model
     for text, _ in TRAIN_DATA:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index b2b1c656d..6a4863b8a 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -31,6 +31,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # new entity label
@@ -73,7 +74,7 @@ TRAIN_DATA = [
     new_model_name=("New model name for model meta.", "option", "nm", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
+def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
     """Set up the pipeline and entity recognizer, and train the new entity."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                        losses=losses)
-        print(losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = 'Do you like horses?'
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index 6dd3668fd..f91ead7c4 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -13,6 +13,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like securities."
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 6eb7213cf..0971294e5 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -16,6 +16,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like blue eggs"
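
For reference, the batching pattern all five scripts now share can be run on its own. Below is a minimal standalone sketch of it, assuming the spaCy v2.x training API used in the diffs; the two-sentence `TRAIN_DATA` is only a stand-in so the snippet is self-contained, not the data from the example scripts.

```python
# Minimal standalone sketch of the minibatch + compounding loop from this
# patch (spaCy v2.x API). The tiny TRAIN_DATA below is a placeholder.
from __future__ import unicode_literals

import random

import spacy
from spacy.util import minibatch, compounding

# stand-in data: (text, annotations) pairs, in the NER example's format
TRAIN_DATA = [
    ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}),
    ('I like London and Berlin.',
     {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]

nlp = spacy.blank('en')           # blank English pipeline
ner = nlp.create_pipe('ner')      # add an entity recognizer to train
nlp.add_pipe(ner)
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])     # register each entity label

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    # compounding(4., 32., 1.001) yields batch sizes that start at 4,
    # are multiplied by 1.001 after each batch and are capped at 32
    for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.5, sgd=optimizer,
                   losses=losses)
    print('Losses', losses)
```

The compounding schedule is the point of the change: per the linked docs page, small batches early in training keep updates noisy enough to generalise well, while the growing cap speeds up later iterations, so the examples no longer call `nlp.update` one document at a time.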