From 4cd9ec0f00788d08c053b885b5591fe134666a65 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 10 Oct 2018 01:40:29 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Update=20training=20examples=20a?=
 =?UTF-8?q?nd=20use=20minibatching=20(#2830)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

Update the training examples in `/examples/training` to show usage of spaCy's
`minibatch` and `compounding` helpers
([see here](https://spacy.io/usage/training#tips-batch-size) for details). The
lack of batching in the examples has caused some confusion in the past,
especially for beginners who would copy-paste the examples, update them with
large training sets and then experience slow and unsatisfying results.

### Types of change

enhancement

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
---
 examples/training/train_intent_parser.py   | 17 +++++++++++------
 examples/training/train_ner.py             | 12 ++++++++----
 examples/training/train_new_entity_type.py | 12 ++++++++----
 examples/training/train_parser.py          | 10 +++++++---
 examples/training/train_tagger.py          | 10 +++++++---
 5 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py
index 763c1471d..7c337baff 100644
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@@ -21,8 +21,9 @@ from __future__ import unicode_literals, print_function
 
 import plac
 import random
-import spacy
 from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data: texts, heads and dependency labels
@@ -63,7 +64,7 @@ TRAIN_DATA = [
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=5):
+def main(model=None, output_dir=None, n_iter=15):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_model(nlp)
@@ -135,7 +139,8 @@ if __name__ == '__main__':
     # [
     #   ('find', 'ROOT', 'find'),
     #   ('cheapest', 'QUALITY', 'gym'),
-    #   ('gym', 'PLACE', 'find')
+    #   ('gym', 'PLACE', 'find'),
+    #   ('near', 'ATTRIBUTE', 'gym'),
     #   ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 895ee4a3d..a05d552ea 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -15,6 +15,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
             nlp.update(
-                [text],  # batch of texts
-                [annotations],  # batch of annotations
+                texts,  # batch of texts
+                annotations,  # batch of annotations
                 drop=0.5,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
                 losses=losses)
-        print(losses)
+        print('Losses', losses)
 
     # test the trained model
     for text, _ in TRAIN_DATA:
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index b2b1c656d..6a4863b8a 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -31,6 +31,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # new entity label
@@ -73,7 +74,7 @@ TRAIN_DATA = [
     new_model_name=("New model name for model meta.", "option", "nm", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
+def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
     """Set up the pipeline and entity recognizer, and train the new entity."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                        losses=losses)
-        print(losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = 'Do you like horses?'
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index 6dd3668fd..f91ead7c4 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -13,6 +13,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like securities."
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 6eb7213cf..0971294e5 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -16,6 +16,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like blue eggs"
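
For reference, the batching pattern all five scripts now share can be run on its own. Below is a minimal standalone sketch of it, assuming the spaCy v2.x training API used in the diffs; the two-sentence `TRAIN_DATA` is only a stand-in so the snippet is self-contained, not the data from the example scripts.

```python
# Minimal standalone sketch of the minibatch + compounding loop from this
# patch (spaCy v2.x API). The tiny TRAIN_DATA below is a placeholder.
from __future__ import unicode_literals

import random

import spacy
from spacy.util import minibatch, compounding

# stand-in data: (text, annotations) pairs, in the NER example's format
TRAIN_DATA = [
    ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}),
    ('I like London and Berlin.',
     {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]

nlp = spacy.blank('en')           # blank English pipeline
ner = nlp.create_pipe('ner')      # add an entity recognizer to train
nlp.add_pipe(ner)
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])     # register each entity label

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    # compounding(4., 32., 1.001) yields batch sizes that start at 4,
    # are multiplied by 1.001 after each batch and are capped at 32
    for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.5, sgd=optimizer,
                   losses=losses)
    print('Losses', losses)
```

The compounding schedule is the point of the change: per the linked docs page, small batches early in training keep updates noisy enough to generalise well, while the growing cap speeds up later iterations, so the examples no longer call `nlp.update` one document at a time.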