From 9d3ce7cba237361fd9f442f3b02abfa464eac666 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 2 Oct 2019 12:50:48 +0200
Subject: [PATCH] Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems
---
 spacy/pipeline/pipes.pyx                      | 13 +++++++++++
 spacy/tests/regression/test_issue3001-3500.py |  8 +++++++
 spacy/tests/regression/test_issue4348.py      | 23 +++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue4348.py

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index edb8de531..23509fcae 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -454,6 +454,10 @@ class Tagger(Pipe):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
 
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
+
         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
         bp_tag_scores(d_tag_scores, sgd=sgd)
@@ -467,6 +471,9 @@ class Tagger(Pipe):
         """
         if self._rehearsal_model is None:
             return
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         guesses, backprop = self.model.begin_update(docs, drop=drop)
         target = self._rehearsal_model(docs)
         gradient = guesses - target
@@ -968,6 +975,9 @@ class TextCategorizer(Pipe):
 
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         self.require_model()
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
         loss, d_scores = self.get_loss(docs, golds, scores)
         bp_scores(d_scores, sgd=sgd)
@@ -978,6 +988,9 @@ class TextCategorizer(Pipe):
     def rehearse(self, docs, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
             return
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
         target = self._rehearsal_model(docs)
         gradient = scores - target
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index c430678d3..35011b532 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -318,6 +318,14 @@ def test_issue3449():
     assert t3[5].text == "I"
 
 
+def test_issue3456():
+    # this crashed because of a padding error in layer.ops.unflatten in thinc
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("tagger"))
+    nlp.begin_training()
+    list(nlp.pipe(['hi', '']))
+
+
 def test_issue3468():
     """Test that sentence boundaries are set correctly so Doc.is_sentenced can
     be restored after serialization."""
diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py
new file mode 100644
index 000000000..9391c3529
--- /dev/null
+++ b/spacy/tests/regression/test_issue4348.py
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.util import minibatch, compounding
+
+
+def test_issue4348():
+    """Test that training the tagger with empty data, doesn't throw errors"""
+
+    TRAIN_DATA = [("", {"tags": []}), ("", {"tags": []})]
+
+    nlp = English()
+    tagger = nlp.create_pipe("tagger")
+    nlp.add_pipe(tagger)
+
+    optimizer = nlp.begin_training()
+    for i in range(5):
+        losses = {}
+        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
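
Note for reviewers (not part of the patch): a minimal sketch of the failure mode the new guard addresses, assuming the spaCy v2.x training API used in this diff. The script below and its empty-text inputs are illustrative only.

# Illustrative sketch only, not part of the patch; assumes spaCy v2.x.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("tagger"))
optimizer = nlp.begin_training()

# Empty texts produce docs with zero tokens, so the added check
# `not any(len(doc) for doc in docs)` evaluates to True.
docs = [nlp.make_doc(text) for text in ("", "")]
assert not any(len(doc) for doc in docs)

# Before this change, such a batch reached model.begin_update() and crashed
# inside thinc's unflatten; with the guard, update() simply returns early.
nlp.update(["", ""], [{"tags": []}, {"tags": []}], sgd=optimizer)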