From b04738903e3afc16f10bc3182c256742222ee3f6 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 13 May 2020 22:08:50 +0200
Subject: [PATCH] prevent None in gold fields (#5425)

* set gold fields to empty lists instead of leaving them as None when the doc has no tokens

* add unit test
---
 spacy/gold.pyx                 | 10 +++++++++-
 spacy/tests/parser/test_ner.py | 27 ++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 034bba08f..4b8a4f52d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -658,7 +658,15 @@ cdef class GoldParse:
         entdoc = None
 
         # avoid allocating memory if the doc does not contain any tokens
-        if self.length > 0:
+        if self.length == 0:
+            self.words = []
+            self.tags = []
+            self.heads = []
+            self.labels = []
+            self.ner = []
+            self.morphology = []
+
+        else:
             if words is None:
                 words = [token.text for token in doc]
             if tags is None:
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8329391ca..244e9fa25 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -7,7 +7,7 @@ from spacy.lang.en import English
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, minibatch
 from spacy.tokens import Doc
 
 
@@ -174,6 +174,31 @@ def test_accept_blocked_token():
     assert ner2.moves.is_valid(state2, "U-")
 
 
+def test_train_empty():
+    """Test that training on an empty text does not throw errors."""
+    train_data = [
+        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+        ("", {"entities": []}),  # empty text with no entities: the case under test
+    ]
+
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("PERSON")
+    nlp.add_pipe(ner, last=True)
+
+    nlp.begin_training()
+    for itn in range(2):  # two passes over the training data
+        losses = {}
+        batches = minibatch(train_data)
+        for batch in batches:
+            texts, annotations = zip(*batch)  # split (text, annotation) pairs into two tuples
+            nlp.update(
+                texts,  # batch of texts
+                annotations,  # batch of annotations
+                losses=losses,
+            )
+
+
 def test_overwrite_token():
     nlp = English()
     ner1 = nlp.create_pipe("ner")