prevent None in gold fields (#5425)

* set gold fields to empty lists instead of leaving them as None
* add unit test
2020-05-13 22:08:50 +02:00 · 2020-05-13 22:08:50 +02:00 · b04738903e
parent 113e7981d0
commit b04738903e
2 changed files with 35 additions and 2 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -658,7 +658,15 @@ cdef class GoldParse:
        entdoc = None
        # avoid allocating memory if the doc does not contain any tokens
-        if self.length > 0:
+        if self.length == 0:
            self.words = []
            self.tags = []
            self.heads = []
            self.labels = []
            self.ner = []
            self.morphology = []
        else:
            if words is None:
                words = [token.text for token in doc]
            if tags is None:
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -7,7 +7,7 @@ from spacy.lang.en import English
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, minibatch
 from spacy.tokens import Doc
@ -174,6 +174,31 @@ def test_accept_blocked_token():
    assert ner2.moves.is_valid(state2, "U-")
 def test_train_empty():
    """Check that updating the pipeline on an empty text does not raise."""
    # One annotated sentence plus an empty string with no entities.
    examples = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]
    pipeline = English()
    recognizer = pipeline.create_pipe("ner")
    recognizer.add_label("PERSON")
    pipeline.add_pipe(recognizer, last=True)
    pipeline.begin_training()
    # Two passes over the data; the test succeeds if no update call raises.
    for _ in range(2):
        losses = {}
        for batch in minibatch(examples):
            # Split the (text, annotation) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            pipeline.update(texts, annotations, losses=losses)
 def test_overwrite_token():
    nlp = English()
    ner1 = nlp.create_pipe("ner")