From b04738903e3afc16f10bc3182c256742222ee3f6 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 13 May 2020 22:08:50 +0200
Subject: [PATCH] prevent None in gold fields (#5425)

* set gold fields to empty list instead of keeping them as None
* add unit test
---
 spacy/gold.pyx                 | 10 +++++++++-
 spacy/tests/parser/test_ner.py | 27 ++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 034bba08f..4b8a4f52d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -658,7 +658,15 @@ cdef class GoldParse:
         entdoc = None
 
         # avoid allocating memory if the doc does not contain any tokens
-        if self.length > 0:
+        if self.length == 0:
+            self.words = []
+            self.tags = []
+            self.heads = []
+            self.labels = []
+            self.ner = []
+            self.morphology = []
+
+        else:
             if words is None:
                 words = [token.text for token in doc]
             if tags is None:
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8329391ca..244e9fa25 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -7,7 +7,7 @@ from spacy.lang.en import English
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab
 from spacy.syntax.ner import BiluoPushDown
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, minibatch
 from spacy.tokens import Doc
 
 
@@ -174,6 +174,31 @@ def test_accept_blocked_token():
     assert ner2.moves.is_valid(state2, "U-")
 
 
+def test_train_empty():
+    """Test that training an empty text does not throw errors."""
+    train_data = [
+        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+        ("", {"entities": []}),
+    ]
+
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("PERSON")
+    nlp.add_pipe(ner, last=True)
+
+    nlp.begin_training()
+    for itn in range(2):
+        losses = {}
+        batches = minibatch(train_data)
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(
+                texts,  # batch of texts
+                annotations,  # batch of annotations
+                losses=losses,
+            )
+
+
 def test_overwrite_token():
     nlp = English()
     ner1 = nlp.create_pipe("ner")