From 9d3ce7cba237361fd9f442f3b02abfa464eac666 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Wed, 2 Oct 2019 12:50:48 +0200
Subject: [PATCH] Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems
---
 spacy/pipeline/pipes.pyx                      | 13 +++++++++++
 spacy/tests/regression/test_issue3001-3500.py |  8 +++++++
 spacy/tests/regression/test_issue4348.py      | 23 +++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue4348.py

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index edb8de531..23509fcae 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -454,6 +454,10 @@ class Tagger(Pipe):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
 
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
+
         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
         bp_tag_scores(d_tag_scores, sgd=sgd)
@@ -467,6 +471,9 @@ class Tagger(Pipe):
         """
         if self._rehearsal_model is None:
             return
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         guesses, backprop = self.model.begin_update(docs, drop=drop)
         target = self._rehearsal_model(docs)
         gradient = guesses - target
@@ -968,6 +975,9 @@ class TextCategorizer(Pipe):
 
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         self.require_model()
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
         loss, d_scores = self.get_loss(docs, golds, scores)
         bp_scores(d_scores, sgd=sgd)
@@ -978,6 +988,9 @@ class TextCategorizer(Pipe):
     def rehearse(self, docs, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
             return
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            return
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
         target = self._rehearsal_model(docs)
         gradient = scores - target
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index c430678d3..35011b532 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -318,6 +318,14 @@ def test_issue3449():
     assert t3[5].text == "I"
 
 
+def test_issue3456():
+    # this crashed because of a padding error in layer.ops.unflatten in thinc
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("tagger"))
+    nlp.begin_training()
+    list(nlp.pipe(['hi', '']))
+
+
 def test_issue3468():
     """Test that sentence boundaries are set correctly so Doc.is_sentenced can
     be restored after serialization."""
diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py
new file mode 100644
index 000000000..9391c3529
--- /dev/null
+++ b/spacy/tests/regression/test_issue4348.py
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.util import minibatch, compounding
+
+
+def test_issue4348():
+    """Test that training the tagger with empty data, doesn't throw errors"""
+
+    TRAIN_DATA = [("", {"tags": []}), ("", {"tags": []})]
+
+    nlp = English()
+    tagger = nlp.create_pipe("tagger")
+    nlp.add_pipe(tagger)
+
+    optimizer = nlp.begin_training()
+    for i in range(5):
+        losses = {}
+        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
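
Note for reviewers (not part of the patch): a minimal sketch of the failure mode the new guard addresses, assuming the spaCy v2.x training API used in this diff. The script below and its empty-text inputs are illustrative only.

# Illustrative sketch only, not part of the patch; assumes spaCy v2.x.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("tagger"))
optimizer = nlp.begin_training()

# Empty texts produce docs with zero tokens, so the added check
# `not any(len(doc) for doc in docs)` evaluates to True.
docs = [nlp.make_doc(text) for text in ("", "")]
assert not any(len(doc) for doc in docs)

# Before this change, such a batch reached model.begin_update() and crashed
# inside thinc's unflatten; with the guard, update() simply returns early.
nlp.update(["", ""], [{"tags": []}, {"tags": []}], sgd=optimizer)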