diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index a56c9975e..e83fe63ba 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe): for (predicted, gold_lemma) in zip( eg.predicted, eg.get_aligned("LEMMA", as_string=True) ): - if gold_lemma is None: + if gold_lemma is None or gold_lemma == "": label = -1 else: tree_id = self.trees.add(predicted.text, gold_lemma) diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index b12ca5dd4..c4f9b09f3 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -139,6 +139,20 @@ def test_incomplete_data(): assert doc[1].lemma_ == "like" assert doc[2].lemma_ == "blue" + # Check that incomplete annotations are ignored. + scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True) + _, dX = lemmatizer.get_loss(train_examples, scores) + xp = lemmatizer.model.ops.xp + + # Missing annotations. + assert xp.count_nonzero(dX[0][0]) == 0 + assert xp.count_nonzero(dX[0][3]) == 0 + assert xp.count_nonzero(dX[1][0]) == 0 + assert xp.count_nonzero(dX[1][3]) == 0 + + # Misaligned annotations. + assert xp.count_nonzero(dX[1][1]) == 0 + def test_overfitting_IO(): nlp = English()