mirror of https://github.com/explosion/spaCy.git
Handle missing annotations in the edit tree lemmatizer (#12098)
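The losses/gradients of missing annotations were not correctly masked out, because only a gold lemma of `None` was treated as missing. Fix this by also treating an empty-string gold lemma as missing, and check the masking in the incomplete data test (`test_incomplete_data`).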
This commit is contained in:
parent
319eb508b5
commit
dda7331da3
|
@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
||||||
for (predicted, gold_lemma) in zip(
|
for (predicted, gold_lemma) in zip(
|
||||||
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
||||||
):
|
):
|
||||||
if gold_lemma is None:
|
if gold_lemma is None or gold_lemma == "":
|
||||||
label = -1
|
label = -1
|
||||||
else:
|
else:
|
||||||
tree_id = self.trees.add(predicted.text, gold_lemma)
|
tree_id = self.trees.add(predicted.text, gold_lemma)
|
||||||
|
|
|
@ -139,6 +139,20 @@ def test_incomplete_data():
|
||||||
assert doc[1].lemma_ == "like"
|
assert doc[1].lemma_ == "like"
|
||||||
assert doc[2].lemma_ == "blue"
|
assert doc[2].lemma_ == "blue"
|
||||||
|
|
||||||
|
# Check that incomplete annotations are ignored.
|
||||||
|
scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
|
||||||
|
_, dX = lemmatizer.get_loss(train_examples, scores)
|
||||||
|
xp = lemmatizer.model.ops.xp
|
||||||
|
|
||||||
|
# Missing annotations.
|
||||||
|
assert xp.count_nonzero(dX[0][0]) == 0
|
||||||
|
assert xp.count_nonzero(dX[0][3]) == 0
|
||||||
|
assert xp.count_nonzero(dX[1][0]) == 0
|
||||||
|
assert xp.count_nonzero(dX[1][3]) == 0
|
||||||
|
|
||||||
|
# Misaligned annotations.
|
||||||
|
assert xp.count_nonzero(dX[1][1]) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
|
Loading…
Reference in New Issue