Ignore misaligned annotation in Morphologizer.get_loss (#6363)

Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings.
2020-11-10 13:15:09 +01:00 · 2020-11-10 13:15:09 +01:00 · a7e7d6c6c9
parent a0c899a0ff
commit a7e7d6c6c9
2 changed files with 28 additions and 11 deletions
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -92,9 +92,6 @@ class Morphologizer(Tagger):
        # 2) labels_pos stores a mapping from morph+POS->POS
        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
        self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

    @property
    def labels(self):
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                # doesn't, so if either is None, treat both as None here so that
                # truths doesn't end up with an unknown morph+POS combination
                if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -116,3 +116,23 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags