From a7e7d6c6c902055b66b208c299cfe8b578a497d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 10 Nov 2020 13:15:09 +0100 Subject: [PATCH] Ignore misaligned in Morphologizer.get_loss (#6363) Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings. --- spacy/pipeline/morphologizer.pyx | 19 ++++++++----------- spacy/tests/pipeline/test_morphologizer.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a03c7daf0..305f8f5df 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -92,9 +92,6 @@ class Morphologizer(Tagger): # 2) labels_pos stores a mapping from morph+POS->POS cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}} self.cfg = dict(sorted(cfg.items())) - # add mappings for empty morph - self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH - self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] @property def labels(self): @@ -201,8 +198,8 @@ class Morphologizer(Tagger): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) - doc.c[j].pos = self.cfg["labels_pos"][morph] + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) + doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and @@ -228,12 +225,12 @@ class Morphologizer(Tagger): # doesn't, so if either is None, treat both as None here so that # truths doesn't end up with an unknown morph+POS combination if pos is None or morph is None: - pos = None - morph = None - label_dict = Morphology.feats_to_dict(morph) - if pos: - label_dict[self.POS_FEAT] = pos - label = self.vocab.strings[self.vocab.morphology.add(label_dict)] + label = None + else: + label_dict = Morphology.feats_to_dict(morph) + if pos: + label_dict[self.POS_FEAT] = pos + label = self.vocab.strings[self.vocab.morphology.add(label_dict)] eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 85d1d6c8b..add42e00a 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -116,3 +116,23 @@ def test_overfitting_IO(): no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # Test without POS + nlp.remove_pipe("morphologizer") + nlp.add_pipe("morphologizer") + for example in train_examples: + for token in example.reference: + token.pos_ = "" + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["morphologizer"] < 0.00001 + + # Test the trained model + test_text = "I like blue ham" + doc = nlp(test_text) + gold_morphs = ["Feat=N", "Feat=V", "", ""] + gold_pos_tags = ["", "", "", ""] + assert [str(t.morph) for t in doc] == gold_morphs + assert [t.pos_ for t in doc] == gold_pos_tags