Update morphologizer (#5766)

* update `Morphologizer.begin_training` for use with `Example`
* make `__init__` and `begin_training` more consistent
* add `Morphology.normalize_features` to normalize outside of `Morphology.add`
* make sure `get_loss` doesn't create unknown labels when the POS and morph alignments differ
2020-07-19 11:10:51 +02:00 · 2020-07-19 11:10:51 +02:00 · b81a89f0a9
parent 38b59d728d
commit b81a89f0a9
3 changed files with 91 additions and 40 deletions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -58,7 +58,7 @@ cdef class Morphology:
    FEATURE_SEP = "|"
    FIELD_SEP = "="
    VALUE_SEP = ","
-    EMPTY_MORPH = "_"
+    EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0

    def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
@ -117,13 +117,7 @@ cdef class Morphology:
        if not isinstance(features, dict):
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
-        features = _normalize_props(features)
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-                self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
        # intified ("Field", "Field=Value") pairs
        field_feature_pairs = []
        for field in sorted(string_features):
@ -137,6 +131,7 @@ cdef class Morphology:
        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder (using the empty string
        # would give a hash key of 0, which is not good for PreshMap)
+        norm_feats_string = self.normalize_features(features)
        if norm_feats_string:
            tag.key = self.strings.add(norm_feats_string)
        else:
@ -144,6 +139,26 @@ cdef class Morphology:
        self.insert(tag)
        return tag.key

+    def normalize_features(self, features):
+        """Create a normalized UFEATS string from a features string or dict.
+
+        features (Union[dict, str]): Features as dict or UFEATS string.
+        RETURNS (str): Features as normalized UFEATS string.
+        """
+        if isinstance(features, str):
+            features = self.feats_to_dict(features)
+        if not isinstance(features, dict):
+            warnings.warn(Warnings.W100.format(feature=features))
+            features = {}
+        features = _normalize_props(features)
+        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
+        # normalized UFEATS string with sorted fields and values
+        norm_feats_string = self.FEATURE_SEP.join(sorted([
+                self.FIELD_SEP.join([field, values])
+            for field, values in string_features.items()
+        ]))
+        return norm_feats_string or self.EMPTY_MORPH
+
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
        """Creates a MorphAnalysisC from a list of intified
        ("Field", "Field=Value") tuples where fields with multiple values have
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -23,29 +23,45 @@ from .defaults import default_morphologizer
@component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
 class Morphologizer(Tagger):

+    POS_FEAT = "POS"
+
    def __init__(self, vocab, model, **cfg):
        self.vocab = vocab
        self.model = model
        self._rehearsal_model = None
        self.cfg = dict(sorted(cfg.items()))
-        self.cfg.setdefault("labels", {})
-        self.cfg.setdefault("morph_pos", {})
+        # to be able to set annotations without string operations on labels,
+        # store mappings from morph+POS labels to token-level annotations:
+        # 1) labels_morph stores a mapping from morph+POS->morph
+        self.cfg.setdefault("labels_morph", {})
+        # 2) labels_pos stores a mapping from morph+POS->POS
+        self.cfg.setdefault("labels_pos", {})
+        # add mappings for empty morph
+        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
+        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

    @property
    def labels(self):
-        return tuple(self.cfg["labels"].keys())
+        return tuple(self.cfg["labels_morph"].keys())

    def add_label(self, label):
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
-        morph = Morphology.feats_to_dict(label)
-        norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-        pos = morph.get("POS", "")
-        if norm_morph_pos not in self.cfg["labels"]:
-            self.cfg["labels"][norm_morph_pos] = norm_morph_pos
-            self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+        # normalize label
+        norm_label = self.vocab.morphology.normalize_features(label)
+        # extract separate POS and morph tags
+        label_dict = Morphology.feats_to_dict(label)
+        pos = label_dict.get(self.POS_FEAT, "")
+        if self.POS_FEAT in label_dict:
+            label_dict.pop(self.POS_FEAT)
+        # normalize morph string and add to morphology table
+        norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+        # add label mappings
+        if norm_label not in self.cfg["labels_morph"]:
+            self.cfg["labels_morph"][norm_label] = norm_morph
+            self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
        return 1

    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@ -53,14 +69,16 @@ class Morphologizer(Tagger):
        for example in get_examples():
            for i, token in enumerate(example.reference):
                pos = token.pos_
-                morph = token.morph
-                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
+                morph = token.morph_
+                # create and add the combined morph+POS label
+                morph_dict = Morphology.feats_to_dict(morph)
                if pos:
-                    morph["POS"] = pos
-                norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)]
-                if norm_morph_pos not in self.cfg["labels"]:
-                    self.cfg["labels"][norm_morph_pos] = norm_morph
-                    self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos]
+                    morph_dict[self.POS_FEAT] = pos
+                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                # add label->morph and label->POS mappings
+                if norm_label not in self.cfg["labels_morph"]:
+                    self.cfg["labels_morph"][norm_label] = morph
+                    self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
        self.set_output(len(self.labels))
        self.model.initialize()
        link_vectors_to_models(self.vocab)
@ -79,8 +97,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph])
-                doc.c[j].pos = self.cfg["morph_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
+                doc.c[j].pos = self.cfg["labels_pos"][morph]

            doc.is_morphed = True

@ -94,14 +112,17 @@ class Morphologizer(Tagger):
            for i in range(len(morphs)):
                pos = pos_tags[i]
                morph = morphs[i]
-                feats = Morphology.feats_to_dict(morph)
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is None, treat both as None here so that
+                # truths doesn't end up with an unknown morph+POS combination
+                if pos is None or morph is None:
+                    pos = None
+                    morph = None
+                label_dict = Morphology.feats_to_dict(morph)
                if pos:
-                    feats["POS"] = pos
-                if len(feats) > 0:
-                    morph = self.vocab.strings[self.vocab.morphology.add(feats)]
-                if morph == "":
-                    morph = Morphology.EMPTY_MORPH
-                eg_truths.append(morph)
+                    label_dict[self.POS_FEAT] = pos
+                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -5,6 +5,7 @@ from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
+from spacy.morphology import Morphology


 def test_label_types():
@ -23,9 +24,10 @@ TRAIN_DATA = [
            "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
        },
    ),
+    # test combinations of morph+POS
    (
        "Eat blue ham",
-        {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]},
+        {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},
    ),
 ]

@ -38,7 +40,12 @@ def test_overfitting_IO():
    for inst in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
        for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
-            morphologizer.add_label(morph + "|POS=" + pos)
+            if morph and pos:
+                morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos)
+            elif pos:
+                morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
+            elif morph:
+                morphologizer.add_label(morph)
    nlp.add_pipe(morphologizer)
    optimizer = nlp.begin_training()

@ -48,19 +55,27 @@ def test_overfitting_IO():
    assert losses["morphologizer"] < 0.00001

    # test the trained model
-    test_text = "I like blue eggs"
+    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = [
-        "Feat=N|POS=NOUN",
-        "Feat=V|POS=VERB",
-        "Feat=J|POS=ADJ",
-        "Feat=N|POS=NOUN",
+        "Feat=N",
+        "Feat=V",
+        "",
+        "",
+    ]
+    gold_pos_tags = [
+        "NOUN",
+        "VERB",
+        "ADJ",
+        "",
    ]
    assert [t.morph_ for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
-        assert gold_morphs == [t.morph_ for t in doc2]
+        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [t.pos_ for t in doc2] == gold_pos_tags