From b81a89f0a94ce5a191720ba0eccff43667da6ba9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Jul 2020 11:10:51 +0200 Subject: [PATCH] Update morphologizer (#5766) * update `Morphologizer.begin_training` for use with `Example` * make init and begin_training more consistent * add `Morphology.normalize_features` to normalize outside of `Morphology.add` * make sure `get_loss` doesn't create unknown labels when the POS and morph alignments differ --- spacy/morphology.pyx | 29 ++++++--- spacy/pipeline/morphologizer.pyx | 71 ++++++++++++++-------- spacy/tests/pipeline/test_morphologizer.py | 31 +++++++--- 3 files changed, 91 insertions(+), 40 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a3aa8be22..0852418f2 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -58,7 +58,7 @@ cdef class Morphology: FEATURE_SEP = "|" FIELD_SEP = "=" VALUE_SEP = "," - EMPTY_MORPH = "_" + EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0 def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): self.mem = Pool() @@ -117,13 +117,7 @@ cdef class Morphology: if not isinstance(features, dict): warnings.warn(Warnings.W100.format(feature=features)) features = {} - features = _normalize_props(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} - # normalized UFEATS string with sorted fields and values - norm_feats_string = self.FEATURE_SEP.join(sorted([ - self.FIELD_SEP.join([field, values]) - for field, values in string_features.items() - ])) # intified ("Field", "Field=Value") pairs field_feature_pairs = [] for field in sorted(string_features): @@ -137,6 +131,7 @@ cdef class Morphology: # the hash key for the tag is either the hash of the normalized UFEATS # string or the hash of an empty placeholder (using the empty string # would give a hash key of 0, which is not good for PreshMap) + norm_feats_string = self.normalize_features(features) if norm_feats_string: tag.key = self.strings.add(norm_feats_string) else: @@ -144,6 +139,26 @@ cdef class Morphology: self.insert(tag) return tag.key + def normalize_features(self, features): + """Create a normalized UFEATS string from a features string or dict. + + features (Union[dict, str]): Features as dict or UFEATS string. + RETURNS (str): Features as normalized UFEATS string. + """ + if isinstance(features, str): + features = self.feats_to_dict(features) + if not isinstance(features, dict): + warnings.warn(Warnings.W100.format(feature=features)) + features = {} + features = _normalize_props(features) + string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} + # normalized UFEATS string with sorted fields and values + norm_feats_string = self.FEATURE_SEP.join(sorted([ + self.FIELD_SEP.join([field, values]) + for field, values in string_features.items() + ])) + return norm_feats_string or self.EMPTY_MORPH + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: """Creates a MorphAnalysisC from a list of intified ("Field", "Field=Value") tuples where fields with multiple values have diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 57b778434..bc77dda47 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -23,29 +23,45 @@ from .defaults import default_morphologizer @component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer) class Morphologizer(Tagger): + POS_FEAT = "POS" + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = dict(sorted(cfg.items())) - self.cfg.setdefault("labels", {}) - self.cfg.setdefault("morph_pos", {}) + # to be able to set annotations without string operations on labels, + # store mappings from morph+POS labels to token-level annotations: + # 1) labels_morph stores a mapping from morph+POS->morph + self.cfg.setdefault("labels_morph", {}) + # 2) labels_pos stores a mapping from morph+POS->POS + self.cfg.setdefault("labels_pos", {}) + # add mappings for empty morph + self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH + self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""] @property def labels(self): - return tuple(self.cfg["labels"].keys()) + return tuple(self.cfg["labels_morph"].keys()) def add_label(self, label): if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 - morph = Morphology.feats_to_dict(label) - norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] - pos = morph.get("POS", "") - if norm_morph_pos not in self.cfg["labels"]: - self.cfg["labels"][norm_morph_pos] = norm_morph_pos - self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] + # normalize label + norm_label = self.vocab.morphology.normalize_features(label) + # extract separate POS and morph tags + label_dict = Morphology.feats_to_dict(label) + pos = label_dict.get(self.POS_FEAT, "") + if self.POS_FEAT in label_dict: + label_dict.pop(self.POS_FEAT) + # normalize morph string and add to morphology table + norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)] + # add label mappings + if norm_label not in self.cfg["labels_morph"]: + self.cfg["labels_morph"][norm_label] = norm_morph + self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, @@ -53,14 +69,16 @@ class Morphologizer(Tagger): for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ - morph = token.morph - norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] + morph = token.morph_ + # create and add the combined morph+POS label + morph_dict = Morphology.feats_to_dict(morph) if pos: - morph["POS"] = pos - norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] - if norm_morph_pos not in self.cfg["labels"]: - self.cfg["labels"][norm_morph_pos] = norm_morph - self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] + morph_dict[self.POS_FEAT] = pos + norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + # add label->morph and label->POS mappings + if norm_label not in self.cfg["labels_morph"]: + self.cfg["labels_morph"][norm_label] = morph + self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.set_output(len(self.labels)) self.model.initialize() link_vectors_to_models(self.vocab) @@ -79,8 +97,8 @@ class Morphologizer(Tagger): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph]) - doc.c[j].pos = self.cfg["morph_pos"][morph] + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) + doc.c[j].pos = self.cfg["labels_pos"][morph] doc.is_morphed = True @@ -94,14 +112,17 @@ class Morphologizer(Tagger): for i in range(len(morphs)): pos = pos_tags[i] morph = morphs[i] - feats = Morphology.feats_to_dict(morph) + # POS may align (same value for multiple tokens) when morph + # doesn't, so if either is None, treat both as None here so that + # truths doesn't end up with an unknown morph+POS combination + if pos is None or morph is None: + pos = None + morph = None + label_dict = Morphology.feats_to_dict(morph) if pos: - feats["POS"] = pos - if len(feats) > 0: - morph = self.vocab.strings[self.vocab.morphology.add(feats)] - if morph == "": - morph = Morphology.EMPTY_MORPH - eg_truths.append(morph) + label_dict[self.POS_FEAT] = pos + label = self.vocab.strings[self.vocab.morphology.add(label_dict)] + eg_truths.append(label) truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 9b7e2788d..757c9214c 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -5,6 +5,7 @@ from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir +from spacy.morphology import Morphology def test_label_types(): @@ -23,9 +24,10 @@ TRAIN_DATA = [ "pos": ["NOUN", "VERB", "ADJ", "NOUN"], }, ), + # test combinations of morph+POS ( "Eat blue ham", - {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, + {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}, ), ] @@ -38,7 +40,12 @@ def test_overfitting_IO(): for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): - morphologizer.add_label(morph + "|POS=" + pos) + if morph and pos: + morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos) + elif pos: + morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos) + elif morph: + morphologizer.add_label(morph) nlp.add_pipe(morphologizer) optimizer = nlp.begin_training() @@ -48,19 +55,27 @@ def test_overfitting_IO(): assert losses["morphologizer"] < 0.00001 # test the trained model - test_text = "I like blue eggs" + test_text = "I like blue ham" doc = nlp(test_text) gold_morphs = [ - "Feat=N|POS=NOUN", - "Feat=V|POS=VERB", - "Feat=J|POS=ADJ", - "Feat=N|POS=NOUN", + "Feat=N", + "Feat=V", + "", + "", + ] + gold_pos_tags = [ + "NOUN", + "VERB", + "ADJ", + "", ] assert [t.morph_ for t in doc] == gold_morphs + assert [t.pos_ for t in doc] == gold_pos_tags # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert gold_morphs == [t.morph_ for t in doc2] + assert [t.morph_ for t in doc2] == gold_morphs + assert [t.pos_ for t in doc2] == gold_pos_tags