Support loading labels in morphologizer

This commit is contained in:
Matthew Honnibal 2020-10-03 19:13:42 +02:00
parent d6c967401f
commit 8ea8b7d940
1 changed files with 19 additions and 15 deletions

View File

@ -134,7 +134,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
def initialize(self, get_examples, *, nlp=None):
def initialize(self, get_examples, *, nlp=None, labels=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -145,20 +145,24 @@ class Morphologizer(Tagger):
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""
self._ensure_examples(get_examples)
# First, fetch all labels from the data
for example in get_examples():
for i, token in enumerate(example.reference):
pos = token.pos_
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
# add label->morph and label->POS mappings
if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
if labels is not None:
self.cfg["labels_morph"] = labels["labels_morph"]
self.cfg["labels_pos"] = labels["labels_pos"]
else:
# First, fetch all labels from the data
for example in get_examples():
for i, token in enumerate(example.reference):
pos = token.pos_
morph = str(token.morph)
# create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
# add label->morph and label->POS mappings
if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
if len(self.labels) <= 1:
raise ValueError(Errors.E143.format(name=self.name))
doc_sample = []