Mirror of https://github.com/explosion/spaCy.git
Serialize morph rules with tagger
Serialize `morph_rules` with the tagger alongside the `tag_map`. Use `Morphology.load_tag_map` and `Morphology.load_morph_exceptions` to load these settings rather than reinitializing the morphology each time they are changed.
parent d106cf66dd
commit 50db3f0cdb
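The change in practice: instead of rebuilding `vocab.morphology` whenever the tag map or morph rules change, the existing `Morphology` object is updated in place. A minimal sketch of the new loading pattern (illustrative values, not part of the diff; the calls match the test changes below):

    from spacy.lang.en import English

    nlp = English()
    # Update the shared morphology in place rather than constructing a new one
    nlp.vocab.morphology.load_tag_map({"V": {"pos": "VERB"}})
    nlp.vocab.morphology.load_morph_exceptions({"V": {"like": {"lemma": "luck"}}})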
@@ -359,9 +359,7 @@ class Tagger(Pipe):
         if new_tag_map:
             if "_SP" in orig_tag_map:
                 new_tag_map["_SP"] = orig_tag_map["_SP"]
-            vocab.morphology = Morphology(vocab.strings, new_tag_map,
-                                          vocab.morphology.lemmatizer,
-                                          exc=vocab.morphology.exc)
+            vocab.morphology.load_tag_map(new_tag_map)
         self.set_output(len(self.labels))
         doc_sample = [Doc(self.vocab, words=["hello", "world"])]
         if pipeline is not None:
@@ -400,10 +398,7 @@ class Tagger(Pipe):
         if values is None:
             values = {POS: "X"}
         tag_map[label] = values
-        self.vocab.morphology = Morphology(
-            self.vocab.strings, tag_map=tag_map,
-            lemmatizer=self.vocab.morphology.lemmatizer,
-            exc=self.vocab.morphology.exc)
+        self.vocab.morphology.load_tag_map(tag_map)
         return 1

     def use_params(self, params):
@@ -417,6 +412,8 @@ class Tagger(Pipe):
         serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
         tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
         serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
+        morph_rules = dict(self.vocab.morphology.exc)
+        serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules)
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, exclude=tuple()):
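With this hunk, the payload produced by the tagger's `to_bytes` gains a `morph_rules` entry next to `tag_map`, both msgpack-encoded. A hedged round-trip sketch (assumes a pipeline `nlp` that has a tagger component):

    tagger = nlp.get_pipe("tagger")
    payload = tagger.to_bytes()   # now carries tag_map and morph_rules
    tagger.from_bytes(payload)    # restores both, as shown in the next hunk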
@@ -428,14 +425,18 @@ class Tagger(Pipe):

         def load_tag_map(b):
             tag_map = srsly.msgpack_loads(b)
-            self.vocab.morphology = Morphology(
-                self.vocab.strings, tag_map=tag_map,
-                lemmatizer=self.vocab.morphology.lemmatizer,
-                exc=self.vocab.morphology.exc)
+            self.vocab.morphology.load_tag_map(tag_map)

+        def load_morph_rules(b):
+            morph_rules = srsly.msgpack_loads(b)
+            self.vocab.morphology.load_morph_exceptions(morph_rules)
+
+        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
+                                           lemmatizer=self.vocab.morphology.lemmatizer)
         deserialize = {
             "vocab": lambda b: self.vocab.from_bytes(b),
             "tag_map": load_tag_map,
+            "morph_rules": load_morph_rules,
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: load_model(b),
         }
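Note that `from_bytes` now resets the morphology to a fresh `Morphology` with an empty tag map before deserializing, so the incoming `tag_map` and `morph_rules` replace the previous state rather than merging into whatever the vocab already held.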
@@ -444,9 +445,11 @@ class Tagger(Pipe):

     def to_disk(self, path, exclude=tuple()):
         tag_map = dict(sorted(self.vocab.morphology.tag_map.items()))
+        morph_rules = dict(self.vocab.morphology.exc)
         serialize = {
             "vocab": lambda p: self.vocab.to_disk(p),
             "tag_map": lambda p: srsly.write_msgpack(p, tag_map),
+            "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules),
             "model": lambda p: self.model.to_disk(p),
             "cfg": lambda p: srsly.write_json(p, self.cfg),
         }
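`to_disk` mirrors the byte serialization: `morph_rules` is written as its own msgpack file alongside `tag_map`, and `from_disk` below applies the same reset-then-load pattern as `from_bytes`.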
@@ -462,15 +465,19 @@ class Tagger(Pipe):

         def load_tag_map(p):
             tag_map = srsly.read_msgpack(p)
-            self.vocab.morphology = Morphology(
-                self.vocab.strings, tag_map=tag_map,
-                lemmatizer=self.vocab.morphology.lemmatizer,
-                exc=self.vocab.morphology.exc)
+            self.vocab.morphology.load_tag_map(tag_map)

+        def load_morph_rules(p):
+            morph_rules = srsly.read_msgpack(p)
+            self.vocab.morphology.load_morph_exceptions(morph_rules)
+
+        self.vocab.morphology = Morphology(self.vocab.strings, dict(),
+                                           lemmatizer=self.vocab.morphology.lemmatizer)
         deserialize = {
             "vocab": lambda p: self.vocab.from_disk(p),
             "cfg": lambda p: self.cfg.update(_load_cfg(p)),
             "tag_map": load_tag_map,
+            "morph_rules": load_morph_rules,
             "model": load_model,
         }
         util.from_disk(path, deserialize, exclude)
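The remaining hunks are from the tagger tests: the tag map and morph rules are now loaded through the new `Morphology` helpers instead of `add_label`, and the lemma exception is asserted both after training and after a disk round trip.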
@@ -17,6 +17,8 @@ def test_label_types():

 TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}

+MORPH_RULES = {"V": {"like": {"lemma": "luck"}}}
+
 TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
     ("Eat blue ham", {"tags": ["V", "J", "N"]}),
@@ -26,9 +28,9 @@ TRAIN_DATA = [
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
+    nlp.vocab.morphology.load_tag_map(TAG_MAP)
+    nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES)
     tagger = nlp.create_pipe("tagger")
-    for tag, values in TAG_MAP.items():
-        tagger.add_label(tag, values)
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
@@ -47,6 +49,7 @@ def test_overfitting_IO():
     assert doc[1].tag_ is "V"
     assert doc[2].tag_ is "J"
     assert doc[3].tag_ is "N"
+    assert doc[1].lemma_ == "luck"

    # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -57,3 +60,4 @@ def test_overfitting_IO():
     assert doc2[1].tag_ is "V"
     assert doc2[2].tag_ is "J"
     assert doc2[3].tag_ is "N"
+    assert doc[1].lemma_ == "luck"
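A hedged sketch of the behavior the updated test checks (names follow the test above; `nlp` is the trained pipeline, and the lemma comes from the `MORPH_RULES` exception for "like" tagged as "V"):

    import tempfile
    import spacy

    with tempfile.TemporaryDirectory() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = spacy.load(tmp_dir)        # from_disk restores tag_map and morph_rules
        doc2 = nlp2("I like green eggs")
        assert doc2[1].tag_ == "V"        # tagger still predicts the overfit tag
        assert doc2[1].lemma_ == "luck"   # morph rule survived serialization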