2020-08-04 15:02:39 +00:00
|
|
|
import pytest
|
|
|
|
import numpy
|
2020-09-09 08:31:03 +00:00
|
|
|
from spacy.training import Example
|
2020-08-04 15:02:39 +00:00
|
|
|
from spacy.lang.en import English
|
|
|
|
from spacy.pipeline import AttributeRuler
|
|
|
|
from spacy import util, registry
|
2020-09-21 18:43:54 +00:00
|
|
|
from spacy.tokens import Doc
|
2020-08-04 15:02:39 +00:00
|
|
|
|
2020-09-21 18:43:54 +00:00
|
|
|
from ..util import make_tempdir
|
2020-08-04 15:02:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def nlp():
    """Provide a newly constructed ``English`` language object to each test."""
    pipeline = English()
    return pipeline
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def pattern_dicts():
    """Return three attribute-ruler pattern dicts used across the tests.

    The first sets lemma and morphology for "a" / "irrelevant"; the second
    and third both match "test", one setting the lemma and the other the
    morphology (with an explicit ``index`` of 0).
    """
    article_rule = dict(
        patterns=[[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
        attrs={"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
    )
    # one pattern sets the lemma
    lemma_rule = dict(patterns=[[{"ORTH": "test"}]], attrs={"LEMMA": "cat"})
    # another pattern sets the morphology
    morph_rule = dict(
        patterns=[[{"ORTH": "test"}]],
        attrs={"MORPH": "Case=Nom|Number=Sing"},
        index=0,
    )
    return [article_rule, lemma_rule, morph_rule]
|
|
|
|
|
|
|
|
|
2020-09-03 15:31:14 +00:00
|
|
|
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
    """Return the pattern dicts registered under "attribute_ruler_patterns".

    The contents mirror the ``pattern_dicts`` fixture so that the
    config-driven test can load the same rules through the ``@misc`` registry.
    """
    rules = []
    rules.append(
        {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        }
    )
    # one pattern sets the lemma
    rules.append({"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}})
    # another pattern sets the morphology
    rules.append(
        {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        }
    )
    return rules
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def tag_map():
    """Return a small tag map covering the "." and "," tags."""
    period = {"POS": "PUNCT", "PunctType": "peri"}
    comma = {"POS": "PUNCT", "PunctType": "comm"}
    return {".": period, ",": comma}
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def morph_rules():
    """Return morph rules with a single entry: tag "DT", word "the"."""
    the_attrs = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": the_attrs}}
|
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_init(nlp, pattern_dicts):
    """Add each pattern dict via ``add`` and check the resulting annotations."""
    ruler = nlp.add_pipe("attribute_ruler")
    for entry in pattern_dicts:
        ruler.add(**entry)

    doc = nlp("This is a test.")
    article, noun = doc[2], doc[3]
    assert article.lemma_ == "the"
    assert article.morph_ == "Case=Nom|Number=Plur"
    assert noun.lemma_ == "cat"
    assert noun.morph_ == "Case=Nom|Number=Sing"
    for attr_name in ("LEMMA", "MORPH"):
        assert doc.has_annotation(attr_name)
|
2020-08-04 15:02:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_init_patterns(nlp, pattern_dicts):
    """Configure the ruler with patterns given inline and via the registry.

    The same annotations are expected in both cases.
    """
    sentence = "This is a test."

    def check_annotations(doc):
        # Expected output of the shared pattern set on "This is a test."
        assert doc[2].lemma_ == "the"
        assert doc[2].morph_ == "Case=Nom|Number=Plur"
        assert doc[3].lemma_ == "cat"
        assert doc[3].morph_ == "Case=Nom|Number=Sing"
        assert doc.has_annotation("LEMMA")
        assert doc.has_annotation("MORPH")

    # initialize with patterns
    inline_config = {"pattern_dicts": pattern_dicts}
    nlp.add_pipe("attribute_ruler", config=inline_config)
    check_annotations(nlp(sentence))
    nlp.remove_pipe("attribute_ruler")

    # initialize with patterns from asset
    registry_config = {"pattern_dicts": {"@misc": "attribute_ruler_patterns"}}
    nlp.add_pipe("attribute_ruler", config=registry_config)
    check_annotations(nlp(sentence))
|
2020-08-04 15:02:39 +00:00
|
|
|
|
|
|
|
|
2020-08-26 13:39:30 +00:00
|
|
|
def test_attributeruler_score(nlp, pattern_dicts):
    """Check the lemma and morph accuracy reported by ``nlp.evaluate``."""
    sentence = "This is a test."
    # initialize with patterns
    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
    doc = nlp(sentence)
    article, noun = doc[2], doc[3]
    assert article.lemma_ == "the"
    assert article.morph_ == "Case=Nom|Number=Plur"
    assert noun.lemma_ == "cat"
    assert noun.morph_ == "Case=Nom|Number=Sing"

    gold = {"lemmas": ["this", "is", "a", "cat", "."]}
    example = Example.from_dict(nlp.make_doc(sentence), gold)
    scores = nlp.evaluate([example])
    # "cat" is the only correct lemma
    assert scores["lemma_acc"] == pytest.approx(0.2)
    # the empty morphs are correct
    assert scores["morph_acc"] == pytest.approx(0.6)
|
|
|
|
|
|
|
|
|
2020-08-28 18:45:19 +00:00
|
|
|
def test_attributeruler_rule_order(nlp):
    """Two rules match the same VBZ token; the later-added POS is expected."""
    ruler = AttributeRuler(nlp.vocab)
    vbz_pattern_verb = {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}}
    vbz_pattern_noun = {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}}
    ruler.add_patterns([vbz_pattern_verb, vbz_pattern_noun])

    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(nlp.vocab, words=words, tags=tags))
    assert doc[1].pos_ == "NOUN"
|
|
|
|
|
|
|
|
|
2020-08-04 15:02:39 +00:00
|
|
|
def test_attributeruler_tag_map(nlp, tag_map):
    """Load the tag map and check that only the final "." token is annotated."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_tag_map(tag_map)

    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(nlp.vocab, words=words, tags=tags))

    for position, token in enumerate(doc):
        if position != 4:
            # Tags absent from the tag map leave the token unannotated.
            assert token.pos_ == ""
            assert token.morph_ == ""
        else:
            assert token.pos_ == "PUNCT"
            assert token.morph_ == "PunctType=peri"
|
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_morph_rules(nlp, morph_rules):
    """Load the morph rules and check that only the DT token "the" is annotated."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_morph_rules(morph_rules)

    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(nlp.vocab, words=words, tags=tags))

    for position, token in enumerate(doc):
        if position == 2:
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert token.morph_ == "Case=Nom"
            continue
        # Every other token is expected to stay unannotated.
        assert token.pos_ == ""
        assert token.morph_ == ""
|
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_indices(nlp):
    """Check which token within a match receives the attrs for each ``index``.

    Index 0 and 1 address the first and second matched token, -1 the last;
    indices outside the match are expected to raise ``ValueError`` when the
    pipeline runs.
    """
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add(
        patterns=[[{"ORTH": "a"}, {"ORTH": "test"}]],
        attrs={"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        index=0,
    )
    ruler.add(
        patterns=[[{"ORTH": "This"}, {"ORTH": "is"}]],
        attrs={"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
        index=1,
    )
    ruler.add(
        patterns=[[{"ORTH": "a"}, {"ORTH": "test"}]],
        attrs={"LEMMA": "cat"},
        index=-1,
    )

    text = "This is a test."
    doc = nlp(text)
    for position, token in enumerate(doc):
        if position == 1:
            assert token.lemma_ == "was"
            assert token.morph_ == "Case=Nom|Number=Sing"
        elif position == 2:
            assert token.lemma_ == "the"
            assert token.morph_ == "Case=Nom|Number=Plur"
        elif position == 3:
            assert token.lemma_ == "cat"
        else:
            assert token.morph_ == ""

    # raises an error when trying to modify a token outside of the match
    ruler.add(
        patterns=[[{"ORTH": "a"}, {"ORTH": "test"}]],
        attrs={"LEMMA": "cat"},
        index=2,
    )
    with pytest.raises(ValueError):
        nlp(text)

    # raises an error when trying to modify a token outside of the match
    ruler.add(
        patterns=[[{"ORTH": "a"}, {"ORTH": "test"}]],
        attrs={"LEMMA": "cat"},
        index=10,
    )
    with pytest.raises(ValueError):
        nlp(text)
|
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_patterns_prop(nlp, pattern_dicts):
    """Check that ``patterns`` reports back what was passed to ``add_patterns``."""
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    reported = a.patterns
    # zip() stops at the shorter input, so a missing pattern would otherwise
    # go unnoticed.
    assert len(reported) == len(pattern_dicts)
    for p1, p2 in zip(pattern_dicts, reported):
        assert p1["patterns"] == p2["patterns"]
        assert p1["attrs"] == p2["attrs"]
        # Test for key presence rather than truthiness: the fixture's only
        # explicit index is 0, which a truthiness check would skip.
        if "index" in p1:
            assert p1["index"] == p2["index"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_attributeruler_serialize(nlp, pattern_dicts):
    """Round-trip the ruler through bytes and disk.

    After each round trip the serialized bytes, the reported patterns and the
    annotations produced on the sample text are compared with the original.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)
    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # This comparison was previously evaluated without `assert`, so a
    # mismatch could never fail the test.
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns
    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
        assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns
|