# spaCy/spacy/tests/pipeline/test_attributeruler.py

import pytest
import numpy
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy import util, registry

from ..util import get_doc, make_tempdir


@pytest.fixture
def nlp():
    """Return a newly constructed ``English`` language object."""
    pipeline = English()
    return pipeline


@pytest.fixture
def pattern_dicts():
    """Return three attribute-ruler pattern dicts used across the tests.

    Each dict has "patterns" and "attrs" keys; only the last one also
    carries an explicit "index".
    """
    # matches either "a" or "irrelevant" and sets lemma and morphology together
    lemma_and_morph = {
        "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
        "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
    }
    # one pattern sets the lemma
    lemma_only = {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}
    # another pattern sets the morphology
    morph_only = {
        "patterns": [[{"ORTH": "test"}]],
        "attrs": {"MORPH": "Case=Nom|Number=Sing"},
        "index": 0,
    }
    return [lemma_and_morph, lemma_only, morph_only]


@registry.assets("attribute_ruler_patterns")
def attribute_ruler_patterns():
    """Asset registered as "attribute_ruler_patterns".

    Returns a list of three pattern dicts with the same literal content as
    the ``pattern_dicts`` fixture in this module.
    """
    rules = []
    # "a" / "irrelevant": lemma and morphology in a single rule
    rules.append(
        {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        }
    )
    # one pattern sets the lemma
    rules.append({"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}})
    # another pattern sets the morphology
    rules.append(
        {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        }
    )
    return rules


@pytest.fixture
def tag_map():
    """Return a two-entry tag map keyed by the "." and "," tags."""
    period_attrs = {"POS": "PUNCT", "PunctType": "peri"}
    comma_attrs = {"POS": "PUNCT", "PunctType": "comm"}
    return {".": period_attrs, ",": comma_attrs}


@pytest.fixture
def morph_rules():
    """Return morph rules with a single entry: tag "DT", word "the"."""
    the_attrs = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": the_attrs}}


def test_attributeruler_init(nlp, pattern_dicts):
    """Patterns added one at a time through ``add`` are applied to a doc."""
    ruler = nlp.add_pipe("attribute_ruler")
    for entry in pattern_dicts:
        ruler.add(**entry)

    doc = nlp("This is a test.")
    a_token = doc[2]
    test_token = doc[3]
    assert a_token.lemma_ == "the"
    assert a_token.morph_ == "Case=Nom|Number=Plur"
    assert test_token.lemma_ == "cat"
    assert test_token.morph_ == "Case=Nom|Number=Sing"


def test_attributeruler_init_patterns(nlp, pattern_dicts):
    """Patterns supplied through the pipe config are applied to a doc.

    Exercised twice: once with the list passed directly and once with the
    list resolved from the "attribute_ruler_patterns" asset.
    """

    def check_expected_attrs(doc):
        # the same four expectations hold for both configurations
        assert doc[2].lemma_ == "the"
        assert doc[2].morph_ == "Case=Nom|Number=Plur"
        assert doc[3].lemma_ == "cat"
        assert doc[3].morph_ == "Case=Nom|Number=Sing"

    sentence = "This is a test."

    # initialize with patterns
    direct_config = {"pattern_dicts": pattern_dicts}
    nlp.add_pipe("attribute_ruler", config=direct_config)
    check_expected_attrs(nlp(sentence))
    nlp.remove_pipe("attribute_ruler")

    # initialize with patterns from asset
    asset_config = {"pattern_dicts": {"@assets": "attribute_ruler_patterns"}}
    nlp.add_pipe("attribute_ruler", config=asset_config)
    check_expected_attrs(nlp(sentence))


def test_attributeruler_tag_map(nlp, tag_map):
    """Rules loaded from a tag map set POS and morphology on the "." token only."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_tag_map(tag_map)

    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = get_doc(nlp.vocab, words=words, tags=tags)
    doc = ruler(doc)

    for position, token in enumerate(doc):
        if position != 4:
            # no tag-map entry applies to these tokens
            assert token.pos_ == ""
            assert token.morph_ == ""
        else:
            # the final token carries the "." tag
            assert token.pos_ == "PUNCT"
            assert token.morph_ == "PunctType=peri"


def test_attributeruler_morph_rules(nlp, morph_rules):
    """Rules loaded from morph rules set POS, lemma and morphology on "the" only."""
    ruler = AttributeRuler(nlp.vocab)
    ruler.load_from_morph_rules(morph_rules)

    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = get_doc(nlp.vocab, words=words, tags=tags)
    doc = ruler(doc)

    for position, token in enumerate(doc):
        if position == 2:
            # "the" tagged "DT" is the only token with a morph rule
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert token.morph_ == "Case=Nom"
        else:
            assert token.pos_ == ""
            assert token.morph_ == ""


def test_attributeruler_indices(nlp):
    """The ``index`` argument selects which token of a match is modified.

    Covers index 0, index 1 and index -1, then checks that indices outside
    the two-token match raise ``ValueError`` when the pipeline runs.
    """

    def a_test():
        # fresh two-token pattern list for every call to ``add``
        return [[{"ORTH": "a"}, {"ORTH": "test"}]]

    ruler = nlp.add_pipe("attribute_ruler")
    plural_attrs = {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}
    ruler.add(a_test(), plural_attrs, index=0)
    singular_attrs = {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}
    ruler.add([[{"ORTH": "This"}, {"ORTH": "is"}]], singular_attrs, index=1)
    ruler.add(a_test(), {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    doc = nlp(text)

    for position, token in enumerate(doc):
        if position == 1:
            assert token.lemma_ == "was"
            assert token.morph_ == "Case=Nom|Number=Sing"
        elif position == 2:
            assert token.lemma_ == "the"
            assert token.morph_ == "Case=Nom|Number=Plur"
        elif position == 3:
            assert token.lemma_ == "cat"
        else:
            assert token.morph_ == ""

    # raises an error when trying to modify a token outside of the match
    ruler.add(a_test(), {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
        nlp(text)

    # an index far beyond the end of the match is expected to raise as well
    # NOTE(review): the index=2 rule added just above is still registered at
    # this point, so this call would raise even without the index=10 rule.
    ruler.add(a_test(), {"LEMMA": "cat"}, index=10)
    with pytest.raises(ValueError):
        nlp(text)


def test_attributeruler_patterns_prop(nlp, pattern_dicts):
    """``AttributeRuler.patterns`` reports back what ``add_patterns`` received.

    Compares "patterns" and "attrs" for every entry, and "index" for every
    input entry that specifies one.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)

    # zip() stops at the shorter sequence, so a missing or extra entry in
    # ``a.patterns`` would otherwise go unnoticed.
    assert len(a.patterns) == len(pattern_dicts)

    for p1, p2 in zip(pattern_dicts, a.patterns):
        assert p1["patterns"] == p2["patterns"]
        assert p1["attrs"] == p2["attrs"]
        # Test key presence rather than truthiness: the only index used by
        # the fixture is 0, which is falsy and was previously never compared.
        if "index" in p1:
            assert p1["index"] == p2["index"]


def test_attributeruler_serialize(nlp, pattern_dicts):
    """The attribute ruler survives a bytes round-trip and a disk round-trip.

    For both round-trips the serialized bytes must match the original
    component and the reloaded component must produce the same ORTH, LEMMA
    and MORPH values for the sample text.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)

    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # array_equal only returns a bool; without ``assert`` the comparison
    # result was silently discarded and nothing was checked.
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
Add AttributeRuler for token attribute exceptions (#5842) * Add AttributeRuler for token attribute exceptions Add the `AttributeRuler` to handle exceptions for token-level attributes. The `AttributeRuler` uses `Matcher` patterns to identify target spans and applies the specified attributes to the token at the provided index in the matched span. A negative index can be used to index from the end of the matched span. The retokenizer is used to "merge" the individual tokens and assign them the provided attributes. Helper functions can import existing tag maps and morph rules to the corresponding `Matcher` patterns. There is an additional minor bug fix for `MORPH` attributes in the retokenizer to correctly normalize the values and to handle `MORPH` alongside `_` in an attrs dict. * Fix default name * Update name in error message * Extend AttributeRuler functionality * Add option to initialize with a dict of AttributeRuler patterns * Instead of silently discarding overlapping matches (the default behavior for the retokenizer if only the attrs differ), split the matches into disjoint sets and retokenize each set separately. This allows, for instance, one pattern to set the POS and another pattern to set the lemma. (If two matches modify the same attribute, it looks like the attrs are applied in the order they were added, but it may not be deterministic?) * Improve types * Sort spans before processing * Fix index boundaries in Span * Refactor retokenizer to separate attrs methods Add top-level `normalize_token_attrs` and `set_token_attrs` methods. * Update AttributeRuler to use refactored methods Update `AttributeRuler` to replace use of full retokenizer with only the relevant methods for normalizing and setting attributes for a single token. 
* Update spacy/pipeline/attributeruler.py Co-authored-by: Ines Montani <ines@ines.io> * Make API more similar to EntityRuler * Add `AttributeRuler.add_patterns` to add patterns from a list of dicts * Return list of dicts as property `AttributeRuler.patterns` * Make attrs_unnormed private * Add test loading patterns from assets * Revert "Fix index boundaries in Span" This reverts commit 8f8a5c33861bff2d7c3f19914e289139ab3a2c28. * Add Span index boundary checks (#5861) * Add Span index boundary checks * Return Span-specific IndexError in all cases * Simplify and fix if/else Co-authored-by: Ines Montani <ines@ines.io> 2020-08-04 15:02:39 +00:00			`import pytest`
			`import numpy`
			`from spacy.lang.en import English`
			`from spacy.pipeline import AttributeRuler`
			`from spacy import util, registry`

			`from ..util import get_doc, make_tempdir`


			`@pytest.fixture`
			`def nlp():`
			`return English()`


			`@pytest.fixture`
			`def pattern_dicts():`
			`return [`
			`{`
			`"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],`
			`"attrs": {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`},`
			`# one pattern sets the lemma`
			`{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},`
			`# another pattern sets the morphology`
			`{`
			`"patterns": [[{"ORTH": "test"}]],`
			`"attrs": {"MORPH": "Case=Nom\|Number=Sing"},`
			`"index": 0,`
			`},`
			`]`


			`@registry.assets("attribute_ruler_patterns")`
			`def attribute_ruler_patterns():`
			`return [`
			`{`
			`"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],`
			`"attrs": {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`},`
			`# one pattern sets the lemma`
			`{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},`
			`# another pattern sets the morphology`
			`{`
			`"patterns": [[{"ORTH": "test"}]],`
			`"attrs": {"MORPH": "Case=Nom\|Number=Sing"},`
			`"index": 0,`
			`},`
			`]`


			`@pytest.fixture`
			`def tag_map():`
			`return {`
			`".": {"POS": "PUNCT", "PunctType": "peri"},`
			`",": {"POS": "PUNCT", "PunctType": "comm"},`
			`}`


			`@pytest.fixture`
			`def morph_rules():`
			`return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}`


			`def test_attributeruler_init(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`for p in pattern_dicts:`
			`a.add(**p)`

			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert doc[2].morph_ == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert doc[3].morph_ == "Case=Nom\|Number=Sing"`


			`def test_attributeruler_init_patterns(nlp, pattern_dicts):`
			`# initialize with patterns`
Tidy up and auto-format 2020-08-05 14:00:59 +00:00			`nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})`
Add AttributeRuler for token attribute exceptions (#5842) * Add AttributeRuler for token attribute exceptions Add the `AttributeRuler` to handle exceptions for token-level attributes. The `AttributeRuler` uses `Matcher` patterns to identify target spans and applies the specified attributes to the token at the provided index in the matched span. A negative index can be used to index from the end of the matched span. The retokenizer is used to "merge" the individual tokens and assign them the provided attributes. Helper functions can import existing tag maps and morph rules to the corresponding `Matcher` patterns. There is an additional minor bug fix for `MORPH` attributes in the retokenizer to correctly normalize the values and to handle `MORPH` alongside `_` in an attrs dict. * Fix default name * Update name in error message * Extend AttributeRuler functionality * Add option to initialize with a dict of AttributeRuler patterns * Instead of silently discarding overlapping matches (the default behavior for the retokenizer if only the attrs differ), split the matches into disjoint sets and retokenize each set separately. This allows, for instance, one pattern to set the POS and another pattern to set the lemma. (If two matches modify the same attribute, it looks like the attrs are applied in the order they were added, but it may not be deterministic?) * Improve types * Sort spans before processing * Fix index boundaries in Span * Refactor retokenizer to separate attrs methods Add top-level `normalize_token_attrs` and `set_token_attrs` methods. * Update AttributeRuler to use refactored methods Update `AttributeRuler` to replace use of full retokenizer with only the relevant methods for normalizing and setting attributes for a single token. 
* Update spacy/pipeline/attributeruler.py Co-authored-by: Ines Montani <ines@ines.io> * Make API more similar to EntityRuler * Add `AttributeRuler.add_patterns` to add patterns from a list of dicts * Return list of dicts as property `AttributeRuler.patterns` * Make attrs_unnormed private * Add test loading patterns from assets * Revert "Fix index boundaries in Span" This reverts commit 8f8a5c33861bff2d7c3f19914e289139ab3a2c28. * Add Span index boundary checks (#5861) * Add Span index boundary checks * Return Span-specific IndexError in all cases * Simplify and fix if/else Co-authored-by: Ines Montani <ines@ines.io> 2020-08-04 15:02:39 +00:00			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert doc[2].morph_ == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert doc[3].morph_ == "Case=Nom\|Number=Sing"`
			`nlp.remove_pipe("attribute_ruler")`
			`# initialize with patterns from asset`
Tidy up and auto-format 2020-08-05 14:00:59 +00:00			`nlp.add_pipe(`
			`"attribute_ruler",`
			`config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},`
			`)`
Add AttributeRuler for token attribute exceptions (#5842) * Add AttributeRuler for token attribute exceptions Add the `AttributeRuler` to handle exceptions for token-level attributes. The `AttributeRuler` uses `Matcher` patterns to identify target spans and applies the specified attributes to the token at the provided index in the matched span. A negative index can be used to index from the end of the matched span. The retokenizer is used to "merge" the individual tokens and assign them the provided attributes. Helper functions can import existing tag maps and morph rules to the corresponding `Matcher` patterns. There is an additional minor bug fix for `MORPH` attributes in the retokenizer to correctly normalize the values and to handle `MORPH` alongside `_` in an attrs dict. * Fix default name * Update name in error message * Extend AttributeRuler functionality * Add option to initialize with a dict of AttributeRuler patterns * Instead of silently discarding overlapping matches (the default behavior for the retokenizer if only the attrs differ), split the matches into disjoint sets and retokenize each set separately. This allows, for instance, one pattern to set the POS and another pattern to set the lemma. (If two matches modify the same attribute, it looks like the attrs are applied in the order they were added, but it may not be deterministic?) * Improve types * Sort spans before processing * Fix index boundaries in Span * Refactor retokenizer to separate attrs methods Add top-level `normalize_token_attrs` and `set_token_attrs` methods. * Update AttributeRuler to use refactored methods Update `AttributeRuler` to replace use of full retokenizer with only the relevant methods for normalizing and setting attributes for a single token. 
* Update spacy/pipeline/attributeruler.py Co-authored-by: Ines Montani <ines@ines.io> * Make API more similar to EntityRuler * Add `AttributeRuler.add_patterns` to add patterns from a list of dicts * Return list of dicts as property `AttributeRuler.patterns` * Make attrs_unnormed private * Add test loading patterns from assets * Revert "Fix index boundaries in Span" This reverts commit 8f8a5c33861bff2d7c3f19914e289139ab3a2c28. * Add Span index boundary checks (#5861) * Add Span index boundary checks * Return Span-specific IndexError in all cases * Simplify and fix if/else Co-authored-by: Ines Montani <ines@ines.io> 2020-08-04 15:02:39 +00:00			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert doc[2].morph_ == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert doc[3].morph_ == "Case=Nom\|Number=Sing"`


			`def test_attributeruler_tag_map(nlp, tag_map):`
			`a = AttributeRuler(nlp.vocab)`
			`a.load_from_tag_map(tag_map)`
			`doc = get_doc(`
			`nlp.vocab,`
			`words=["This", "is", "a", "test", "."],`
			`tags=["DT", "VBZ", "DT", "NN", "."],`
			`)`
			`doc = a(doc)`

			`for i in range(len(doc)):`
			`if i == 4:`
			`assert doc[i].pos_ == "PUNCT"`
			`assert doc[i].morph_ == "PunctType=peri"`
			`else:`
			`assert doc[i].pos_ == ""`
			`assert doc[i].morph_ == ""`


			`def test_attributeruler_morph_rules(nlp, morph_rules):`
			`a = AttributeRuler(nlp.vocab)`
			`a.load_from_morph_rules(morph_rules)`
			`doc = get_doc(`
			`nlp.vocab,`
			`words=["This", "is", "the", "test", "."],`
			`tags=["DT", "VBZ", "DT", "NN", "."],`
			`)`
			`doc = a(doc)`

			`for i in range(len(doc)):`
			`if i != 2:`
			`assert doc[i].pos_ == ""`
			`assert doc[i].morph_ == ""`
			`else:`
			`assert doc[2].pos_ == "DET"`
			`assert doc[2].lemma_ == "a"`
			`assert doc[2].morph_ == "Case=Nom"`


			`def test_attributeruler_indices(nlp):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add(`
			`[[{"ORTH": "a"}, {"ORTH": "test"}]],`
			`{"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`index=0,`
			`)`
			`a.add(`
			`[[{"ORTH": "This"}, {"ORTH": "is"}]],`
			`{"LEMMA": "was", "MORPH": "Case=Nom\|Number=Sing"},`
			`index=1,`
			`)`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)`

			`text = "This is a test."`
			`doc = nlp(text)`

			`for i in range(len(doc)):`
			`if i == 1:`
			`assert doc[i].lemma_ == "was"`
			`assert doc[i].morph_ == "Case=Nom\|Number=Sing"`
			`elif i == 2:`
			`assert doc[i].lemma_ == "the"`
			`assert doc[i].morph_ == "Case=Nom\|Number=Plur"`
			`elif i == 3:`
			`assert doc[i].lemma_ == "cat"`
			`else:`
			`assert doc[i].morph_ == ""`

			`# raises an error when trying to modify a token outside of the match`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)`
			`with pytest.raises(ValueError):`
			`doc = nlp(text)`

			`# raises an error when trying to modify a token outside of the match`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)`
			`with pytest.raises(ValueError):`
			`doc = nlp(text)`


			`def test_attributeruler_patterns_prop(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add_patterns(pattern_dicts)`

			`for p1, p2 in zip(pattern_dicts, a.patterns):`
			`assert p1["patterns"] == p2["patterns"]`
			`assert p1["attrs"] == p2["attrs"]`
			`if p1.get("index"):`
			`assert p1["index"] == p2["index"]`


			`def test_attributeruler_serialize(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add_patterns(pattern_dicts)`

			`text = "This is a test."`
			`attrs = ["ORTH", "LEMMA", "MORPH"]`
			`doc = nlp(text)`

			`# bytes roundtrip`
			`a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())`
			`assert a.to_bytes() == a_reloaded.to_bytes()`
			`doc1 = a_reloaded(nlp.make_doc(text))`
			`numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))`

			`# disk roundtrip`
			`with make_tempdir() as tmp_dir:`
			`nlp.to_disk(tmp_dir)`
			`nlp2 = util.load_model_from_path(tmp_dir)`
			`doc2 = nlp2(text)`
			`assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()`
			`assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))`