# spaCy/spacy/tests/matcher/test_pattern_validation.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.matcher import Matcher
from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
from spacy.errors import MatchPatternError
from spacy.util import get_json_validator, validate_json

# Every entry is a 3-tuple:
#   (pattern,
#    number of errors reported with full schema validation,
#    number of errors identified with the Matcher's minimal checks)

# Bad patterns flagged in all cases
_BAD_IN_ALL_CASES = [
    ([{"XX": "foo"}], 1, 1),
    ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
    ([{"_": "foo"}], 1, 1),
    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
    ([1, 2, 3], 3, 1),
]

# Bad patterns that slip past the minimal checks
_BAD_FOR_VALIDATION_ONLY = [
    # Bad patterns flagged outside of Matcher
    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
    # Bad patterns not flagged with minimal checks
    ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0),
    ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
    ([{"LENGTH": {"VALUE": 5}}], 1, 0),
    ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
    ([{"IS_DIGIT": -1}], 1, 0),
    ([{"ORTH": -1}], 1, 0),
]

# Good patterns
_GOOD = [
    ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
    ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
    ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
    ([{"LENGTH": 2}], 0, 0),
    ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
    ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
    ([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
    ([{"IS_SENT_START": True}], 0, 0),
    ([{"SENT_START": True}], 0, 0),
]

# Concatenation order matters: it fixes the order of the parametrized cases.
TEST_PATTERNS = _BAD_IN_ALL_CASES + _BAD_FOR_VALIDATION_ONLY + _GOOD

# Same 3-tuple layout as TEST_PATTERNS. These cases feed
# test_xfail_pattern_validation, which is marked xfail: the stated error
# count is not expected to match what schema validation reports.
XFAIL_TEST_PATTERNS = [
    ([{"orth": "foo"}], 0, 0),
]


@pytest.fixture
def validator():
    """Return the JSON validator built from TOKEN_PATTERN_SCHEMA."""
    schema_validator = get_json_validator(TOKEN_PATTERN_SCHEMA)
    return schema_validator


@pytest.mark.parametrize(
    "pattern",
    [
        [{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}],
    ],
)
def test_matcher_pattern_validation(en_vocab, pattern):
    """Adding an invalid pattern to a Matcher created with validate=True
    must raise MatchPatternError."""
    validating_matcher = Matcher(en_vocab, validate=True)
    # Matcher.add is given a list of patterns, so wrap the single pattern.
    patterns = [pattern]
    with pytest.raises(MatchPatternError):
        validating_matcher.add("TEST", patterns)


@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
def test_pattern_validation(validator, pattern, n_errors, _):
    """validate_json must report exactly n_errors errors for the pattern.

    The third tuple element (minimal-check error count) is unused here.
    """
    assert len(validate_json(pattern, validator)) == n_errors


@pytest.mark.xfail
@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
def test_xfail_pattern_validation(validator, pattern, n_errors, _):
    """Same check as test_pattern_validation, for cases expected to fail.

    The third tuple element (minimal-check error count) is unused here.
    """
    reported = validate_json(pattern, validator)
    assert len(reported) == n_errors


@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
    """Exercise Matcher.add on a Matcher created without validate=True.

    Patterns with n_min_errors > 0 must raise ValueError; patterns with no
    errors at all must be added without raising.
    """
    matcher = Matcher(en_vocab)
    if n_min_errors > 0:
        with pytest.raises(ValueError):
            matcher.add("TEST", [pattern])
        return
    if n_errors != 0:
        # Only full validation flags this pattern; what add() does with it
        # without validation is deliberately left unchecked.
        return
    matcher.add("TEST", [pattern])


def test_pattern_warnings(en_vocab):
    """Lower-case keys are accepted; an unrecognized subpattern key warns."""
    matcher = Matcher(en_vocab)
    # "regex" is normalized to upper case, just like the top-level "text"
    lowercase_keys = [[{"text": {"regex": "regex"}}]]
    matcher.add("TEST1", lowercase_keys)
    # a subpattern attribute that isn't recognized and processed must
    # trigger a UserWarning
    unknown_subkey = [[{"TEXT": {"XX": "xx"}}]]
    with pytest.warns(UserWarning):
        matcher.add("TEST2", unknown_subkey)
💫 Add token match pattern validation via JSON schemas (#3244) * Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale). 2019-02-12 14:47:26 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`import pytest`
			`from spacy.matcher import Matcher`
			`from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA`
			`from spacy.errors import MatchPatternError`
			`from spacy.util import get_json_validator, validate_json`

Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`# (pattern, num errors with validation, num errors identified with minimal`
			`# checks)`
			`TEST_PATTERNS = [`
			`# Bad patterns flagged in all cases`
			`([{"XX": "foo"}], 1, 1),`
			`([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),`
			`([{"IS_PUNCT": True, "OP": "$"}], 1, 1),`
			`([{"_": "foo"}], 1, 1),`
			`('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),`
			`([1, 2, 3], 3, 1),`
			`# Bad patterns flagged outside of Matcher`
			`([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),`
			`# Bad patterns not flagged with minimal checks`
Allow int values in token patterns (#4444) * Add missing int value option to top-level pattern validation in Matcher * Adjust existing tests accordingly * Add new test for valid pattern `{"LENGTH": int}` 2019-10-16 11:40:18 +00:00			`([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0),`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),`
			`([{"LENGTH": {"VALUE": 5}}], 1, 0),`
			`([{"TEXT": {"VALUE": "foo"}}], 1, 0),`
Allow int values in token patterns (#4444) * Add missing int value option to top-level pattern validation in Matcher * Adjust existing tests accordingly * Add new test for valid pattern `{"LENGTH": int}` 2019-10-16 11:40:18 +00:00			`([{"IS_DIGIT": -1}], 1, 0),`
			`([{"ORTH": -1}], 1, 0),`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`# Good patterns`
			`([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),`
			`([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),`
			`([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),`
Allow int values in token patterns (#4444) * Add missing int value option to top-level pattern validation in Matcher * Adjust existing tests accordingly * Add new test for valid pattern `{"LENGTH": int}` 2019-10-16 11:40:18 +00:00			`([{"LENGTH": 2}], 0, 0),`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),`
			`([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),`
			`([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),`
Normalize IS_SENT_START to SENT_START for Matcher (#5080) 2020-03-03 11:22:39 +00:00			`([{"IS_SENT_START": True}], 0, 0),`
			`([{"SENT_START": True}], 0, 0),`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`]`

			`XFAIL_TEST_PATTERNS = [([{"orth": "foo"}], 0, 0)]`

💫 Add token match pattern validation via JSON schemas (#3244) * Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale). 2019-02-12 14:47:26 +00:00
			`@pytest.fixture`
			`def validator():`
			`return get_json_validator(TOKEN_PATTERN_SCHEMA)`


			`@pytest.mark.parametrize(`
			`"pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]`
			`)`
			`def test_matcher_pattern_validation(en_vocab, pattern):`
			`matcher = Matcher(en_vocab, validate=True)`
			`with pytest.raises(MatchPatternError):`
Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522) * Implement new API for {Phrase}Matcher.add (backwards-compatible) * Update docs * Also update DependencyMatcher.add * Update internals * Rewrite tests to use new API * Add basic check for common mistake Raise error with suggestion if user likely passed in a pattern instead of a list of patterns * Fix typo [ci skip] 2019-10-25 20:21:08 +00:00			`matcher.add("TEST", [pattern])`
💫 Add token match pattern validation via JSON schemas (#3244) * Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale). 2019-02-12 14:47:26 +00:00

Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)`
			`def test_pattern_validation(validator, pattern, n_errors, _):`
💫 Add token match pattern validation via JSON schemas (#3244) * Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale). 2019-02-12 14:47:26 +00:00			`errors = validate_json(pattern, validator)`
			`assert len(errors) == n_errors`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00

			`@pytest.mark.xfail`
			`@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)`
			`def test_xfail_pattern_validation(validator, pattern, n_errors, _):`
			`errors = validate_json(pattern, validator)`
			`assert len(errors) == n_errors`


			`@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)`
			`def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):`
			`matcher = Matcher(en_vocab)`
			`if n_min_errors > 0:`
			`with pytest.raises(ValueError):`
Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522) * Implement new API for {Phrase}Matcher.add (backwards-compatible) * Update docs * Also update DependencyMatcher.add * Update internals * Rewrite tests to use new API * Add basic check for common mistake Raise error with suggestion if user likely passed in a pattern instead of a list of patterns * Fix typo [ci skip] 2019-10-25 20:21:08 +00:00			`matcher.add("TEST", [pattern])`
Improve token pattern checking without validation (#4105) * Fix typo in rule-based matching docs * Improve token pattern checking without validation Add more detailed token pattern checks without full JSON pattern validation and provide more detailed error messages. Addresses #4070 (also related: #4063, #4100). * Check whether top-level attributes in patterns and attr for PhraseMatcher are in token pattern schema * Check whether attribute value types are supported in general (as opposed to per attribute with full validation) * Report various internal error types (OverflowError, AttributeError, KeyError) as ValueError with standard error messages * Check for tagger/parser in PhraseMatcher pipeline for attributes TAG, POS, LEMMA, and DEP * Add error messages with relevant details on how to use validate=True or nlp() instead of nlp.make_doc() * Support attr=TEXT for PhraseMatcher * Add NORM to schema * Expand tests for pattern validation, Matcher, PhraseMatcher, and EntityRuler * Remove unnecessary .keys() * Rephrase error messages * Add another type check to Matcher Add another type check to Matcher for more understandable error messages in some rare cases. * Support phrase_matcher_attr=TEXT for EntityRuler * Don't use spacy.errors in examples and bin scripts * Fix error code * Auto-format Also try get Azure pipelines to finally start a build :( * Update errors.py Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2019-08-21 12:00:37 +00:00			`elif n_errors == 0:`
Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522) * Implement new API for {Phrase}Matcher.add (backwards-compatible) * Update docs * Also update DependencyMatcher.add * Update internals * Rewrite tests to use new API * Add basic check for common mistake Raise error with suggestion if user likely passed in a pattern instead of a list of patterns * Fix typo [ci skip] 2019-10-25 20:21:08 +00:00			`matcher.add("TEST", [pattern])`
Add warning when Matcher subpattern is discarded (#5873) * Add a warning when a subpattern is not processed and discarded * Normalize subpattern attribute/operator keys to upper case like top-level attributes 2020-08-05 12:56:14 +00:00

			`def test_pattern_warnings(en_vocab):`
			`matcher = Matcher(en_vocab)`
			`# normalize "regex" to upper like "text"`
			`matcher.add("TEST1", [[{"text": {"regex": "regex"}}]])`
			`# warn if subpattern attribute isn't recognized and processed`
			`with pytest.warns(UserWarning):`
			`matcher.add("TEST2", [[{"TEXT": {"XX": "xx"}}]])`