2019-02-12 14:47:26 +00:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA
|
|
|
|
from spacy.errors import MatchPatternError
|
|
|
|
from spacy.util import get_json_validator, validate_json
|
|
|
|
|
2019-08-21 12:00:37 +00:00
|
|
|
# Each entry is a 3-tuple:
#   (pattern,
#    number of errors reported by JSON-schema validation,
#    number of errors caught by the Matcher's minimal built-in checks)
TEST_PATTERNS = [
    # --- Invalid patterns that are rejected in every case ---
    ([{"XX": "foo"}], 1, 1),
    ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
    ([{"_": "foo"}], 1, 1),
    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
    ([1, 2, 3], 3, 1),
    # --- Invalid patterns that are only rejected outside of the Matcher ---
    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
    # --- Invalid patterns that slip past the minimal checks ---
    ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0),
    ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
    ([{"LENGTH": {"VALUE": 5}}], 1, 0),
    ([{"TEXT": {"VALUE": "foo"}}], 1, 0),
    ([{"IS_DIGIT": -1}], 1, 0),
    ([{"ORTH": -1}], 1, 0),
    # --- Valid patterns: no errors expected anywhere ---
    ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
    ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
    ([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
    ([{"LENGTH": 2}], 0, 0),
    ([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
    ([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
    (
        [{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}],
        0,
        0,
    ),
    ([{"IS_SENT_START": True}], 0, 0),
    ([{"SENT_START": True}], 0, 0),
]
|
|
|
|
|
|
|
|
# Same (pattern, n_errors, n_min_errors) layout as TEST_PATTERNS, but for
# cases consumed by the xfail-marked validation test.
# NOTE(review): presumably the lowercase attribute name is what the schema
# does not yet accept -- confirm against TOKEN_PATTERN_SCHEMA.
XFAIL_TEST_PATTERNS = [
    ([{"orth": "foo"}], 0, 0),
]
|
|
|
|
|
2019-02-12 14:47:26 +00:00
|
|
|
|
|
|
|
@pytest.fixture
def validator():
    """Return a JSON validator built from ``TOKEN_PATTERN_SCHEMA``."""
    schema_validator = get_json_validator(TOKEN_PATTERN_SCHEMA)
    return schema_validator
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "pattern",
    [
        [{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}],
    ],
)
def test_matcher_pattern_validation(en_vocab, pattern):
    """A Matcher created with ``validate=True`` raises on a bad pattern."""
    validating_matcher = Matcher(en_vocab, validate=True)
    patterns = [pattern]
    # Only the add() call is expected to raise, so it alone sits in the
    # context manager.
    with pytest.raises(MatchPatternError):
        validating_matcher.add("TEST", patterns)
|
2019-02-12 14:47:26 +00:00
|
|
|
|
|
|
|
|
2019-08-21 12:00:37 +00:00
|
|
|
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
def test_pattern_validation(validator, pattern, n_errors, _):
    """Schema validation reports exactly ``n_errors`` errors for ``pattern``.

    The third tuple element (minimal-check error count) is unused here.
    """
    found = validate_json(pattern, validator)
    error_count = len(found)
    assert error_count == n_errors
|
2019-08-21 12:00:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.xfail
@pytest.mark.parametrize("pattern,n_errors,_", XFAIL_TEST_PATTERNS)
def test_xfail_pattern_validation(validator, pattern, n_errors, _):
    """Same check as the regular schema test, for cases marked xfail.

    The third tuple element (minimal-check error count) is unused here.
    """
    found = validate_json(pattern, validator)
    error_count = len(found)
    assert error_count == n_errors
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("pattern,n_errors,n_min_errors", TEST_PATTERNS)
def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
    """Exercise the checks a Matcher performs without ``validate=True``.

    Patterns with ``n_min_errors > 0`` must raise ``ValueError`` on add;
    patterns with no errors at all must be added without raising. Patterns
    that are invalid but not caught by the minimal checks are not added.
    """
    plain_matcher = Matcher(en_vocab)
    patterns = [pattern]

    if n_min_errors > 0:
        with pytest.raises(ValueError):
            plain_matcher.add("TEST", patterns)
        return

    if n_errors == 0:
        plain_matcher.add("TEST", patterns)
|
2020-08-05 12:56:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_pattern_warnings(en_vocab):
    """Adding a pattern with an unknown nested key emits a ``UserWarning``."""
    lowercase_regex_pattern = [{"text": {"regex": "regex"}}]
    unknown_subkey_pattern = [{"TEXT": {"XX": "xx"}}]
    rule_matcher = Matcher(en_vocab)

    # A lowercase "regex" key should be upper-cased just like "text" is.
    rule_matcher.add("TEST1", [lowercase_regex_pattern])

    # A sub-pattern key that is not recognized and processed must warn.
    with pytest.warns(UserWarning):
        rule_matcher.add("TEST2", [unknown_subkey_pattern])
|