# spaCy/spacy/tests/matcher/test_matcher_logic.py
#
# 117 lines
# 3.7 KiB
# Python

# coding: utf-8
from __future__ import unicode_literals
import pytest
import re
from spacy.matcher import Matcher
from spacy.tokens import Doc
# Token-level Matcher patterns. Each dict matches a token by its exact text
# ('ORTH') and carries an operator ('OP'). Every patternN is paired with the
# regular expression re_patternN in the parametrized tests of this module.
pattern1 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
]
pattern2 = [
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'A', 'OP': '1'},
]
pattern3 = [
    {'ORTH': 'A', 'OP': '1'},
    {'ORTH': 'A', 'OP': '1'},
]
pattern4 = [
    {'ORTH': 'B', 'OP': '1'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]
pattern5 = [
    {'ORTH': 'B', 'OP': '*'},
    {'ORTH': 'A', 'OP': '*'},
    {'ORTH': 'B', 'OP': '1'},
]

# Character-level regular expressions used as the reference behavior.
re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'
@pytest.fixture
def text():
    """Provide the raw string that the regular expressions are run against."""
    return "(ABBAAAAAB)."
@pytest.fixture
def doc(en_tokenizer, text):
    """Tokenize ``text`` after separating its characters with single spaces.

    NOTE(review): presumably this makes every character its own token, so
    that token indices line up with character offsets in ``text`` -- confirm
    against the tokenizer's behavior.
    """
    spaced_out = ' '.join(text)
    return en_tokenizer(spaced_out)
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations.

    The spans reported by the Matcher on ``doc`` are compared, in order, with
    the spans ``re.finditer`` reports on ``text``.
    """
    token_matcher = Matcher(doc.vocab)
    token_matcher.add(re_pattern, None, pattern)
    found = token_matcher(doc)
    expected_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    # Pairs are compared positionally; zip stops at the shorter sequence, so
    # a difference in the number of matches is not detected here.
    for matched, expected in zip(found, expected_spans):
        # Drop the leading element of the match and compare the remainder
        # with the regex span tuple.
        assert matched[1:] == expected
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5),
])
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall.

    Only the number of matches is compared with the number of
    ``re.finditer`` hits on ``text``.
    """
    token_matcher = Matcher(doc.vocab)
    token_matcher.add(re_pattern, None, pattern)
    found = token_matcher(doc)
    expected_spans = [hit.span() for hit in re.finditer(re_pattern, text)]
    assert len(found) == len(expected_spans)
def test_operator_combos(en_vocab):
    """Check combinations of plain and '+' tokens against short documents.

    Each case is ``(string, pattern_str, result)``: ``string`` is split into
    one token per character, ``pattern_str`` is a space-separated pattern in
    which a trailing '+' marks a token with the '+' operator, and ``result``
    says whether the matcher is expected to return any match.
    """
    cases = [
        ('aaab', 'a a a b', True),
        ('aaab', 'a+ b', True),
        ('aaab', 'a+ a+ b', True),
        ('aaab', 'a+ a+ a b', True),
        ('aaab', 'a+ a+ a+ b', True),
        ('aaab', 'a+ a a b', True),
        ('aaab', 'a+ a a', True),
        ('aaab', 'a+', True),
        ('aaa', 'a+ b', False),
        ('aaa', 'a+ a+ b', False),
        ('aaa', 'a+ a+ a+ b', False),
        ('aaa', 'a+ a b', False),
        ('aaa', 'a+ a a b', False),
        # NOTE: the next two entries repeat cases already listed above.
        ('aaab', 'a+ a a', True),
        ('aaab', 'a+', True),
        ('aaab', 'a+ a b', True)
    ]
    for string, pattern_str, result in cases:
        # A fresh matcher per case keeps the patterns from accumulating.
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = [
            {'ORTH': part[0], 'OP': '+'} if part.endswith('+') else {'ORTH': part}
            for part in pattern_str.split()
        ]
        matcher.add('PATTERN', None, pattern)
        has_match = bool(matcher(doc))
        assert has_match is result, (string, pattern_str)
def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    matcher.add('TSTEND', None, [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}])

    def make_doc(string):
        # Build a Doc directly from whitespace-separated words.
        return Doc(matcher.vocab, words=string.split())

    # (input words, expected number of matches), checked in this order.
    expected_counts = [
        ('a', 1),
        ('a b', 2),
        ('a c', 1),
        ('a b c', 2),
        ('a b b c', 3),
        ('a b b', 3),
    ]
    for words, count in expected_counts:
        assert len(matcher(make_doc(words))) == count