Add the '1' operator (mapped to ONE) to the matcher, and make sure open patterns are closed at the end of the document. Closes Issue #766

This commit is contained in:
Matthew Honnibal 2017-02-24 14:27:02 +01:00
parent f028f8ad28
commit 8f94897d07
2 changed files with 36 additions and 2 deletions

View File

@ -138,7 +138,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,)}
'?': (ZERO_ONE,), '1': (ONE,)}
tokens = []
op = ONE
for spec in token_specs:
@ -150,7 +150,7 @@ def _convert_strings(token_specs, string_store):
ops = operators[value]
else:
raise KeyError(
"Unknown operator. Options: %s" % ', '.join(operators.keys()))
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
if isinstance(value, basestring):
@ -418,6 +418,22 @@ cdef class Matcher:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_PLUS):
state.second += 1
if state.second.nr_attr == 0:
start = state.first
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
for i, (ent_id, label, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:

View File

@ -105,3 +105,21 @@ def test_matcher_match_zero_plus(matcher):
matcher.add('Quote', '', {}, [pattern])
doc = get_doc(matcher.vocab, words)
assert len(matcher(doc)) == 1
def test_matcher_match_one_plus(matcher):
    """Check a '1' token followed by a '+' token on a two-word document.

    A control matcher with a plain single-token pattern is run first and
    must report two matches on ``['Philippe', 'Philippe']``. The fixture
    matcher then gets a two-token pattern using the '1' and '+' operators
    and must report exactly one match on the same document.
    """
    words = ['Philippe', 'Philippe']

    # Control: single-token pattern with no operator.
    baseline = Matcher(matcher.vocab)
    single_token_pattern = [{'ORTH': 'Philippe'}]
    baseline.add_pattern('BasicPhilippe', single_token_pattern, label=321)
    doc = get_doc(baseline.vocab, words)
    baseline_matches = baseline(doc)
    assert len(baseline_matches) == 2

    # Pattern under test: one 'Philippe' ('1'), then one or more ('+').
    one_then_plus_pattern = [
        {'ORTH': 'Philippe', 'OP': '1'},
        {'ORTH': 'Philippe', 'OP': '+'},
    ]
    matcher.add_pattern('KleenePhilippe', one_then_plus_pattern, label=321)
    kleene_matches = matcher(doc)
    assert len(kleene_matches) == 1