diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index a45597b28..5c52ae9d0 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -138,7 +138,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: def _convert_strings(token_specs, string_store): # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,)} + '?': (ZERO_ONE,), '1': (ONE,)} tokens = [] op = ONE for spec in token_specs: @@ -150,7 +150,7 @@ def _convert_strings(token_specs, string_store): ops = operators[value] else: raise KeyError( - "Unknown operator. Options: %s" % ', '.join(operators.keys())) + "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys()))) if isinstance(attr, basestring): attr = attrs.IDS.get(attr.upper()) if isinstance(value, basestring): @@ -418,6 +418,22 @@ cdef class Matcher: match = acceptor(doc, ent_id, label, start, end) if match: matches.append(match) + # Look for open patterns that are actually satisfied + for state in partials: + while state.second.quantifier in (ZERO, ZERO_PLUS): + state.second += 1 + if state.second.nr_attr == 0: + start = state.first + end = len(doc) + ent_id = state.second.attrs[0].value + label = state.second.attrs[0].value + acceptor = self._acceptors.get(ent_id) + if acceptor is None: + matches.append((ent_id, label, start, end)) + else: + match = acceptor(doc, ent_id, label, start, end) + if match: + matches.append(match) for i, (ent_id, label, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: diff --git a/spacy/tests/matcher/test_matcher.py b/spacy/tests/matcher/test_matcher.py index 95ef930ea..7c9c4ddfe 100644 --- a/spacy/tests/matcher/test_matcher.py +++ b/spacy/tests/matcher/test_matcher.py @@ -105,3 +105,21 @@ def test_matcher_match_zero_plus(matcher): matcher.add('Quote', '', {}, [pattern]) doc = get_doc(matcher.vocab, words) assert 
len(matcher(doc)) == 1  # operand of the `assert` that ends the previous line


def test_matcher_match_one_plus(matcher):
    """Compare a plain single-token pattern with a '1' followed by '+' pattern.

    A freshly built control matcher holding a single-token pattern is run
    over a two-token document and must report two matches. The ``matcher``
    argument then receives a two-step pattern ('1' followed by '+') over the
    same token text and must report exactly one match on that document.
    """
    word = 'Philippe'

    # Control: one-token pattern, no operator.
    baseline = Matcher(matcher.vocab)
    baseline.add_pattern('BasicPhilippe', [{'ORTH': word}], label=321)

    doc = get_doc(baseline.vocab, [word, word])

    baseline_matches = baseline(doc)
    assert len(baseline_matches) == 2

    # Pattern under test: explicit '1' operator, then '+'.
    kleene_pattern = [
        {'ORTH': word, 'OP': '1'},
        {'ORTH': word, 'OP': '+'},
    ]
    matcher.add_pattern('KleenePhilippe', kleene_pattern, label=321)

    kleene_matches = matcher(doc)
    assert len(kleene_matches) == 1