diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index a6b02ba2c..6d40045ae 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -62,10 +62,10 @@ cdef enum action_t: REPEAT ACCEPT ADVANCE_ZERO + ADVANCE_PLUS ACCEPT_PREV PANIC -# A "match expression" conists of one or more token patterns # Each token pattern consists of a quantifier and 0+ (attr, value) pairs. # A state is an (int, pattern pointer) pair, where the int is the start # position, and the pattern pointer shows where we're up to @@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: if pattern.quantifier == ZERO: return REJECT elif lookahead.nr_attr == 0: - return ACCEPT + if pattern.quantifier == ZERO_PLUS: + return REPEAT + else: + return ACCEPT elif pattern.quantifier in (ONE, ZERO_ONE): return ADVANCE elif pattern.quantifier == ZERO_PLUS: @@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: if next_action is REJECT: return REPEAT else: - return ADVANCE_ZERO + return ADVANCE_PLUS else: return PANIC @@ -330,14 +333,26 @@ cdef class Matcher: cdef int i, token_i cdef const TokenC* token cdef StateC state + cdef int j = 0 + cdef int k + cdef bint add_match,overlap = False matches = [] + matches_dict = {} for token_i in range(doc.length): token = &doc.c[token_i] q = 0 # Go over the open matches, extending or finalizing if able. # Otherwise, we over-write them (q doesn't advance) - for state in partials: + #for state in partials: + j=0 + while j < n_partials: + state = partials[j] action = get_action(state.second, token) + j += 1 + # Skip patterns that would overlap with an existing match + ent_id = get_pattern_key(state.second) + if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + i = matches_dict[ent_id][2] + matches[i] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,i) + else: + pass partials.resize(q) + n_partials = q # Check whether we open any new patterns on this token for pattern in self.patterns: + # Skip patterns that would overlap with an existing match + ent_id = get_pattern_key(pattern) + if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + j = matches_dict[ent_id][2] + matches[j] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,j) + else: + pass + # Look for open patterns that are actually satisfied for state in partials: while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): @@ -399,8 +531,21 @@ cdef class Matcher: start = state.first end = len(doc) ent_id = state.second.attrs[0].value - label = state.second.attrs[0].value - matches.append((ent_id, start, end)) + # ent_id = get_pattern_key(state.second) + label = state.second.attrs[1].value + # matches.append((ent_id, start, end)) + if ent_id not in matches_dict: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start >= matches_dict[ent_id][1]: + matches_dict[ent_id] = (start,end,len(matches)) + matches.append((ent_id,start,end)) + elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]: + j = matches_dict[ent_id][2] + matches[j] = (ent_id,start,end) + matches_dict[ent_id] = (start,end,j) + else: + pass for i, (ent_id, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: