Correct bugs for greedy matching and introduce ADVANCE_PLUS action

2018-01-16 13:21:43 -05:00 · 2018-01-16 13:21:43 -05:00 · 8bea62f26e
parent f246fab0c1
commit 8bea62f26e
1 changed files with 157 additions and 12 deletions
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -62,10 +62,10 @@ cdef enum action_t:
    REPEAT
    ACCEPT
    ADVANCE_ZERO
+    ADVANCE_PLUS
    ACCEPT_PREV
    PANIC

-# A "match expression" conists of one or more token patterns
 # Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
 # A state is an (int, pattern pointer) pair, where the int is the start
 # position, and the pattern pointer shows where we're up to
@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
    if pattern.quantifier == ZERO:
        return REJECT
    elif lookahead.nr_attr == 0:
-        return ACCEPT
+        if pattern.quantifier == ZERO_PLUS:
+            return REPEAT
+        else:
+            return ACCEPT
    elif pattern.quantifier in (ONE, ZERO_ONE):
        return ADVANCE
    elif pattern.quantifier == ZERO_PLUS:
@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
        if next_action is REJECT:
            return REPEAT
        else:
-            return ADVANCE_ZERO
+            return ADVANCE_PLUS
    else:
        return PANIC

@ -330,14 +333,26 @@ cdef class Matcher:
        cdef int i, token_i
        cdef const TokenC* token
        cdef StateC state
+        cdef int j = 0
+        cdef int k
+        cdef bint add_match,overlap = False
        matches = []
+        matches_dict = {}
        for token_i in range(doc.length):
            token = &doc.c[token_i]
            q = 0
            # Go over the open matches, extending or finalizing if able.
            # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
+            #for state in partials:
+            j=0
+            while j < n_partials:
+                state = partials[j]
                action = get_action(state.second, token)
+                j += 1
+                # Skip patterns that would overlap with an existing match
+                ent_id = get_pattern_key(state.second)
+                if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first<matches_dict[ent_id][1]:
+                    continue
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                while action == ADVANCE_ZERO:
@ -346,51 +361,168 @@ cdef class Matcher:
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                
+
+                if action == ADVANCE_PLUS:
+                    state.second += 1
+                    partials.push_back(state)
+                    n_partials += 1
+                    state.second -= 1
+                    action = REPEAT
+
+                if action == ADVANCE:
+                    state.second += 1
+
+                overlap=False
+                for i in range(q):
+                    if ent_id != get_pattern_key(partials[i].second):
+                        continue
+                    if state.second == partials[i].second and state.first < partials[i].first:
+                        partials[i] = state
+                        j = i
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+                overlap=False
+                for i in range(q):
+                    if ent_id != get_pattern_key(partials[i].second):
+                        continue
+                    if state.second == partials[i].second:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+                # overlap=False
+                # for i in range(q):
+                #     if state.second == partials[i].second:
+                #         if state.first < partials[i].first:
+                #             partials[i] = state
+                #             j = i-1
+                #         else:
+                #             overlap=True
+                #         break
+                # if overlap:
+                #     continue
+
+    
                if action == REPEAT:
                    # Leave the state in the queue, and advance to next slot
                    # (i.e. we don't overwrite -- we want to greedily match
                    # more pattern.
+                    partials[q] = state
                    q += 1
                elif action == REJECT:
                    pass
                elif action == ADVANCE:
                    partials[q] = state
-                    partials[q].second += 1
                    q += 1
                elif action in (ACCEPT, ACCEPT_PREV):
                    # TODO: What to do about patterns starting with ZERO? Need
                    # to adjust the start position.
                    start = state.first
                    end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
+                    # ent_id = state.second[1].attrs[0].value
+                    # ent_id = get_pattern_key(state.second)
                    label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    # matches.append((ent_id, start, end))
+                    # Check that this match doesn't overlap with an earlier match.
+                    # Only overwrite an earlier match if it is a substring of this
+                    # match.
+
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        i = matches_dict[ent_id][2]
+                        matches[i] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,i)
+                    else:
+                        pass

            partials.resize(q)
+            n_partials = q
            # Check whether we open any new patterns on this token
            for pattern in self.patterns:
+                # Skip patterns that would overlap with an existing match
+                ent_id = get_pattern_key(pattern)
+                if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i<matches_dict[ent_id][1]:
+                    continue
                action = get_action(pattern, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
+                # while acton == ADVANCE_ZERO:
+                #     pattern += 1
+                #     action = get_action(pattern,token)
+                # if action == PANIC:
+                #     raise Exception("Error selecting action in matcher")
+                while action in (ADVANCE_PLUS,ADVANCE_ZERO):
+                    if action == ADVANCE_PLUS:
+                        # j=0
+                        # overlap = False
+                        # for j in range(q):
+                        #     if pattern == partials[j].second:
+                        #         overlap = True
+                        #         break
+                        # if overlap:
+                        #     pattern += 1
+                        #     action = get_action(pattern, token)
+                        #     continue
+                        state.first = token_i
+                        state.second = pattern
+                        partials.push_back(state)
+                        n_partials += 1
                    pattern += 1
                    action = get_action(pattern, token)
+
+                if action == ADVANCE:
+                    pattern += 1
+                j=0
+                overlap = False
+                for j in range(q):
+                    if ent_id == get_pattern_key(partials[j].second):
+                        continue
+                    if pattern == partials[j].second:
+                        overlap = True
+                        break
+                if overlap:
+                    continue
+
+
                if action == REPEAT:
                    state.first = token_i
                    state.second = pattern
                    partials.push_back(state)
+                    n_partials += 1
                elif action == ADVANCE:
                    # TODO: What to do about patterns starting with ZERO? Need
                    # to adjust the start position.
                    state.first = token_i
-                    state.second = pattern + 1
+                    state.second = pattern
                    partials.push_back(state)
+                    n_partials += 1
                elif action in (ACCEPT, ACCEPT_PREV):
                    start = token_i
                    end = token_i+1 if action == ACCEPT else token_i
                    ent_id = pattern[1].attrs[0].value
+                    # ent_id = get_pattern_key(state.second)
                    label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        j = matches_dict[ent_id][2]
+                        matches[j] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,j)
+                    else:
+                        pass
+
        # Look for open patterns that are actually satisfied
        for state in partials:
            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
@ -399,8 +531,21 @@ cdef class Matcher:
                    start = state.first
                    end = len(doc)
                    ent_id = state.second.attrs[0].value
-                    label = state.second.attrs[0].value
-                    matches.append((ent_id, start, end))
+                    # ent_id = get_pattern_key(state.second)
+                    label = state.second.attrs[1].value
+                    # matches.append((ent_id, start, end))
+                    if ent_id not in matches_dict:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start >= matches_dict[ent_id][1]:
+                        matches_dict[ent_id] = (start,end,len(matches))
+                        matches.append((ent_id,start,end))
+                    elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
+                        j = matches_dict[ent_id][2]
+                        matches[j] = (ent_id,start,end)
+                        matches_dict[ent_id] = (start,end,j)
+                    else:
+                        pass
        for i, (ent_id, start, end) in enumerate(matches):
            on_match = self._callbacks.get(ent_id)
            if on_match is not None: