Correct bugs for greedy matching and introduce ADVANCE_PLUS action

This commit is contained in:
greg 2018-01-16 13:21:43 -05:00
parent f246fab0c1
commit 8bea62f26e
1 changed files with 157 additions and 12 deletions

View File

@ -62,10 +62,10 @@ cdef enum action_t:
REPEAT
ACCEPT
ADVANCE_ZERO
ADVANCE_PLUS
ACCEPT_PREV
PANIC
# A "match expression" conists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
if pattern.quantifier == ZERO:
return REJECT
elif lookahead.nr_attr == 0:
return ACCEPT
if pattern.quantifier == ZERO_PLUS:
return REPEAT
else:
return ACCEPT
elif pattern.quantifier in (ONE, ZERO_ONE):
return ADVANCE
elif pattern.quantifier == ZERO_PLUS:
@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
if next_action is REJECT:
return REPEAT
else:
return ADVANCE_ZERO
return ADVANCE_PLUS
else:
return PANIC
@ -330,14 +333,26 @@ cdef class Matcher:
cdef int i, token_i
cdef const TokenC* token
cdef StateC state
cdef int j = 0
cdef int k
cdef bint add_match,overlap = False
matches = []
matches_dict = {}
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
#for state in partials:
j=0
while j < n_partials:
state = partials[j]
action = get_action(state.second, token)
j += 1
# Skip patterns that would overlap with an existing match
ent_id = get_pattern_key(state.second)
if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first<matches_dict[ent_id][1]:
continue
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
@ -345,52 +360,169 @@ cdef class Matcher:
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
if action == ADVANCE_PLUS:
state.second += 1
partials.push_back(state)
n_partials += 1
state.second -= 1
action = REPEAT
if action == ADVANCE:
state.second += 1
overlap=False
for i in range(q):
if ent_id != get_pattern_key(partials[i].second):
continue
if state.second == partials[i].second and state.first < partials[i].first:
partials[i] = state
j = i
overlap = True
break
if overlap:
continue
overlap=False
for i in range(q):
if ent_id != get_pattern_key(partials[i].second):
continue
if state.second == partials[i].second:
overlap = True
break
if overlap:
continue
# overlap=False
# for i in range(q):
# if state.second == partials[i].second:
# if state.first < partials[i].first:
# partials[i] = state
# j = i-1
# else:
# overlap=True
# break
# if overlap:
# continue
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
partials[q] = state
q += 1
elif action == REJECT:
pass
elif action == ADVANCE:
partials[q] = state
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
# ent_id = state.second[1].attrs[0].value
# ent_id = get_pattern_key(state.second)
label = state.second[1].attrs[1].value
matches.append((ent_id, start, end))
# matches.append((ent_id, start, end))
# Check that this match doesn't overlap with an earlier match.
# Only overwrite an earlier match if it is a substring of this
# match.
if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start >= matches_dict[ent_id][1]:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
i = matches_dict[ent_id][2]
matches[i] = (ent_id,start,end)
matches_dict[ent_id] = (start,end,i)
else:
pass
partials.resize(q)
n_partials = q
# Check whether we open any new patterns on this token
for pattern in self.patterns:
# Skip patterns that would overlap with an existing match
ent_id = get_pattern_key(pattern)
if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i<matches_dict[ent_id][1]:
continue
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
# while acton == ADVANCE_ZERO:
# pattern += 1
# action = get_action(pattern,token)
# if action == PANIC:
# raise Exception("Error selecting action in matcher")
while action in (ADVANCE_PLUS,ADVANCE_ZERO):
if action == ADVANCE_PLUS:
# j=0
# overlap = False
# for j in range(q):
# if pattern == partials[j].second:
# overlap = True
# break
# if overlap:
# pattern += 1
# action = get_action(pattern, token)
# continue
state.first = token_i
state.second = pattern
partials.push_back(state)
n_partials += 1
pattern += 1
action = get_action(pattern, token)
if action == ADVANCE:
pattern += 1
j=0
overlap = False
for j in range(q):
if ent_id == get_pattern_key(partials[j].second):
continue
if pattern == partials[j].second:
overlap = True
break
if overlap:
continue
if action == REPEAT:
state.first = token_i
state.second = pattern
partials.push_back(state)
n_partials += 1
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
state.second = pattern
partials.push_back(state)
n_partials += 1
elif action in (ACCEPT, ACCEPT_PREV):
start = token_i
end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value
# ent_id = get_pattern_key(state.second)
label = pattern[1].attrs[1].value
matches.append((ent_id, start, end))
if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start >= matches_dict[ent_id][1]:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
j = matches_dict[ent_id][2]
matches[j] = (ent_id,start,end)
matches_dict[ent_id] = (start,end,j)
else:
pass
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
@ -399,8 +531,21 @@ cdef class Matcher:
start = state.first
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
matches.append((ent_id, start, end))
# ent_id = get_pattern_key(state.second)
label = state.second.attrs[1].value
# matches.append((ent_id, start, end))
if ent_id not in matches_dict:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start >= matches_dict[ent_id][1]:
matches_dict[ent_id] = (start,end,len(matches))
matches.append((ent_id,start,end))
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
j = matches_dict[ent_id][2]
matches[j] = (ent_id,start,end)
matches_dict[ent_id] = (start,end,j)
else:
pass
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None: