mirror of https://github.com/explosion/spaCy.git
Correct bugs for greedy matching and introduce ADVANCE_PLUS action
This commit is contained in:
parent
f246fab0c1
commit
8bea62f26e
|
@ -62,10 +62,10 @@ cdef enum action_t:
|
||||||
REPEAT
|
REPEAT
|
||||||
ACCEPT
|
ACCEPT
|
||||||
ADVANCE_ZERO
|
ADVANCE_ZERO
|
||||||
|
ADVANCE_PLUS
|
||||||
ACCEPT_PREV
|
ACCEPT_PREV
|
||||||
PANIC
|
PANIC
|
||||||
|
|
||||||
# A "match expression" conists of one or more token patterns
|
|
||||||
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
||||||
# A state is an (int, pattern pointer) pair, where the int is the start
|
# A state is an (int, pattern pointer) pair, where the int is the start
|
||||||
# position, and the pattern pointer shows where we're up to
|
# position, and the pattern pointer shows where we're up to
|
||||||
|
@ -128,7 +128,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
if pattern.quantifier == ZERO:
|
if pattern.quantifier == ZERO:
|
||||||
return REJECT
|
return REJECT
|
||||||
elif lookahead.nr_attr == 0:
|
elif lookahead.nr_attr == 0:
|
||||||
return ACCEPT
|
if pattern.quantifier == ZERO_PLUS:
|
||||||
|
return REPEAT
|
||||||
|
else:
|
||||||
|
return ACCEPT
|
||||||
elif pattern.quantifier in (ONE, ZERO_ONE):
|
elif pattern.quantifier in (ONE, ZERO_ONE):
|
||||||
return ADVANCE
|
return ADVANCE
|
||||||
elif pattern.quantifier == ZERO_PLUS:
|
elif pattern.quantifier == ZERO_PLUS:
|
||||||
|
@ -138,7 +141,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
if next_action is REJECT:
|
if next_action is REJECT:
|
||||||
return REPEAT
|
return REPEAT
|
||||||
else:
|
else:
|
||||||
return ADVANCE_ZERO
|
return ADVANCE_PLUS
|
||||||
else:
|
else:
|
||||||
return PANIC
|
return PANIC
|
||||||
|
|
||||||
|
@ -330,14 +333,26 @@ cdef class Matcher:
|
||||||
cdef int i, token_i
|
cdef int i, token_i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef StateC state
|
cdef StateC state
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
|
cdef bint add_match,overlap = False
|
||||||
matches = []
|
matches = []
|
||||||
|
matches_dict = {}
|
||||||
for token_i in range(doc.length):
|
for token_i in range(doc.length):
|
||||||
token = &doc.c[token_i]
|
token = &doc.c[token_i]
|
||||||
q = 0
|
q = 0
|
||||||
# Go over the open matches, extending or finalizing if able.
|
# Go over the open matches, extending or finalizing if able.
|
||||||
# Otherwise, we over-write them (q doesn't advance)
|
# Otherwise, we over-write them (q doesn't advance)
|
||||||
for state in partials:
|
#for state in partials:
|
||||||
|
j=0
|
||||||
|
while j < n_partials:
|
||||||
|
state = partials[j]
|
||||||
action = get_action(state.second, token)
|
action = get_action(state.second, token)
|
||||||
|
j += 1
|
||||||
|
# Skip patterns that would overlap with an existing match
|
||||||
|
ent_id = get_pattern_key(state.second)
|
||||||
|
if ent_id in matches_dict and state.first>matches_dict[ent_id][0] and state.first<matches_dict[ent_id][1]:
|
||||||
|
continue
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
while action == ADVANCE_ZERO:
|
while action == ADVANCE_ZERO:
|
||||||
|
@ -346,51 +361,168 @@ cdef class Matcher:
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
|
|
||||||
|
|
||||||
|
if action == ADVANCE_PLUS:
|
||||||
|
state.second += 1
|
||||||
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
|
state.second -= 1
|
||||||
|
action = REPEAT
|
||||||
|
|
||||||
|
if action == ADVANCE:
|
||||||
|
state.second += 1
|
||||||
|
|
||||||
|
overlap=False
|
||||||
|
for i in range(q):
|
||||||
|
if ent_id != get_pattern_key(partials[i].second):
|
||||||
|
continue
|
||||||
|
if state.second == partials[i].second and state.first < partials[i].first:
|
||||||
|
partials[i] = state
|
||||||
|
j = i
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
overlap=False
|
||||||
|
for i in range(q):
|
||||||
|
if ent_id != get_pattern_key(partials[i].second):
|
||||||
|
continue
|
||||||
|
if state.second == partials[i].second:
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# overlap=False
|
||||||
|
# for i in range(q):
|
||||||
|
# if state.second == partials[i].second:
|
||||||
|
# if state.first < partials[i].first:
|
||||||
|
# partials[i] = state
|
||||||
|
# j = i-1
|
||||||
|
# else:
|
||||||
|
# overlap=True
|
||||||
|
# break
|
||||||
|
# if overlap:
|
||||||
|
# continue
|
||||||
|
|
||||||
|
|
||||||
if action == REPEAT:
|
if action == REPEAT:
|
||||||
# Leave the state in the queue, and advance to next slot
|
# Leave the state in the queue, and advance to next slot
|
||||||
# (i.e. we don't overwrite -- we want to greedily match
|
# (i.e. we don't overwrite -- we want to greedily match
|
||||||
# more pattern.
|
# more pattern.
|
||||||
|
partials[q] = state
|
||||||
q += 1
|
q += 1
|
||||||
elif action == REJECT:
|
elif action == REJECT:
|
||||||
pass
|
pass
|
||||||
elif action == ADVANCE:
|
elif action == ADVANCE:
|
||||||
partials[q] = state
|
partials[q] = state
|
||||||
partials[q].second += 1
|
|
||||||
q += 1
|
q += 1
|
||||||
elif action in (ACCEPT, ACCEPT_PREV):
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
# TODO: What to do about patterns starting with ZERO? Need
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# to adjust the start position.
|
# to adjust the start position.
|
||||||
start = state.first
|
start = state.first
|
||||||
end = token_i+1 if action == ACCEPT else token_i
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
ent_id = state.second[1].attrs[0].value
|
# ent_id = state.second[1].attrs[0].value
|
||||||
|
# ent_id = get_pattern_key(state.second)
|
||||||
label = state.second[1].attrs[1].value
|
label = state.second[1].attrs[1].value
|
||||||
matches.append((ent_id, start, end))
|
# matches.append((ent_id, start, end))
|
||||||
|
# Check that this match doesn't overlap with an earlier match.
|
||||||
|
# Only overwrite an earlier match if it is a substring of this
|
||||||
|
# match.
|
||||||
|
|
||||||
|
if ent_id not in matches_dict:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start >= matches_dict[ent_id][1]:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
|
||||||
|
i = matches_dict[ent_id][2]
|
||||||
|
matches[i] = (ent_id,start,end)
|
||||||
|
matches_dict[ent_id] = (start,end,i)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
partials.resize(q)
|
partials.resize(q)
|
||||||
|
n_partials = q
|
||||||
# Check whether we open any new patterns on this token
|
# Check whether we open any new patterns on this token
|
||||||
for pattern in self.patterns:
|
for pattern in self.patterns:
|
||||||
|
# Skip patterns that would overlap with an existing match
|
||||||
|
ent_id = get_pattern_key(pattern)
|
||||||
|
if ent_id in matches_dict and token_i>matches_dict[ent_id][0] and token_i<matches_dict[ent_id][1]:
|
||||||
|
continue
|
||||||
action = get_action(pattern, token)
|
action = get_action(pattern, token)
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
while action == ADVANCE_ZERO:
|
# while acton == ADVANCE_ZERO:
|
||||||
|
# pattern += 1
|
||||||
|
# action = get_action(pattern,token)
|
||||||
|
# if action == PANIC:
|
||||||
|
# raise Exception("Error selecting action in matcher")
|
||||||
|
while action in (ADVANCE_PLUS,ADVANCE_ZERO):
|
||||||
|
if action == ADVANCE_PLUS:
|
||||||
|
# j=0
|
||||||
|
# overlap = False
|
||||||
|
# for j in range(q):
|
||||||
|
# if pattern == partials[j].second:
|
||||||
|
# overlap = True
|
||||||
|
# break
|
||||||
|
# if overlap:
|
||||||
|
# pattern += 1
|
||||||
|
# action = get_action(pattern, token)
|
||||||
|
# continue
|
||||||
|
state.first = token_i
|
||||||
|
state.second = pattern
|
||||||
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
pattern += 1
|
pattern += 1
|
||||||
action = get_action(pattern, token)
|
action = get_action(pattern, token)
|
||||||
|
|
||||||
|
if action == ADVANCE:
|
||||||
|
pattern += 1
|
||||||
|
j=0
|
||||||
|
overlap = False
|
||||||
|
for j in range(q):
|
||||||
|
if ent_id == get_pattern_key(partials[j].second):
|
||||||
|
continue
|
||||||
|
if pattern == partials[j].second:
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
if action == REPEAT:
|
if action == REPEAT:
|
||||||
state.first = token_i
|
state.first = token_i
|
||||||
state.second = pattern
|
state.second = pattern
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
elif action == ADVANCE:
|
elif action == ADVANCE:
|
||||||
# TODO: What to do about patterns starting with ZERO? Need
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# to adjust the start position.
|
# to adjust the start position.
|
||||||
state.first = token_i
|
state.first = token_i
|
||||||
state.second = pattern + 1
|
state.second = pattern
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
elif action in (ACCEPT, ACCEPT_PREV):
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
start = token_i
|
start = token_i
|
||||||
end = token_i+1 if action == ACCEPT else token_i
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
ent_id = pattern[1].attrs[0].value
|
ent_id = pattern[1].attrs[0].value
|
||||||
|
# ent_id = get_pattern_key(state.second)
|
||||||
label = pattern[1].attrs[1].value
|
label = pattern[1].attrs[1].value
|
||||||
matches.append((ent_id, start, end))
|
if ent_id not in matches_dict:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start >= matches_dict[ent_id][1]:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
|
||||||
|
j = matches_dict[ent_id][2]
|
||||||
|
matches[j] = (ent_id,start,end)
|
||||||
|
matches_dict[ent_id] = (start,end,j)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
# Look for open patterns that are actually satisfied
|
# Look for open patterns that are actually satisfied
|
||||||
for state in partials:
|
for state in partials:
|
||||||
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
||||||
|
@ -399,8 +531,21 @@ cdef class Matcher:
|
||||||
start = state.first
|
start = state.first
|
||||||
end = len(doc)
|
end = len(doc)
|
||||||
ent_id = state.second.attrs[0].value
|
ent_id = state.second.attrs[0].value
|
||||||
label = state.second.attrs[0].value
|
# ent_id = get_pattern_key(state.second)
|
||||||
matches.append((ent_id, start, end))
|
label = state.second.attrs[1].value
|
||||||
|
# matches.append((ent_id, start, end))
|
||||||
|
if ent_id not in matches_dict:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start >= matches_dict[ent_id][1]:
|
||||||
|
matches_dict[ent_id] = (start,end,len(matches))
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start <= matches_dict[ent_id][0] and end>=matches_dict[ent_id][1]:
|
||||||
|
j = matches_dict[ent_id][2]
|
||||||
|
matches[j] = (ent_id,start,end)
|
||||||
|
matches_dict[ent_id] = (start,end,j)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
for i, (ent_id, start, end) in enumerate(matches):
|
for i, (ent_id, start, end) in enumerate(matches):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
|
|
Loading…
Reference in New Issue