changed _c_find_near_matches_generic_linear_programming to use the new KMP
This commit is contained in:
parent
e1a7b0cc30
commit
76ee4e0715
File diff suppressed because it is too large
Load Diff
|
@ -2,8 +2,18 @@ from sys import maxint
|
||||||
import six
|
import six
|
||||||
from fuzzysearch.common import Match
|
from fuzzysearch.common import Match
|
||||||
from libc.stdlib cimport malloc, free, realloc
|
from libc.stdlib cimport malloc, free, realloc
|
||||||
from libc.string cimport strstr, strncpy
|
|
||||||
|
|
||||||
|
cdef extern from "kmp.h":
|
||||||
|
struct KMPstate:
|
||||||
|
pass # no need to specify the fields if they aren't accessed directly
|
||||||
|
|
||||||
|
void preKMP(const char *subsequence, int subsequence_len, int *kmpNext)
|
||||||
|
|
||||||
|
KMPstate KMP_init(const char *subseq, int subseq_len,
|
||||||
|
const char *seq, int seq_len,
|
||||||
|
int *kmpNext)
|
||||||
|
|
||||||
|
const char* KMP_find_next(KMPstate *kmp_state)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'c_find_near_matches_generic_linear_programming',
|
'c_find_near_matches_generic_linear_programming',
|
||||||
|
@ -47,8 +57,8 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
||||||
if not subsequence:
|
if not subsequence:
|
||||||
raise ValueError('Given subsequence is empty!')
|
raise ValueError('Given subsequence is empty!')
|
||||||
|
|
||||||
c_subsequence = <char *>subsequence
|
cdef const char *c_subsequence = subsequence
|
||||||
c_sequence = <char *>sequence
|
cdef const char *c_sequence = sequence
|
||||||
|
|
||||||
return _c_find_near_matches_generic_linear_programming(
|
return _c_find_near_matches_generic_linear_programming(
|
||||||
c_subsequence, len(subsequence),
|
c_subsequence, len(subsequence),
|
||||||
|
@ -59,9 +69,12 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
|
||||||
max_l_dist if max_l_dist is not None else (1<<29),
|
max_l_dist if max_l_dist is not None else (1<<29),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _c_find_near_matches_generic_linear_programming(
|
# The following MUST be a cdef, otherwise Cython copies the sequence and
|
||||||
char* subsequence, size_t subseq_len,
|
# subsequence strings, which means if they contain null bytes the data after
|
||||||
char* sequence, size_t seq_len,
|
# the first null byte will not be copied.
|
||||||
|
cdef _c_find_near_matches_generic_linear_programming(
|
||||||
|
const char* subsequence, size_t subseq_len,
|
||||||
|
const char* sequence, size_t seq_len,
|
||||||
unsigned int max_substitutions,
|
unsigned int max_substitutions,
|
||||||
unsigned int max_insertions,
|
unsigned int max_insertions,
|
||||||
unsigned int max_deletions,
|
unsigned int max_deletions,
|
||||||
|
@ -90,11 +103,12 @@ def _c_find_near_matches_generic_linear_programming(
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
cdef size_t index
|
cdef size_t index
|
||||||
cdef char charchar
|
cdef char seq_char
|
||||||
|
|
||||||
try:
|
try:
|
||||||
index = 0
|
index = 0
|
||||||
have_realloced = False
|
have_realloced = False
|
||||||
for charchar in sequence[:seq_len]:
|
for seq_char in sequence[:seq_len]:
|
||||||
candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
|
candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
|
||||||
n_candidates += 1
|
n_candidates += 1
|
||||||
|
|
||||||
|
@ -110,7 +124,7 @@ def _c_find_near_matches_generic_linear_programming(
|
||||||
have_realloced = True
|
have_realloced = True
|
||||||
|
|
||||||
# if this sequence char is the candidate's next expected char
|
# if this sequence char is the candidate's next expected char
|
||||||
if charchar == subsequence[cand.subseq_index]:
|
if seq_char == subsequence[cand.subseq_index]:
|
||||||
# if reached the end of the subsequence, return a match
|
# if reached the end of the subsequence, return a match
|
||||||
if cand.subseq_index == subseq_len_minus_one:
|
if cand.subseq_index == subseq_len_minus_one:
|
||||||
matches.append(Match(cand.start, index + 1, cand.l_dist))
|
matches.append(Match(cand.start, index + 1, cand.l_dist))
|
||||||
|
@ -181,7 +195,7 @@ def _c_find_near_matches_generic_linear_programming(
|
||||||
# otherwise, if skipping n_skipped sub-sequence chars
|
# otherwise, if skipping n_skipped sub-sequence chars
|
||||||
# reaches a sub-sequence char identical to this sequence
|
# reaches a sub-sequence char identical to this sequence
|
||||||
# char ...
|
# char ...
|
||||||
elif charchar == subsequence[cand.subseq_index + n_skipped]:
|
elif seq_char == subsequence[cand.subseq_index + n_skipped]:
|
||||||
# if this is the last char of the sub-sequence, yield
|
# if this is the last char of the sub-sequence, yield
|
||||||
# a match
|
# a match
|
||||||
if cand.subseq_index + n_skipped + 1 == subseq_len:
|
if cand.subseq_index + n_skipped + 1 == subseq_len:
|
||||||
|
@ -269,35 +283,32 @@ def c_find_near_matches_generic_ngrams(subsequence, sequence,
|
||||||
c_max_substitutions + c_max_insertions + c_max_deletions,
|
c_max_substitutions + c_max_insertions + c_max_deletions,
|
||||||
)
|
)
|
||||||
|
|
||||||
cdef char* c_sequence = sequence
|
cdef const char* c_sequence = sequence
|
||||||
cdef char* c_subsequence = subsequence
|
cdef const char* c_subsequence = subsequence
|
||||||
cdef char* ngram_str
|
|
||||||
|
|
||||||
cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
|
cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
|
||||||
if ngram_len == 0:
|
if ngram_len == 0:
|
||||||
raise ValueError('the subsequence length must be greater than max_l_dist')
|
raise ValueError('the subsequence length must be greater than max_l_dist')
|
||||||
|
|
||||||
ngram_str = <char *> malloc((ngram_len + 1) * sizeof(char))
|
|
||||||
if ngram_str is NULL:
|
|
||||||
raise MemoryError()
|
|
||||||
|
|
||||||
cdef int index, small_search_start_index
|
cdef int index, small_search_start_index
|
||||||
cdef size_t ngram_start
|
cdef size_t ngram_start
|
||||||
cdef char *match_ptr
|
|
||||||
|
|
||||||
matches = []
|
cdef const char *match_ptr
|
||||||
|
cdef int *kmpNext
|
||||||
|
cdef KMPstate kmp_state
|
||||||
|
kmpNext = <int *> malloc(ngram_len * sizeof(int))
|
||||||
|
if kmpNext is NULL:
|
||||||
|
raise MemoryError()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ngram_str[ngram_len] = 0
|
matches = []
|
||||||
|
|
||||||
for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
|
for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
|
||||||
strncpy(ngram_str, c_subsequence + ngram_start, ngram_len)
|
preKMP(c_subsequence + ngram_start, ngram_len, kmpNext)
|
||||||
|
|
||||||
# TODO: handle null characters properly!
|
kmp_state = KMP_init(c_subsequence + ngram_start, ngram_len, c_sequence, _seq_len, kmpNext)
|
||||||
match_ptr = strstr(c_sequence, ngram_str)
|
match_ptr = KMP_find_next(&kmp_state)
|
||||||
while match_ptr != NULL:
|
while match_ptr != NULL:
|
||||||
index = (match_ptr - c_sequence)
|
small_search_start_index = (match_ptr - c_sequence) - ngram_start - c_max_l_dist
|
||||||
small_search_start_index = index - ngram_start - c_max_l_dist
|
|
||||||
small_search_length = _subseq_len + (2 * c_max_l_dist)
|
small_search_length = _subseq_len + (2 * c_max_l_dist)
|
||||||
if small_search_start_index < 0:
|
if small_search_start_index < 0:
|
||||||
small_search_length += small_search_start_index
|
small_search_length += small_search_start_index
|
||||||
|
@ -315,9 +326,9 @@ def c_find_near_matches_generic_ngrams(subsequence, sequence,
|
||||||
start=match.start + small_search_start_index,
|
start=match.start + small_search_start_index,
|
||||||
end=match.end + small_search_start_index,
|
end=match.end + small_search_start_index,
|
||||||
))
|
))
|
||||||
match_ptr = strstr(match_ptr + 1, ngram_str)
|
match_ptr = KMP_find_next(&kmp_state)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
free(ngram_str)
|
free(kmpNext)
|
||||||
|
|
||||||
return matches
|
return matches
|
3
setup.py
3
setup.py
|
@ -27,7 +27,8 @@ _common_module = Extension(
|
||||||
)
|
)
|
||||||
_generic_search_module = Extension(
|
_generic_search_module = Extension(
|
||||||
'fuzzysearch._generic_search',
|
'fuzzysearch._generic_search',
|
||||||
sources=['fuzzysearch/_generic_search.c'],
|
sources=['fuzzysearch/_generic_search.c', 'fuzzysearch/kmp.c'],
|
||||||
|
include_dirs=['.'],
|
||||||
)
|
)
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
|
|
|
@ -130,6 +130,17 @@ class TestGenericSearchBase(object):
|
||||||
[Match(start=3, end=5, dist=1)],
|
[Match(start=3, end=5, dist=1)],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_null_bytes(self):
|
||||||
|
self.assertEqual(
|
||||||
|
self.search('abc', 'xx\0abcxx', 0, 0, 0, 0),
|
||||||
|
[Match(start=3, end=6, dist=0)],
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
self.search('a\0b', 'xxa\0bcxx', 0, 0, 0, 0),
|
||||||
|
[Match(start=2, end=5, dist=0)],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestGenericSearch(TestGenericSearchBase, unittest.TestCase):
|
class TestGenericSearch(TestGenericSearchBase, unittest.TestCase):
|
||||||
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
def search(self, pattern, sequence, max_subs, max_ins, max_dels,
|
||||||
|
|
Loading…
Reference in New Issue