changed _c_find_near_matches_generic_linear_programming to use the new KMP

This commit is contained in:
Tal Einat 2014-05-11 02:40:14 +03:00
parent e1a7b0cc30
commit 76ee4e0715
4 changed files with 790 additions and 1085 deletions

File diff suppressed because it is too large Load Diff

View File

@ -2,8 +2,18 @@ from sys import maxint
import six import six
from fuzzysearch.common import Match from fuzzysearch.common import Match
from libc.stdlib cimport malloc, free, realloc from libc.stdlib cimport malloc, free, realloc
from libc.string cimport strstr, strncpy
# C declarations for the substring-search helpers exposed by kmp.h.
cdef extern from "kmp.h":
struct KMPstate:
pass # no need to specify the fields if they aren't accessed directly
# Called once per pattern before KMP_init, with a caller-allocated kmpNext
# buffer of subsequence_len ints; presumably fills in the KMP failure
# table -- TODO confirm against kmp.c.
void preKMP(const char *subsequence, int subsequence_len, int *kmpNext)
# Builds the state for searching for subseq within seq; both lengths are
# passed explicitly alongside the pointers.
KMPstate KMP_init(const char *subseq, int subseq_len,
const char *seq, int seq_len,
int *kmpNext)
# Callers treat the result as a pointer into seq at the next match, and
# stop iterating once it is NULL.
const char* KMP_find_next(KMPstate *kmp_state)
__all__ = [ __all__ = [
'c_find_near_matches_generic_linear_programming', 'c_find_near_matches_generic_linear_programming',
@ -47,8 +57,8 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
if not subsequence: if not subsequence:
raise ValueError('Given subsequence is empty!') raise ValueError('Given subsequence is empty!')
c_subsequence = <char *>subsequence cdef const char *c_subsequence = subsequence
c_sequence = <char *>sequence cdef const char *c_sequence = sequence
return _c_find_near_matches_generic_linear_programming( return _c_find_near_matches_generic_linear_programming(
c_subsequence, len(subsequence), c_subsequence, len(subsequence),
@ -59,9 +69,12 @@ def c_find_near_matches_generic_linear_programming(subsequence, sequence,
max_l_dist if max_l_dist is not None else (1<<29), max_l_dist if max_l_dist is not None else (1<<29),
) )
def _c_find_near_matches_generic_linear_programming( # The following MUST be a cdef, otherwise Cython copies the sequence and
char* subsequence, size_t subseq_len, # subsequence strings, which means if they contain null bytes the data after
char* sequence, size_t seq_len, # the first null byte will not be copied.
cdef _c_find_near_matches_generic_linear_programming(
const char* subsequence, size_t subseq_len,
const char* sequence, size_t seq_len,
unsigned int max_substitutions, unsigned int max_substitutions,
unsigned int max_insertions, unsigned int max_insertions,
unsigned int max_deletions, unsigned int max_deletions,
@ -90,11 +103,12 @@ def _c_find_near_matches_generic_linear_programming(
matches = [] matches = []
cdef size_t index cdef size_t index
cdef char charchar cdef char seq_char
try: try:
index = 0 index = 0
have_realloced = False have_realloced = False
for charchar in sequence[:seq_len]: for seq_char in sequence[:seq_len]:
candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0) candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
n_candidates += 1 n_candidates += 1
@ -110,7 +124,7 @@ def _c_find_near_matches_generic_linear_programming(
have_realloced = True have_realloced = True
# if this sequence char is the candidate's next expected char # if this sequence char is the candidate's next expected char
if charchar == subsequence[cand.subseq_index]: if seq_char == subsequence[cand.subseq_index]:
# if reached the end of the subsequence, return a match # if reached the end of the subsequence, return a match
if cand.subseq_index == subseq_len_minus_one: if cand.subseq_index == subseq_len_minus_one:
matches.append(Match(cand.start, index + 1, cand.l_dist)) matches.append(Match(cand.start, index + 1, cand.l_dist))
@ -181,7 +195,7 @@ def _c_find_near_matches_generic_linear_programming(
# otherwise, if skipping n_skipped sub-sequence chars # otherwise, if skipping n_skipped sub-sequence chars
# reaches a sub-sequence char identical to this sequence # reaches a sub-sequence char identical to this sequence
# char ... # char ...
elif charchar == subsequence[cand.subseq_index + n_skipped]: elif seq_char == subsequence[cand.subseq_index + n_skipped]:
# if this is the last char of the sub-sequence, yield # if this is the last char of the sub-sequence, yield
# a match # a match
if cand.subseq_index + n_skipped + 1 == subseq_len: if cand.subseq_index + n_skipped + 1 == subseq_len:
@ -269,35 +283,32 @@ def c_find_near_matches_generic_ngrams(subsequence, sequence,
c_max_substitutions + c_max_insertions + c_max_deletions, c_max_substitutions + c_max_insertions + c_max_deletions,
) )
cdef char* c_sequence = sequence cdef const char* c_sequence = sequence
cdef char* c_subsequence = subsequence cdef const char* c_subsequence = subsequence
cdef char* ngram_str
cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1) cdef size_t ngram_len = _subseq_len // (c_max_l_dist + 1)
if ngram_len == 0: if ngram_len == 0:
raise ValueError('the subsequence length must be greater than max_l_dist') raise ValueError('the subsequence length must be greater than max_l_dist')
ngram_str = <char *> malloc((ngram_len + 1) * sizeof(char))
if ngram_str is NULL:
raise MemoryError()
cdef int index, small_search_start_index cdef int index, small_search_start_index
cdef size_t ngram_start cdef size_t ngram_start
cdef char *match_ptr
matches = [] cdef const char *match_ptr
cdef int *kmpNext
cdef KMPstate kmp_state
kmpNext = <int *> malloc(ngram_len * sizeof(int))
if kmpNext is NULL:
raise MemoryError()
try: try:
ngram_str[ngram_len] = 0 matches = []
for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len): for ngram_start in xrange(0, _subseq_len - ngram_len + 1, ngram_len):
strncpy(ngram_str, c_subsequence + ngram_start, ngram_len) preKMP(c_subsequence + ngram_start, ngram_len, kmpNext)
# TODO: handle null characters properly! kmp_state = KMP_init(c_subsequence + ngram_start, ngram_len, c_sequence, _seq_len, kmpNext)
match_ptr = strstr(c_sequence, ngram_str) match_ptr = KMP_find_next(&kmp_state)
while match_ptr != NULL: while match_ptr != NULL:
index = (match_ptr - c_sequence) small_search_start_index = (match_ptr - c_sequence) - ngram_start - c_max_l_dist
small_search_start_index = index - ngram_start - c_max_l_dist
small_search_length = _subseq_len + (2 * c_max_l_dist) small_search_length = _subseq_len + (2 * c_max_l_dist)
if small_search_start_index < 0: if small_search_start_index < 0:
small_search_length += small_search_start_index small_search_length += small_search_start_index
@ -315,9 +326,9 @@ def c_find_near_matches_generic_ngrams(subsequence, sequence,
start=match.start + small_search_start_index, start=match.start + small_search_start_index,
end=match.end + small_search_start_index, end=match.end + small_search_start_index,
)) ))
match_ptr = strstr(match_ptr + 1, ngram_str) match_ptr = KMP_find_next(&kmp_state)
finally: finally:
free(ngram_str) free(kmpNext)
return matches return matches

View File

@ -27,7 +27,8 @@ _common_module = Extension(
) )
_generic_search_module = Extension( _generic_search_module = Extension(
'fuzzysearch._generic_search', 'fuzzysearch._generic_search',
sources=['fuzzysearch/_generic_search.c'], sources=['fuzzysearch/_generic_search.c', 'fuzzysearch/kmp.c'],
include_dirs=['.'],
) )
setup( setup(

View File

@ -130,6 +130,17 @@ class TestGenericSearchBase(object):
[Match(start=3, end=5, dist=1)], [Match(start=3, end=5, dist=1)],
) )
# Exact-match searches (all four limits are 0) must still find the pattern
# when the data contains embedded null bytes.
def test_null_bytes(self):
# A null byte in the sequence before the match.
self.assertEqual(
self.search('abc', 'xx\0abcxx', 0, 0, 0, 0),
[Match(start=3, end=6, dist=0)],
)
# A null byte inside both the pattern and the matching part of the
# sequence.
self.assertEqual(
self.search('a\0b', 'xxa\0bcxx', 0, 0, 0, 0),
[Match(start=2, end=5, dist=0)],
)
class TestGenericSearch(TestGenericSearchBase, unittest.TestCase): class TestGenericSearch(TestGenericSearchBase, unittest.TestCase):
def search(self, pattern, sequence, max_subs, max_ins, max_dels, def search(self, pattern, sequence, max_subs, max_ins, max_dels,