greatly optimized Cython search function + fixed and added benchmarks
This commit is contained in:
parent
2863b57236
commit
8b65f29b3f
|
@ -5,6 +5,14 @@ from fuzzysearch.levenshtein_ngram import \
|
|||
from fuzzysearch.susbstitutions_only import \
|
||||
find_near_matches_substitutions_ngrams as fnm_substitutions_ngrams, \
|
||||
find_near_matches_substitutions_linear_programming
|
||||
from fuzzysearch.generic_search import \
|
||||
find_near_matches_generic_linear_programming
|
||||
import pyximport
|
||||
pyximport.install()
|
||||
from fuzzysearch._generic_search import \
|
||||
find_near_matches_generic_linear_programming as \
|
||||
find_near_matches_generic_linear_programming_cython
|
||||
import random
|
||||
|
||||
|
||||
def fnm_levenshtein_lp(subsequence, sequence, max_l_dist):
|
||||
|
@ -15,32 +23,56 @@ def fnm_substitutions_lp(subsequence, sequence, max_substitutions):
|
|||
return list(find_near_matches_substitutions_linear_programming(
|
||||
subsequence, sequence, max_substitutions))
|
||||
|
||||
def fnm_generic_lp(subsequence, sequence, max_l_dist):
|
||||
return list(find_near_matches_generic_linear_programming(
|
||||
subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist))
|
||||
|
||||
def fnm_generic_lp_cython(subsequence, sequence, max_l_dist):
|
||||
return list(find_near_matches_generic_linear_programming_cython(
|
||||
subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist))
|
||||
|
||||
|
||||
search_functions = {
|
||||
'levenshtein_lp': fnm_levenshtein_lp,
|
||||
'levenshtein_ngrams': fnm_levenshtein_ngrams,
|
||||
'substitutions_lp': fnm_substitutions_lp,
|
||||
'substitutions_ngrams': fnm_substitutions_ngrams,
|
||||
'generic_lp': fnm_generic_lp,
|
||||
'generic_lp_cython': fnm_generic_lp_cython,
|
||||
}
|
||||
|
||||
benchmarks = {
|
||||
'dna_no_match': dict(
|
||||
subsequence = 'GCTAGCTAGCTA',
|
||||
sequence = '"ATCG" * (10**3)',
|
||||
sequence = "ATCG" * (10**3),
|
||||
max_dist = 1,
|
||||
),
|
||||
'dna_no_match2': dict(
|
||||
subsequence = 'ATGATGATG',
|
||||
sequence = 'ATCG' * (10**3),
|
||||
max_dist = 2,
|
||||
),
|
||||
'random_kevin': dict(
|
||||
subsequence = ''.join(random.choice('ATCG') for _i in xrange(36)),
|
||||
sequence = ''.join(random.choice('ATCG' * 5 + 'N') for _i in xrange(90)),
|
||||
max_dist = 3,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def run_benchmark(search_func_name, benchmark_name):
|
||||
def get_benchmark(search_func_name, benchmark_name):
|
||||
search_func = search_functions[search_func_name]
|
||||
search_args = dict(benchmarks[benchmark_name])
|
||||
|
||||
if search_func in (fnm_levenshtein_ngrams, fnm_levenshtein_lp):
|
||||
if search_func in (fnm_levenshtein_ngrams, fnm_levenshtein_lp, fnm_generic_lp, fnm_generic_lp_cython):
|
||||
search_args['max_l_dist'] = search_args.pop('max_dist')
|
||||
elif search_func in (fnm_substitutions_ngrams, fnm_substitutions_lp):
|
||||
search_args['max_substitutions'] = search_args.pop('max_dist')
|
||||
else:
|
||||
raise Exception('Unsupported search function: %r' % search_func)
|
||||
|
||||
return search_func, search_args
|
||||
|
||||
|
||||
def run_benchmark(search_func, search_args):
|
||||
return search_func(**search_args)
|
||||
|
|
|
@ -4,4 +4,4 @@ import timeit
|
|||
args = sys.argv[1:]
|
||||
search_func_name, benchmark_name = args
|
||||
|
||||
timeit.main(['-s', 'from benchmarks import run_benchmark', 'run_benchmark(%r, %r)' % (search_func_name, benchmark_name)])
|
||||
timeit.main(['-s', 'from benchmarks import get_benchmark, run_benchmark; search_func, search_args = get_benchmark(%r, %r)' % (search_func_name, benchmark_name), 'run_benchmark(search_func, search_args)'])
|
||||
|
|
|
@ -1,9 +1,23 @@
|
|||
from fuzzysearch.common import Match
|
||||
from libc.stdlib cimport malloc, free, realloc
|
||||
|
||||
|
||||
__all__ = ['find_near_matches_generic_linear_programming']
|
||||
|
||||
|
||||
cdef struct GenericSearchCandidate:
|
||||
int start, subseq_index, l_dist, n_subs, n_ins, n_dels
|
||||
|
||||
|
||||
ALLOWED_TYPES = (str,)
|
||||
try:
|
||||
from Bio.Seq import Seq
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
ALLOWED_TYPES += (Seq,)
|
||||
|
||||
|
||||
def find_near_matches_generic_linear_programming(subsequence, sequence,
|
||||
max_substitutions,
|
||||
max_insertions,
|
||||
|
@ -22,8 +36,14 @@ def find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
if not subsequence:
|
||||
raise ValueError('Given subsequence is empty!')
|
||||
|
||||
if not isinstance(sequence, ALLOWED_TYPES):
|
||||
raise TypeError('sequence is of invalid type %s' % type(subsequence))
|
||||
if not isinstance(subsequence, ALLOWED_TYPES):
|
||||
raise TypeError('subsequence is of invalid type %s' % type(subsequence))
|
||||
|
||||
# optimization: prepare some often used things in advance
|
||||
_subseq_len = len(subsequence)
|
||||
cdef int _subseq_len = len(subsequence)
|
||||
cdef int _subseq_len_minus_one = _subseq_len - 1
|
||||
|
||||
maxes_sum = sum(
|
||||
(x if x is not None else 0)
|
||||
|
@ -32,125 +52,166 @@ def find_near_matches_generic_linear_programming(subsequence, sequence,
|
|||
if max_l_dist is None or max_l_dist >= maxes_sum:
|
||||
max_l_dist = maxes_sum
|
||||
|
||||
cdef GenericSearchCandidate[1000] _candidates1
|
||||
cdef GenericSearchCandidate[1000] _candidates2
|
||||
cdef GenericSearchCandidate* candidates = _candidates1
|
||||
cdef GenericSearchCandidate* new_candidates = _candidates2
|
||||
cdef c_max_l_dist = max_l_dist
|
||||
cdef c_max_substitutions = max_substitutions
|
||||
cdef c_max_insertions = max_insertions
|
||||
cdef c_max_deletions = max_deletions
|
||||
|
||||
cdef alloc_size
|
||||
cdef GenericSearchCandidate* candidates
|
||||
cdef GenericSearchCandidate* new_candidates
|
||||
cdef GenericSearchCandidate* _tmp
|
||||
cdef GenericSearchCandidate cand
|
||||
cdef int n_candidates = 0
|
||||
cdef int n_new_candidates = 0
|
||||
cdef int n_cand
|
||||
|
||||
for index, char in enumerate(sequence):
|
||||
candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
|
||||
n_candidates += 1
|
||||
cdef char* c_sequence = sequence
|
||||
cdef char* c_subsequence = subsequence
|
||||
cdef char char
|
||||
|
||||
for n_cand in xrange(n_candidates):
|
||||
cand = candidates[n_cand]
|
||||
# if this sequence char is the candidate's next expected char
|
||||
if char == subsequence[cand.subseq_index]:
|
||||
# if reached the end of the subsequence, return a match
|
||||
if cand.subseq_index + 1 == _subseq_len:
|
||||
yield Match(cand.start, index + 1, cand.l_dist)
|
||||
# otherwise, update the candidate's subseq_index and keep it
|
||||
else:
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1,
|
||||
cand.l_dist, cand.n_subs,
|
||||
cand.n_ins, cand.n_dels,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
alloc_size = min(10, _subseq_len * 3 + 1)
|
||||
candidates = <GenericSearchCandidate *> malloc(alloc_size * sizeof(GenericSearchCandidate))
|
||||
if candidates is NULL:
|
||||
raise MemoryError()
|
||||
new_candidates = <GenericSearchCandidate *> malloc(alloc_size * sizeof(GenericSearchCandidate))
|
||||
if candidates is NULL:
|
||||
free(candidates)
|
||||
raise MemoryError()
|
||||
|
||||
# if this sequence char is *not* the candidate's next expected char
|
||||
else:
|
||||
# we can try skipping a sequence or sub-sequence char (or both),
|
||||
# unless this candidate has already skipped the maximum allowed
|
||||
# number of characters
|
||||
if cand.l_dist == max_l_dist:
|
||||
continue
|
||||
try:
|
||||
have_realloced = False
|
||||
for index in xrange(len(sequence)):
|
||||
char = c_sequence[index]
|
||||
|
||||
if cand.n_ins < max_insertions:
|
||||
# add a candidate skipping a sequence char
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index,
|
||||
cand.l_dist + 1, cand.n_subs,
|
||||
cand.n_ins + 1, cand.n_dels,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
|
||||
n_candidates += 1
|
||||
|
||||
if cand.subseq_index + 1 < _subseq_len:
|
||||
if cand.n_subs < max_substitutions:
|
||||
# add a candidate skipping both a sequence char and a
|
||||
# subsequence char
|
||||
for n_cand in xrange(n_candidates):
|
||||
cand = candidates[n_cand]
|
||||
|
||||
if n_new_candidates + 4 > alloc_size:
|
||||
alloc_size += alloc_size // 2
|
||||
_tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
|
||||
if _tmp is NULL:
|
||||
raise MemoryError()
|
||||
new_candidates = _tmp
|
||||
have_realloced = True
|
||||
|
||||
# if this sequence char is the candidate's next expected char
|
||||
if char == c_subsequence[cand.subseq_index]:
|
||||
# if reached the end of the subsequence, return a match
|
||||
if cand.subseq_index == _subseq_len_minus_one:
|
||||
yield Match(cand.start, index + 1, cand.l_dist)
|
||||
# otherwise, update the candidate's subseq_index and keep it
|
||||
else:
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1,
|
||||
cand.l_dist + 1, cand.n_subs + 1,
|
||||
cand.l_dist, cand.n_subs,
|
||||
cand.n_ins, cand.n_dels,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
elif cand.n_dels < max_deletions and cand.n_ins < max_insertions:
|
||||
# add a candidate skipping both a sequence char and a
|
||||
# subsequence char
|
||||
|
||||
# if this sequence char is *not* the candidate's next expected char
|
||||
else:
|
||||
# we can try skipping a sequence or sub-sequence char (or both),
|
||||
# unless this candidate has already skipped the maximum allowed
|
||||
# number of characters
|
||||
if cand.l_dist == c_max_l_dist:
|
||||
continue
|
||||
|
||||
if cand.n_ins < c_max_insertions:
|
||||
# add a candidate skipping a sequence char
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1,
|
||||
cand.start, cand.subseq_index,
|
||||
cand.l_dist + 1, cand.n_subs,
|
||||
cand.n_ins + 1, cand.n_dels + 1,
|
||||
cand.n_ins + 1, cand.n_dels,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
else:
|
||||
# cand.subseq_index == _subseq_len - 1
|
||||
if (
|
||||
cand.n_subs < max_substitutions or
|
||||
(
|
||||
cand.n_dels < max_deletions and
|
||||
cand.n_ins < max_insertions
|
||||
)
|
||||
):
|
||||
yield Match(cand.start, index + 1, cand.l_dist + 1)
|
||||
|
||||
# try skipping subsequence chars
|
||||
for n_skipped in xrange(1, min(max_deletions - cand.n_dels, max_l_dist - cand.l_dist) + 1):
|
||||
# if skipping n_dels sub-sequence chars reaches the end
|
||||
# of the sub-sequence, yield a match
|
||||
if cand.subseq_index + n_skipped == _subseq_len:
|
||||
yield Match(cand.start, index + 1,
|
||||
cand.l_dist + n_skipped)
|
||||
break
|
||||
# otherwise, if skipping n_skipped sub-sequence chars
|
||||
# reaches a sub-sequence char identical to this sequence
|
||||
# char ...
|
||||
elif subsequence[cand.subseq_index + n_skipped] == char:
|
||||
# if this is the last char of the sub-sequence, yield
|
||||
# a match
|
||||
if cand.subseq_index + n_skipped + 1 == _subseq_len:
|
||||
yield Match(cand.start, index + 1,
|
||||
cand.l_dist + n_skipped)
|
||||
# otherwise add a candidate skipping n_skipped
|
||||
# subsequence chars
|
||||
else:
|
||||
if cand.subseq_index + 1 < _subseq_len:
|
||||
if cand.n_subs < c_max_substitutions:
|
||||
# add a candidate skipping both a sequence char and a
|
||||
# subsequence char
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1 + n_skipped,
|
||||
cand.l_dist + n_skipped, cand.n_subs,
|
||||
cand.n_ins, cand.n_dels + n_skipped,
|
||||
cand.start, cand.subseq_index + 1,
|
||||
cand.l_dist + 1, cand.n_subs + 1,
|
||||
cand.n_ins, cand.n_dels,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
break
|
||||
# note: if the above loop ends without a break, that means that
|
||||
# no candidate could be added / yielded by skipping sub-sequence
|
||||
# chars
|
||||
elif cand.n_dels < c_max_deletions and cand.n_ins < c_max_insertions:
|
||||
# add a candidate skipping both a sequence char and a
|
||||
# subsequence char
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1,
|
||||
cand.l_dist + 1, cand.n_subs,
|
||||
cand.n_ins + 1, cand.n_dels + 1,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
else:
|
||||
# cand.subseq_index == _subseq_len - 1
|
||||
if (
|
||||
cand.n_subs < c_max_substitutions or
|
||||
(
|
||||
cand.n_dels < c_max_deletions and
|
||||
cand.n_ins < c_max_insertions
|
||||
)
|
||||
):
|
||||
yield Match(cand.start, index + 1, cand.l_dist + 1)
|
||||
|
||||
# new_candidates = candidates; candidates = []
|
||||
_tmp = candidates
|
||||
candidates = new_candidates
|
||||
new_candidates = _tmp
|
||||
n_candidates = n_new_candidates
|
||||
n_new_candidates = 0
|
||||
# try skipping subsequence chars
|
||||
for n_skipped in xrange(1, min(c_max_deletions - cand.n_dels, c_max_l_dist - cand.l_dist) + 1):
|
||||
# if skipping n_dels sub-sequence chars reaches the end
|
||||
# of the sub-sequence, yield a match
|
||||
if cand.subseq_index + n_skipped == _subseq_len:
|
||||
yield Match(cand.start, index + 1,
|
||||
cand.l_dist + n_skipped)
|
||||
break
|
||||
# otherwise, if skipping n_skipped sub-sequence chars
|
||||
# reaches a sub-sequence char identical to this sequence
|
||||
# char ...
|
||||
elif char == c_subsequence[cand.subseq_index + n_skipped]:
|
||||
# if this is the last char of the sub-sequence, yield
|
||||
# a match
|
||||
if cand.subseq_index + n_skipped + 1 == _subseq_len:
|
||||
yield Match(cand.start, index + 1,
|
||||
cand.l_dist + n_skipped)
|
||||
# otherwise add a candidate skipping n_skipped
|
||||
# subsequence chars
|
||||
else:
|
||||
new_candidates[n_new_candidates] = GenericSearchCandidate(
|
||||
cand.start, cand.subseq_index + 1 + n_skipped,
|
||||
cand.l_dist + n_skipped, cand.n_subs,
|
||||
cand.n_ins, cand.n_dels + n_skipped,
|
||||
)
|
||||
n_new_candidates += 1
|
||||
break
|
||||
# note: if the above loop ends without a break, that means that
|
||||
# no candidate could be added / yielded by skipping sub-sequence
|
||||
# chars
|
||||
|
||||
for n_cand in xrange(n_candidates):
|
||||
cand = candidates[n_cand]
|
||||
# note: index + 1 == length(sequence)
|
||||
n_skipped = _subseq_len - cand.subseq_index
|
||||
if cand.n_dels + n_skipped <= max_deletions and \
|
||||
cand.l_dist + n_skipped <= max_l_dist:
|
||||
yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
|
||||
# new_candidates = candidates; candidates = []
|
||||
_tmp = candidates
|
||||
candidates = new_candidates
|
||||
new_candidates = _tmp
|
||||
n_candidates = n_new_candidates
|
||||
n_new_candidates = 0
|
||||
|
||||
if have_realloced:
|
||||
have_realloced = False
|
||||
_tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
|
||||
if _tmp is NULL:
|
||||
raise MemoryError()
|
||||
new_candidates = _tmp
|
||||
|
||||
for n_cand in xrange(n_candidates):
|
||||
cand = candidates[n_cand]
|
||||
# note: index + 1 == length(sequence)
|
||||
n_skipped = _subseq_len - cand.subseq_index
|
||||
if cand.n_dels + n_skipped <= c_max_deletions and \
|
||||
cand.l_dist + n_skipped <= c_max_l_dist:
|
||||
yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
|
||||
|
||||
finally:
|
||||
free(candidates)
|
||||
free(new_candidates)
|
||||
|
|
Loading…
Reference in New Issue