2014-03-28 10:52:47 +00:00
|
|
|
import random
|
|
|
|
|
2014-05-16 09:18:05 +00:00
|
|
|
from fuzzysearch import find_near_matches
|
2014-03-15 17:36:13 +00:00
|
|
|
from fuzzysearch.levenshtein import \
|
|
|
|
find_near_matches_levenshtein_linear_programming
|
|
|
|
from fuzzysearch.levenshtein_ngram import \
|
|
|
|
find_near_matches_levenshtein_ngrams as fnm_levenshtein_ngrams
|
2014-04-12 15:20:10 +00:00
|
|
|
from fuzzysearch.substitutions_only import \
|
2014-03-15 17:36:13 +00:00
|
|
|
find_near_matches_substitutions_ngrams as fnm_substitutions_ngrams, \
|
2015-02-13 11:08:47 +00:00
|
|
|
find_near_matches_substitutions_lp, \
|
2014-03-28 17:18:35 +00:00
|
|
|
has_near_match_substitutions_ngrams
|
2014-04-12 16:48:04 +00:00
|
|
|
from fuzzysearch._substitutions_only import \
|
2015-02-13 11:08:47 +00:00
|
|
|
substitutions_only_has_near_matches_lp_byteslike, \
|
2014-05-16 09:18:05 +00:00
|
|
|
substitutions_only_has_near_matches_ngrams_byteslike
|
2014-03-28 09:51:32 +00:00
|
|
|
from fuzzysearch.generic_search import \
|
2014-03-28 10:52:47 +00:00
|
|
|
find_near_matches_generic_linear_programming, \
|
|
|
|
find_near_matches_generic_ngrams, has_near_match_generic_ngrams
|
2014-03-28 09:51:32 +00:00
|
|
|
from fuzzysearch._generic_search import \
|
2014-05-16 09:18:05 +00:00
|
|
|
c_find_near_matches_generic_linear_programming as \
|
2014-03-28 09:51:32 +00:00
|
|
|
find_near_matches_generic_linear_programming_cython
|
2014-03-12 10:00:21 +00:00
|
|
|
|
|
|
|
|
2014-03-15 17:36:13 +00:00
|
|
|
def fnm_levenshtein_lp(subsequence, sequence, max_l_dist):
    """Run the Levenshtein linear-programming search and collect its results.

    Forwards the three arguments unchanged to
    ``find_near_matches_levenshtein_linear_programming`` and materializes
    whatever it yields into a list, so the full search cost is paid inside
    this call rather than deferred to the consumer.
    """
    matches = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    return list(matches)
|
|
|
|
|
|
|
|
def fnm_substitutions_lp(subsequence, sequence, max_substitutions):
    """Run the substitutions-only LP search and collect its results.

    Forwards the three arguments unchanged to
    ``find_near_matches_substitutions_lp`` and materializes its output
    into a list before returning.
    """
    matches = find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions)
    return list(matches)
|
|
|
|
|
2014-03-28 09:51:32 +00:00
|
|
|
def fnm_generic_lp(subsequence, sequence, max_l_dist):
    """Run the generic linear-programming search with one shared limit.

    ``max_l_dist`` is supplied as each of the four trailing positional
    arguments of ``find_near_matches_generic_linear_programming``.
    NOTE(review): those are presumably the substitution / insertion /
    deletion / total-distance limits -- confirm against generic_search.

    Returns the search output materialized into a list.
    """
    # The same bound is used for all four limit parameters.
    limits = (max_l_dist,) * 4
    matches = find_near_matches_generic_linear_programming(
        subsequence, sequence, *limits)
    return list(matches)
|
|
|
|
|
|
|
|
def fnm_generic_lp_cython(subsequence, sequence, max_l_dist):
    """Run the ``_generic_search`` variant of the generic LP search.

    Calls ``find_near_matches_generic_linear_programming_cython`` (the
    module-level alias of ``c_find_near_matches_generic_linear_programming``)
    with ``max_l_dist`` supplied as each of the four trailing positional
    arguments, and returns its output materialized into a list.
    """
    # The same bound is used for all four limit parameters.
    limits = (max_l_dist,) * 4
    matches = find_near_matches_generic_linear_programming_cython(
        subsequence, sequence, *limits)
    return list(matches)
|
|
|
|
|
2014-03-28 10:52:47 +00:00
|
|
|
def fnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Run the generic n-grams search with one shared limit.

    ``max_l_dist`` is supplied as each of the four trailing positional
    arguments of ``find_near_matches_generic_ngrams``; the output is
    materialized into a list before returning.
    """
    # The same bound is used for all four limit parameters.
    limits = (max_l_dist,) * 4
    matches = find_near_matches_generic_ngrams(
        subsequence, sequence, *limits)
    return list(matches)
|
|
|
|
|
|
|
|
def hnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Call ``has_near_match_generic_ngrams`` with one shared limit.

    ``max_l_dist`` is supplied as each of the four trailing positional
    arguments. The callee's result is returned as-is (not wrapped in a
    list, unlike the ``fnm_*`` helpers).
    """
    # The same bound is used for all four limit parameters.
    limits = (max_l_dist,) * 4
    found = has_near_match_generic_ngrams(subsequence, sequence, *limits)
    return found
|
|
|
|
|
2014-05-16 09:18:05 +00:00
|
|
|
def hnm_substitutions_ngrams(subsequence, sequence, max_substitutions):
    """Call ``has_near_match_substitutions_ngrams`` and return its result.

    The three arguments are forwarded positionally and unchanged; the
    callee's return value is passed back without modification.
    """
    found = has_near_match_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
    return found
|
2014-03-28 17:18:35 +00:00
|
|
|
|
2014-05-16 09:18:05 +00:00
|
|
|
def hnm_substitutions_byteslike(subsequence, sequence, max_substitutions):
    """Call ``substitutions_only_has_near_matches_lp_byteslike``.

    The three arguments are forwarded positionally and unchanged; the
    callee's return value is passed back without modification.
    NOTE(review): the callee's name suggests it expects bytes-like
    inputs -- confirm against fuzzysearch._substitutions_only.
    """
    found = substitutions_only_has_near_matches_lp_byteslike(
        subsequence, sequence, max_substitutions)
    return found
|
|
|
|
|
|
|
|
def hnm_substitutions_ngrams_byteslike(subsequence, sequence,
                                       max_substitutions):
    """Call ``substitutions_only_has_near_matches_ngrams_byteslike``.

    The three arguments are forwarded positionally and unchanged; the
    callee's return value is passed back without modification.
    NOTE(review): the callee's name suggests it expects bytes-like
    inputs -- confirm against fuzzysearch._substitutions_only.
    """
    found = substitutions_only_has_near_matches_ngrams_byteslike(
        subsequence, sequence, max_substitutions)
    return found
|
2014-04-12 16:48:04 +00:00
|
|
|
|
2014-03-12 10:00:21 +00:00
|
|
|
|
|
|
|
# Registry of benchmarkable search callables, keyed by the name used to
# select them (see get_benchmark).  Every key is a valid identifier, so the
# mapping is spelled with keyword arguments.
search_functions = dict(
    # Top-level entry point.
    fnm=find_near_matches,
    # Searches returning lists of matches.
    levenshtein_lp=fnm_levenshtein_lp,
    levenshtein_ngrams=fnm_levenshtein_ngrams,
    substitutions_lp=fnm_substitutions_lp,
    substitutions_ngrams=fnm_substitutions_ngrams,
    generic_lp=fnm_generic_lp,
    generic_lp_cython=fnm_generic_lp_cython,
    generic_ngrams=fnm_generic_ngrams,
    # "has match" variants, which return the callee's result directly.
    has_match_generic_ngrams=hnm_generic_ngrams,
    has_match_substitutions_ngrams=hnm_substitutions_ngrams,
    has_match_substitutions_byteslike=hnm_substitutions_byteslike,
    has_match_substitutions_ngrams_byteslike=hnm_substitutions_ngrams_byteslike,
)
|
|
|
|
|
|
|
|
# Benchmark inputs, keyed by benchmark name.  Each entry holds the
# ``subsequence`` to look for, the ``sequence`` to search in, and a generic
# ``max_dist`` limit that get_benchmark() renames to the keyword expected by
# the selected search function.
#
# ``range`` is used instead of ``xrange`` so the module can be imported on
# Python 3 as well as Python 2; the lengths involved are tiny, so building a
# list on Python 2 costs nothing measurable.
#
# NOTE: the "random_*" inputs are generated with ``random.choice`` when this
# module is imported and no seed is set here, so they differ between runs
# unless the caller seeds ``random`` beforehand.
benchmarks = {
    'dna_no_match': dict(
        subsequence='GCTAGCTAGCTA',
        sequence='ATCG' * (10**3),
        max_dist=1,
    ),
    'dna_no_match2': dict(
        subsequence='ATGATGATG',
        sequence='ATCG' * (10**3),
        max_dist=2,
    ),
    'random_kevin': dict(
        # 36 random bases searched for in 90 characters where 'N' is drawn
        # with weight 1 against weight 5 for each of A/T/C/G.
        subsequence=''.join(random.choice('ATCG') for _ in range(36)),
        sequence=''.join(
            random.choice('ATCG' * 5 + 'N') for _ in range(90)),
        max_dist=3,
    ),
    'random_kevin_partial_match': dict(
        # Same shape as 'random_kevin', but both strings share a fixed
        # 9-character prefix; total lengths stay 36 and 90.
        subsequence='AAGTCTAGT' + ''.join(
            random.choice('ATCG') for _ in range(36 - 9)),
        sequence='AAGTCTAGT' + ''.join(
            random.choice('ATCG' * 5 + 'N') for _ in range(90 - 9)),
        max_dist=3,
    ),
}
|
|
|
|
|
2014-03-15 17:36:13 +00:00
|
|
|
|
2014-03-28 09:51:32 +00:00
|
|
|
def get_benchmark(search_func_name, benchmark_name):
    """Look up a search function and build the keyword arguments for it.

    The benchmark's parameters are copied, and its generic ``max_dist``
    entry is renamed to the keyword the chosen function takes:
    ``max_l_dist`` or ``max_substitutions``.

    Returns:
        A ``(search_func, search_args)`` tuple suitable for
        ``run_benchmark``.

    Raises:
        KeyError: if either name is missing from ``search_functions`` /
            ``benchmarks``.
        Exception: if the function is registered but belongs to neither
            keyword group below.
    """
    search_func = search_functions[search_func_name]
    # Copy so that renaming the key never mutates the shared definition.
    search_args = dict(benchmarks[benchmark_name])

    takes_max_l_dist = (
        find_near_matches,
        fnm_levenshtein_ngrams,
        fnm_levenshtein_lp,
        fnm_generic_lp,
        fnm_generic_lp_cython,
        fnm_generic_ngrams,
        hnm_generic_ngrams,
    )
    takes_max_substitutions = (
        fnm_substitutions_ngrams,
        fnm_substitutions_lp,
        hnm_substitutions_ngrams,
        hnm_substitutions_byteslike,
        hnm_substitutions_ngrams_byteslike,
    )

    if search_func in takes_max_l_dist:
        limit_keyword = 'max_l_dist'
    elif search_func in takes_max_substitutions:
        limit_keyword = 'max_substitutions'
    else:
        raise Exception('Unsupported search function: %r' % search_func)

    search_args[limit_keyword] = search_args.pop('max_dist')
    return search_func, search_args
|
|
|
|
|
|
|
|
|
|
|
|
def run_benchmark(search_func, search_args):
    """Invoke ``search_func`` with ``search_args`` expanded as keywords.

    Takes the two values returned by ``get_benchmark`` and returns whatever
    the search function returns.
    """
    outcome = search_func(**search_args)
    return outcome
|