WIP
This commit is contained in:
parent
54552b8bba
commit
1375471eea
|
@ -26,10 +26,6 @@ fuzzysearch
|
|||
:target: https://pypi.python.org/pypi/fuzzysearch
|
||||
:alt: Supported Python versions
|
||||
|
||||
.. image:: https://img.shields.io/pypi/implementation/fuzzysearch.svg?style=flat
|
||||
:target: https://pypi.python.org/pypi/fuzzysearch
|
||||
:alt: Supported Python implementations
|
||||
|
||||
.. image:: https://img.shields.io/pypi/l/fuzzysearch.svg?style=flat
|
||||
:target: https://pypi.python.org/pypi/fuzzysearch/
|
||||
:alt: License
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,116 +0,0 @@
|
|||
import random
|
||||
|
||||
from fuzzysearch import find_near_matches
|
||||
from fuzzysearch.levenshtein import \
|
||||
find_near_matches_levenshtein_linear_programming
|
||||
from fuzzysearch.levenshtein_ngram import \
|
||||
find_near_matches_levenshtein_ngrams as fnm_levenshtein_ngrams
|
||||
from fuzzysearch.substitutions_only import \
|
||||
find_near_matches_substitutions_ngrams as fnm_substitutions_ngrams, \
|
||||
find_near_matches_substitutions_lp, \
|
||||
has_near_match_substitutions_ngrams
|
||||
from fuzzysearch._substitutions_only import \
|
||||
substitutions_only_has_near_matches_lp_byteslike, \
|
||||
substitutions_only_has_near_matches_ngrams_byteslike
|
||||
from fuzzysearch.generic_search import \
|
||||
find_near_matches_generic_linear_programming, \
|
||||
find_near_matches_generic_ngrams, has_near_match_generic_ngrams
|
||||
from fuzzysearch._generic_search import \
|
||||
c_find_near_matches_generic_linear_programming as \
|
||||
find_near_matches_generic_linear_programming_cython
|
||||
|
||||
|
||||
def fnm_levenshtein_lp(subsequence, sequence, max_l_dist):
    """Run the Levenshtein linear-programming search; return matches as a list."""
    found = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    return list(found)


def fnm_substitutions_lp(subsequence, sequence, max_substitutions):
    """Run the substitutions-only LP search; return matches as a list."""
    found = find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions)
    return list(found)


def fnm_generic_lp(subsequence, sequence, max_l_dist):
    """Run the generic LP search with every edit limit set to max_l_dist."""
    found = find_near_matches_generic_linear_programming(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(found)


def fnm_generic_lp_cython(subsequence, sequence, max_l_dist):
    """Run the Cython generic LP search with every edit limit set to max_l_dist."""
    found = find_near_matches_generic_linear_programming_cython(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(found)


def fnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Run the generic n-gram search with every edit limit set to max_l_dist."""
    found = find_near_matches_generic_ngrams(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(found)


def hnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Delegate to has_near_match_generic_ngrams, all limits = max_l_dist."""
    return has_near_match_generic_ngrams(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)


def hnm_substitutions_ngrams(subsequence, sequence, max_substitutions):
    """Delegate to has_near_match_substitutions_ngrams."""
    return has_near_match_substitutions_ngrams(
        subsequence, sequence, max_substitutions)


def hnm_substitutions_byteslike(subsequence, sequence, max_substitutions):
    """Delegate to the byteslike LP substitutions-only existence check."""
    return substitutions_only_has_near_matches_lp_byteslike(
        subsequence, sequence, max_substitutions)


def hnm_substitutions_ngrams_byteslike(subsequence, sequence, max_substitutions):
    """Delegate to the byteslike n-gram substitutions-only existence check."""
    return substitutions_only_has_near_matches_ngrams_byteslike(
        subsequence, sequence, max_substitutions)
|
||||
|
||||
|
||||
# Registry of benchmarkable search callables, keyed by the name accepted on
# the command line (used for argparse `choices`).
search_functions = {
    'fnm': find_near_matches,
    'levenshtein_lp': fnm_levenshtein_lp,
    'levenshtein_ngrams': fnm_levenshtein_ngrams,
    'substitutions_lp': fnm_substitutions_lp,
    'substitutions_ngrams': fnm_substitutions_ngrams,
    'generic_lp': fnm_generic_lp,
    'generic_lp_cython': fnm_generic_lp_cython,
    'generic_ngrams': fnm_generic_ngrams,
    'has_match_generic_ngrams': hnm_generic_ngrams,
    'has_match_substitutions_ngrams': hnm_substitutions_ngrams,
    'has_match_substitutions_byteslike': hnm_substitutions_byteslike,
    'has_match_substitutions_ngrams_byteslike': hnm_substitutions_ngrams_byteslike,
}
|
||||
|
||||
# Named benchmark inputs.  Each entry holds keyword arguments for a search
# function; the generic 'max_dist' key is renamed by get_benchmark() to the
# parameter name the chosen function actually takes.
# BUG FIX: the original used `xrange`, which is undefined on Python 3
# (a later copy of this module imports it from fuzzysearch.compat);
# `range` is equivalent here apart from laziness.
# NOTE(review): `random` is unseeded, so the 'random_*' inputs differ on
# every run — consider seeding for reproducible benchmark numbers.
benchmarks = {
    'dna_no_match': dict(
        subsequence='GCTAGCTAGCTA',
        sequence="ATCG" * (10**3),
        max_dist=1,
    ),
    'dna_no_match2': dict(
        subsequence='ATGATGATG',
        sequence='ATCG' * (10**3),
        max_dist=2,
    ),
    'random_kevin': dict(
        subsequence=''.join(random.choice('ATCG') for _i in range(36)),
        sequence=''.join(random.choice('ATCG' * 5 + 'N') for _i in range(90)),
        max_dist=3,
    ),
    'random_kevin_partial_match': dict(
        subsequence='AAGTCTAGT' + ''.join(random.choice('ATCG') for _i in range(36-9)),
        sequence='AAGTCTAGT' + ''.join(random.choice('ATCG' * 5 + 'N') for _i in range(90-9)),
        max_dist=3,
    ),
}
|
||||
|
||||
|
||||
def get_benchmark(search_func_name, benchmark_name):
    """Resolve a registered search function and its benchmark arguments.

    Returns a ``(search_func, search_args)`` pair where the generic
    ``max_dist`` key of the benchmark has been renamed to the distance
    parameter the function expects.

    Raises:
        ValueError: if the function has no known distance parameter.
            (ValueError subclasses Exception, so existing callers that
            caught the original bare Exception still work.)
    """
    search_func = search_functions[search_func_name]
    # Copy so the shared benchmark definition isn't mutated by pop() below.
    search_args = dict(benchmarks[benchmark_name])

    # The original had two duplicate branches assigning 'max_l_dist';
    # they are merged into a single tuple here.
    max_l_dist_funcs = (
        find_near_matches,
        fnm_levenshtein_ngrams, fnm_levenshtein_lp,
        fnm_generic_lp, fnm_generic_lp_cython,
        fnm_generic_ngrams, hnm_generic_ngrams,
    )
    max_substitutions_funcs = (
        fnm_substitutions_ngrams, fnm_substitutions_lp,
        hnm_substitutions_ngrams, hnm_substitutions_byteslike,
        hnm_substitutions_ngrams_byteslike,
    )

    if search_func in max_l_dist_funcs:
        search_args['max_l_dist'] = search_args.pop('max_dist')
    elif search_func in max_substitutions_funcs:
        search_args['max_substitutions'] = search_args.pop('max_dist')
    else:
        raise ValueError('Unsupported search function: %r' % search_func)

    return search_func, search_args
|
||||
|
||||
|
||||
def run_benchmark(search_func, search_args):
    """Invoke *search_func* with *search_args* expanded as keyword arguments."""
    result = search_func(**search_args)
    return result
|
|
@ -1,45 +1,5 @@
|
|||
import textwrap
|
||||
import timeit
|
||||
import argparse
|
||||
from benchmarks import benchmarks, search_functions
|
||||
import sys
|
||||
|
||||
from .main import main
|
||||
|
||||
def print_results(timings, number, repeat, precision=3):
    """Report the fastest of *timings*, in the style of ``python -m timeit``.

    *timings* are total seconds per repetition; the per-loop time of the best
    repetition is printed in the largest unit that keeps it under 1000.
    """
    best = min(timings)
    per_loop = best * 1e6 / number  # microseconds per loop

    # Pick the first unit whose scaled value stays below 1000; fall back
    # to seconds.
    for unit, value in (('usec', per_loop),
                        ('msec', per_loop / 1000),
                        ('sec', per_loop / 1e6)):
        if value < 1000 or unit == 'sec':
            break

    print("%d loops, best of %d: %.*g %s per loop"
          % (number, repeat, precision, value, unit))
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Run fuzzysearch benchmarks.')
|
||||
|
||||
parser.add_argument('search_function', choices=search_functions)
|
||||
parser.add_argument('benchmark', choices=benchmarks)
|
||||
parser.add_argument('-r', '--repetitions', type=int, default=5,
|
||||
help='number of times to run the benchmark')
|
||||
parser.add_argument('-n', '--number', type=int, default=10**5,
|
||||
help='number of loop iterations to run in each repetition')
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
setup = textwrap.dedent('''\
|
||||
from benchmarks import get_benchmark, run_benchmark
|
||||
search_func, search_args = get_benchmark({search_function!r},
|
||||
{benchmark!r})
|
||||
''').format(**args.__dict__)
|
||||
|
||||
code = 'run_benchmark(search_func, search_args)'
|
||||
|
||||
timings = timeit.Timer(code, setup=setup).repeat(args.repetitions, args.number)
|
||||
print_results(timings, args.number, args.repetitions)
|
||||
sys.exit(main())
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
import contextlib
|
||||
import io
|
||||
import os.path
|
||||
|
||||
from fuzzysearch import find_near_matches_in_file
|
||||
|
||||
|
||||
# Path to the sample book searched by the "book" benchmark; the text file
# is expected to live alongside this module.
_THIS_DIR = os.path.dirname(__file__)
book_file_path = os.path.join(_THIS_DIR, 'The Adventures of Huckleberry Finn.txt')
|
||||
|
||||
|
||||
def search(substring, max_l_dist, as_binary=False):
    """Scan the book file for near-matches of *substring*.

    Opens the book as raw bytes when *as_binary* is true, otherwise as UTF-8
    text.  The match generator is exhausted and its results discarded — this
    function exists only to be timed.
    """
    book = (open(book_file_path, 'rb') if as_binary
            else io.open(book_file_path, encoding='utf-8'))

    # closing() guarantees the file is closed even if the search raises.
    with contextlib.closing(book):
        for _match in find_near_matches_in_file(substring, book,
                                                max_l_dist=max_l_dist):
            pass
|
|
@ -1,6 +1,19 @@
|
|||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import textwrap
|
||||
import timeit
|
||||
|
||||
print timeit.timeit(
|
||||
'find_near_matches_levenshtein_ngrams(pattern, text, 1)',
|
||||
setup='text = "ATCG" * (10**3); pattern = "GCTAGCTAGCTA"; from fuzzysearch import find_near_matches_levenshtein_ngrams',
|
||||
)
|
||||
|
||||
rc = timeit.main(args=(
|
||||
'-s', textwrap.dedent('''\
|
||||
from fuzzysearch.levenshtein_ngram import \
|
||||
find_near_matches_levenshtein_ngrams
|
||||
|
||||
text = "ATCG" * (10**7)
|
||||
pattern = "GCTAGCTAGCTA"
|
||||
'''),
|
||||
'list(find_near_matches_levenshtein_ngrams(pattern, text, 1))',
|
||||
))
|
||||
|
||||
sys.exit(rc)
|
|
@ -0,0 +1,91 @@
|
|||
from __future__ import print_function
|
||||
|
||||
import textwrap
|
||||
import timeit
|
||||
import argparse
|
||||
from .micro_benchmarks import benchmarks, search_functions
|
||||
|
||||
|
||||
def print_results(timings, number, repeat, precision=3):
    """Print the best per-loop time, mimicking ``python -m timeit`` output.

    *timings* holds total seconds for each repetition; only the minimum is
    reported, scaled to usec/msec/sec so the value stays under 1000.
    """
    best_per_loop = min(timings) * 1e6 / number  # microseconds

    if best_per_loop < 1000:
        scaled, unit = best_per_loop, "usec"
    elif best_per_loop < 1000 * 1000:
        scaled, unit = best_per_loop / 1000, "msec"
    else:
        scaled, unit = best_per_loop / 1000000, "sec"

    summary = "best of %d: %.*g %s per loop" % (repeat, precision, scaled, unit)
    print("%d loops, " % number + summary)
|
||||
|
||||
|
||||
def autorange(timer):
    """Find a loop count for *timer* that takes at least half a second.

    Tries powers of ten (1, 10, 100, ...) and returns the first count whose
    measured run takes >= 0.5s, capping the search at 10**9 loops.
    """
    number = 1
    while True:
        # Return on the first sufficiently-long run, or once the cap is hit
        # (the capped run is still timed, matching the original loop).
        if timer.timeit(number) >= 0.5 or number >= 10 ** 9:
            return number
        number *= 10
|
||||
|
||||
|
||||
def main():
    """Parse the command line, time the selected benchmark, print results.

    Sub-commands:
        micro -- run one of the registered micro-benchmarks.
        book  -- search for a substring through the bundled book text.

    Returns a process exit code: 0 on success (or Ctrl-C), 1 if the
    benchmarked code raised an exception.
    """
    parser = argparse.ArgumentParser(description='Run fuzzysearch benchmarks.')

    parser.add_argument('-r', '--repetitions', type=int, default=3,
                        help='number of times to run the benchmark')
    parser.add_argument('-n', '--number', type=int,
                        help='number of loop iterations to run in each repetition')

    subparsers = parser.add_subparsers(help='sub-command help', dest='subparser_name')
    # Without this, invoking with no sub-command would fall through to
    # timeit.Timer(None) below and crash with an unhelpful error.
    subparsers.required = True

    micro_parser = subparsers.add_parser('micro', help='micro-benchmarks')
    micro_parser.add_argument('search_function', choices=search_functions)
    micro_parser.add_argument('benchmark', choices=benchmarks)

    book_parser = subparsers.add_parser('book', help='search through the text of a long book')
    # BUG FIX: positional arguments are always required; passing
    # required=True to add_argument() for a positional raises TypeError.
    book_parser.add_argument('substring', type=str)
    book_parser.add_argument('max_l_dist', type=int)

    args = parser.parse_args()

    setup = None
    code = None

    if args.subparser_name == 'micro':
        setup = textwrap.dedent('''\
            from benchmarks.micro_benchmarks import get_benchmark, run_benchmark
            search_func, search_args = get_benchmark({search_function!r},
                                                     {benchmark!r})
        ''').format(**args.__dict__)
        code = 'run_benchmark(search_func, search_args)'
    elif args.subparser_name == 'book':
        setup = textwrap.dedent('''\
            from benchmarks.book import search
        ''')
        code = 'search({substring!r}, {max_l_dist!r})'.format(**args.__dict__)

    timer = timeit.Timer(code, setup=setup)
    try:
        if args.number is None:
            # Pick a loop count automatically, like `python -m timeit`.
            args.number = autorange(timer)
        timings = timer.repeat(args.repetitions, args.number)
    except KeyboardInterrupt:
        # A user abort is not a failure; falls through to return 0.
        print('Aborted!')
    except Exception:
        import traceback
        traceback.print_exc()
        return 1
    else:
        print_results(timings, args.number, args.repetitions)

    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run main() and propagate its exit code.
    import sys

    sys.exit(main())
|
|
@ -0,0 +1,119 @@
|
|||
import random
|
||||
|
||||
from fuzzysearch.common import LevenshteinSearchParams
|
||||
from fuzzysearch.compat import xrange
|
||||
|
||||
from fuzzysearch import find_near_matches
|
||||
from fuzzysearch.levenshtein import \
|
||||
find_near_matches_levenshtein_linear_programming
|
||||
from fuzzysearch.levenshtein_ngram import \
|
||||
find_near_matches_levenshtein_ngrams as fnm_levenshtein_ngrams
|
||||
from fuzzysearch.substitutions_only import \
|
||||
find_near_matches_substitutions_ngrams as fnm_substitutions_ngrams, \
|
||||
find_near_matches_substitutions_lp, \
|
||||
has_near_match_substitutions_ngrams
|
||||
from fuzzysearch._substitutions_only import \
|
||||
substitutions_only_has_near_matches_lp_byteslike, \
|
||||
substitutions_only_has_near_matches_ngrams_byteslike
|
||||
from fuzzysearch.generic_search import \
|
||||
find_near_matches_generic_linear_programming, \
|
||||
find_near_matches_generic_ngrams, has_near_match_generic_ngrams
|
||||
from fuzzysearch._generic_search import \
|
||||
c_find_near_matches_generic_linear_programming as \
|
||||
find_near_matches_generic_linear_programming_cython
|
||||
|
||||
|
||||
def fnm_levenshtein_lp(subsequence, sequence, max_l_dist):
    """Exhaust the Levenshtein LP search into a list of matches."""
    matches = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    return list(matches)


def fnm_substitutions_lp(subsequence, sequence, max_substitutions):
    """Exhaust the substitutions-only LP search into a list of matches."""
    matches = find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions)
    return list(matches)


def fnm_generic_lp(subsequence, sequence, max_l_dist):
    """Exhaust the generic LP search, every edit limit set to max_l_dist."""
    matches = find_near_matches_generic_linear_programming(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(matches)


def fnm_generic_lp_cython(subsequence, sequence, max_l_dist):
    """Exhaust the Cython generic LP search, every edit limit = max_l_dist."""
    matches = find_near_matches_generic_linear_programming_cython(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(matches)


def fnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Exhaust the generic n-gram search, every edit limit = max_l_dist."""
    params = LevenshteinSearchParams(
        max_l_dist, max_l_dist, max_l_dist, max_l_dist)
    return list(find_near_matches_generic_ngrams(subsequence, sequence, params))


def hnm_generic_ngrams(subsequence, sequence, max_l_dist):
    """Delegate to has_near_match_generic_ngrams, all limits = max_l_dist.

    NOTE(review): fnm_generic_ngrams above now passes a
    LevenshteinSearchParams object while this still passes four separate
    limits — confirm the two underlying functions really differ in signature.
    """
    return has_near_match_generic_ngrams(
        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist)


def hnm_substitutions_ngrams(subsequence, sequence, max_substitutions):
    """Delegate to has_near_match_substitutions_ngrams."""
    return has_near_match_substitutions_ngrams(
        subsequence, sequence, max_substitutions)


def hnm_substitutions_byteslike(subsequence, sequence, max_substitutions):
    """Delegate to the byteslike LP substitutions-only existence check."""
    return substitutions_only_has_near_matches_lp_byteslike(
        subsequence, sequence, max_substitutions)


def hnm_substitutions_ngrams_byteslike(subsequence, sequence, max_substitutions):
    """Delegate to the byteslike n-gram substitutions-only existence check."""
    return substitutions_only_has_near_matches_ngrams_byteslike(
        subsequence, sequence, max_substitutions)
|
||||
|
||||
|
||||
# Maps the name accepted on the command line (argparse `choices`) to the
# search callable that gets benchmarked.
search_functions = {
    'fnm': find_near_matches,
    'levenshtein_lp': fnm_levenshtein_lp,
    'levenshtein_ngrams': fnm_levenshtein_ngrams,
    'substitutions_lp': fnm_substitutions_lp,
    'substitutions_ngrams': fnm_substitutions_ngrams,
    'generic_lp': fnm_generic_lp,
    'generic_lp_cython': fnm_generic_lp_cython,
    'generic_ngrams': fnm_generic_ngrams,
    'has_match_generic_ngrams': hnm_generic_ngrams,
    'has_match_substitutions_ngrams': hnm_substitutions_ngrams,
    'has_match_substitutions_byteslike': hnm_substitutions_byteslike,
    'has_match_substitutions_ngrams_byteslike': hnm_substitutions_ngrams_byteslike,
}
|
||||
|
||||
# Named benchmark inputs.  Each entry holds keyword arguments for a search
# function; get_benchmark() renames the generic 'max_dist' key to the
# parameter name the chosen function expects.
# (`xrange` comes from fuzzysearch.compat, imported at the top of the module.)
benchmarks = {
    'dna_no_match': dict(
        subsequence='GCTAGCTAGCTA',
        sequence="ATCG" * (10**3),
        max_dist=1,
    ),
    'dna_no_match2': dict(
        subsequence='ATGATGATG',
        sequence='ATCG' * (10**3),
        max_dist=2,
    ),
    # NOTE(review): `random` is unseeded, so these inputs change every run.
    'random_kevin': dict(
        subsequence=''.join(random.choice('ATCG') for _i in xrange(36)),
        sequence=''.join(random.choice('ATCG' * 5 + 'N') for _i in xrange(90)),
        max_dist=3,
    ),
    'random_kevin_partial_match': dict(
        subsequence='AAGTCTAGT' + ''.join(random.choice('ATCG') for _i in xrange(36-9)),
        sequence='AAGTCTAGT' + ''.join(random.choice('ATCG' * 5 + 'N') for _i in xrange(90-9)),
        max_dist=3,
    ),
}
|
||||
|
||||
|
||||
def get_benchmark(search_func_name, benchmark_name):
    """Resolve a registered search function and its benchmark arguments.

    Returns a ``(search_func, search_args)`` pair where the generic
    ``max_dist`` key of the benchmark has been renamed to the distance
    parameter the function expects.

    Raises:
        ValueError: if the function has no known distance parameter.
            (ValueError subclasses Exception, so existing callers that
            caught the original bare Exception still work.)
    """
    search_func = search_functions[search_func_name]
    # Copy so the shared benchmark definition isn't mutated by pop() below.
    search_args = dict(benchmarks[benchmark_name])

    # The original had two duplicate branches assigning 'max_l_dist';
    # they are merged into a single tuple here.
    max_l_dist_funcs = (
        find_near_matches,
        fnm_levenshtein_ngrams, fnm_levenshtein_lp,
        fnm_generic_lp, fnm_generic_lp_cython,
        fnm_generic_ngrams, hnm_generic_ngrams,
    )
    max_substitutions_funcs = (
        fnm_substitutions_ngrams, fnm_substitutions_lp,
        hnm_substitutions_ngrams, hnm_substitutions_byteslike,
        hnm_substitutions_ngrams_byteslike,
    )

    if search_func in max_l_dist_funcs:
        search_args['max_l_dist'] = search_args.pop('max_dist')
    elif search_func in max_substitutions_funcs:
        search_args['max_substitutions'] = search_args.pop('max_dist')
    else:
        raise ValueError('Unsupported search function: %r' % search_func)

    return search_func, search_args
|
||||
|
||||
|
||||
def run_benchmark(search_func, search_args):
    """Call *search_func* with *search_args* expanded as keyword arguments."""
    outcome = search_func(**search_args)
    return outcome
|
Loading…
Reference in New Issue