greatly optimized Cython search function + fixed and added benchmarks

2014-03-28 12:51:32 +03:00 · 2014-03-28 12:51:32 +03:00 · 8b65f29b3f
parent 2863b57236
commit 8b65f29b3f
3 changed files with 195 additions and 102 deletions
--- a/benchmarks/init.py
+++ b/benchmarks/init.py
@ -5,6 +5,14 @@ from fuzzysearch.levenshtein_ngram import \
 from fuzzysearch.susbstitutions_only import \
    find_near_matches_substitutions_ngrams as fnm_substitutions_ngrams, \
    find_near_matches_substitutions_linear_programming
+from fuzzysearch.generic_search import \
+    find_near_matches_generic_linear_programming
+import pyximport
+pyximport.install()
+from fuzzysearch._generic_search import \
+    find_near_matches_generic_linear_programming as \
+    find_near_matches_generic_linear_programming_cython
+import random


 def fnm_levenshtein_lp(subsequence, sequence, max_l_dist):
@ -15,32 +23,56 @@ def fnm_substitutions_lp(subsequence, sequence, max_substitutions):
    return list(find_near_matches_substitutions_linear_programming(
        subsequence, sequence, max_substitutions))

+def fnm_generic_lp(subsequence, sequence, max_l_dist):
+    return list(find_near_matches_generic_linear_programming(
+        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist))
+
+def fnm_generic_lp_cython(subsequence, sequence, max_l_dist):
+    return list(find_near_matches_generic_linear_programming_cython(
+        subsequence, sequence, max_l_dist, max_l_dist, max_l_dist, max_l_dist))
+

 search_functions = {
    'levenshtein_lp': fnm_levenshtein_lp,
    'levenshtein_ngrams': fnm_levenshtein_ngrams,
    'substitutions_lp': fnm_substitutions_lp,
    'substitutions_ngrams': fnm_substitutions_ngrams,
+    'generic_lp': fnm_generic_lp,
+    'generic_lp_cython': fnm_generic_lp_cython,
 }

 benchmarks = {
    'dna_no_match': dict(
        subsequence = 'GCTAGCTAGCTA',
-        sequence = '"ATCG" * (10**3)',
+        sequence = "ATCG" * (10**3),
        max_dist = 1,
    ),
+    'dna_no_match2': dict(
+        subsequence = 'ATGATGATG',
+        sequence = 'ATCG' * (10**3),
+        max_dist = 2,
+    ),
+    'random_kevin': dict(
+        subsequence = ''.join(random.choice('ATCG') for _i in xrange(36)),
+        sequence = ''.join(random.choice('ATCG' * 5 + 'N') for _i in xrange(90)),
+        max_dist = 3,
+    ),
 }


-def run_benchmark(search_func_name, benchmark_name):
+def get_benchmark(search_func_name, benchmark_name):
    search_func = search_functions[search_func_name]
    search_args = dict(benchmarks[benchmark_name])

-    if search_func in (fnm_levenshtein_ngrams, fnm_levenshtein_lp):
+    if search_func in (fnm_levenshtein_ngrams, fnm_levenshtein_lp, fnm_generic_lp, fnm_generic_lp_cython):
        search_args['max_l_dist'] = search_args.pop('max_dist')
    elif search_func in (fnm_substitutions_ngrams, fnm_substitutions_lp):
        search_args['max_substitutions'] = search_args.pop('max_dist')
    else:
        raise Exception('Unsupported search function: %r' % search_func)

+    return search_func, search_args
+
+
+def run_benchmark(search_func, search_args):
    return search_func(**search_args)
--- a/benchmarks/main.py
+++ b/benchmarks/main.py
@ -4,4 +4,4 @@ import timeit
 args = sys.argv[1:]
 search_func_name, benchmark_name = args

-timeit.main(['-s', 'from benchmarks import run_benchmark', 'run_benchmark(%r, %r)' % (search_func_name, benchmark_name)])
+timeit.main(['-s', 'from benchmarks import get_benchmark, run_benchmark; search_func, search_args = get_benchmark(%r, %r)' % (search_func_name, benchmark_name), 'run_benchmark(search_func, search_args)'])
--- a/fuzzysearch/_generic_search.pyx
+++ b/fuzzysearch/_generic_search.pyx
@ -1,9 +1,23 @@
 from fuzzysearch.common import Match
+from libc.stdlib cimport malloc, free, realloc
+
+
+__all__ = ['find_near_matches_generic_linear_programming']
+

 cdef struct GenericSearchCandidate:
    int start, subseq_index, l_dist, n_subs, n_ins, n_dels


+ALLOWED_TYPES = (str,)
+try:
+    from Bio.Seq import Seq
+except ImportError:
+    pass
+else:
+    ALLOWED_TYPES += (Seq,)
+
+
 def find_near_matches_generic_linear_programming(subsequence, sequence,
                                                 max_substitutions,
                                                 max_insertions,
@ -22,8 +36,14 @@ def find_near_matches_generic_linear_programming(subsequence, sequence,
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

+    if not isinstance(sequence, ALLOWED_TYPES):
+        raise TypeError('sequence is of invalid type %s' % type(subsequence))
+    if not isinstance(subsequence, ALLOWED_TYPES):
+        raise TypeError('subsequence is of invalid type %s' % type(subsequence))
+
    # optimization: prepare some often used things in advance
-    _subseq_len = len(subsequence)
+    cdef int _subseq_len = len(subsequence)
+    cdef int _subseq_len_minus_one = _subseq_len - 1

    maxes_sum = sum(
        (x if x is not None else 0)
@ -32,125 +52,166 @@ def find_near_matches_generic_linear_programming(subsequence, sequence,
    if max_l_dist is None or max_l_dist >= maxes_sum:
        max_l_dist = maxes_sum

-    cdef GenericSearchCandidate[1000] _candidates1
-    cdef GenericSearchCandidate[1000] _candidates2
-    cdef GenericSearchCandidate* candidates = _candidates1
-    cdef GenericSearchCandidate* new_candidates = _candidates2
+    cdef c_max_l_dist = max_l_dist
+    cdef c_max_substitutions = max_substitutions
+    cdef c_max_insertions = max_insertions
+    cdef c_max_deletions = max_deletions
+
+    cdef alloc_size
+    cdef GenericSearchCandidate* candidates
+    cdef GenericSearchCandidate* new_candidates
    cdef GenericSearchCandidate* _tmp
    cdef GenericSearchCandidate cand
    cdef int n_candidates = 0
    cdef int n_new_candidates = 0
    cdef int n_cand

-    for index, char in enumerate(sequence):
-        candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
-        n_candidates += 1
+    cdef char* c_sequence = sequence
+    cdef char* c_subsequence = subsequence
+    cdef char char

-        for n_cand in xrange(n_candidates):
-            cand = candidates[n_cand]
-            # if this sequence char is the candidate's next expected char
-            if char == subsequence[cand.subseq_index]:
-                # if reached the end of the subsequence, return a match
-                if cand.subseq_index + 1 == _subseq_len:
-                    yield Match(cand.start, index + 1, cand.l_dist)
-                # otherwise, update the candidate's subseq_index and keep it
-                else:
-                    new_candidates[n_new_candidates] = GenericSearchCandidate(
-                        cand.start, cand.subseq_index + 1,
-                        cand.l_dist, cand.n_subs,
-                        cand.n_ins, cand.n_dels,
-                    )
-                    n_new_candidates += 1
+    alloc_size = min(10, _subseq_len * 3 + 1)
+    candidates = <GenericSearchCandidate *> malloc(alloc_size * sizeof(GenericSearchCandidate))
+    if candidates is NULL:
+        raise MemoryError()
+    new_candidates = <GenericSearchCandidate *> malloc(alloc_size * sizeof(GenericSearchCandidate))
+    if candidates is NULL:
+        free(candidates)
+        raise MemoryError()

-            # if this sequence char is *not* the candidate's next expected char
-            else:
-                # we can try skipping a sequence or sub-sequence char (or both),
-                # unless this candidate has already skipped the maximum allowed
-                # number of characters
-                if cand.l_dist == max_l_dist:
-                    continue
+    try:
+        have_realloced = False
+        for index in xrange(len(sequence)):
+            char = c_sequence[index]

-                if cand.n_ins < max_insertions:
-                    # add a candidate skipping a sequence char
-                    new_candidates[n_new_candidates] = GenericSearchCandidate(
-                        cand.start, cand.subseq_index,
-                        cand.l_dist + 1, cand.n_subs,
-                        cand.n_ins + 1, cand.n_dels,
-                    )
-                    n_new_candidates += 1
+            candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
+            n_candidates += 1

-                if cand.subseq_index + 1 < _subseq_len:
-                    if cand.n_subs < max_substitutions:
-                        # add a candidate skipping both a sequence char and a
-                        # subsequence char
+            for n_cand in xrange(n_candidates):
+                cand = candidates[n_cand]
+
+                if n_new_candidates + 4 > alloc_size:
+                    alloc_size += alloc_size // 2
+                    _tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
+                    if _tmp is NULL:
+                        raise MemoryError()
+                    new_candidates = _tmp
+                    have_realloced = True
+
+                # if this sequence char is the candidate's next expected char
+                if char == c_subsequence[cand.subseq_index]:
+                    # if reached the end of the subsequence, return a match
+                    if cand.subseq_index == _subseq_len_minus_one:
+                        yield Match(cand.start, index + 1, cand.l_dist)
+                    # otherwise, update the candidate's subseq_index and keep it
+                    else:
                        new_candidates[n_new_candidates] = GenericSearchCandidate(
                            cand.start, cand.subseq_index + 1,
-                            cand.l_dist + 1, cand.n_subs + 1,
+                            cand.l_dist, cand.n_subs,
                            cand.n_ins, cand.n_dels,
                        )
                        n_new_candidates += 1
-                    elif cand.n_dels < max_deletions and cand.n_ins < max_insertions:
-                        # add a candidate skipping both a sequence char and a
-                        # subsequence char
+
+                # if this sequence char is *not* the candidate's next expected char
+                else:
+                    # we can try skipping a sequence or sub-sequence char (or both),
+                    # unless this candidate has already skipped the maximum allowed
+                    # number of characters
+                    if cand.l_dist == c_max_l_dist:
+                        continue
+
+                    if cand.n_ins < c_max_insertions:
+                        # add a candidate skipping a sequence char
                        new_candidates[n_new_candidates] = GenericSearchCandidate(
-                            cand.start, cand.subseq_index + 1,
+                            cand.start, cand.subseq_index,
                            cand.l_dist + 1, cand.n_subs,
-                            cand.n_ins + 1, cand.n_dels + 1,
+                            cand.n_ins + 1, cand.n_dels,
                        )
                        n_new_candidates += 1
-                else:
-                    # cand.subseq_index == _subseq_len - 1
-                    if (
-                            cand.n_subs < max_substitutions or
-                            (
-                                cand.n_dels < max_deletions and
-                                cand.n_ins < max_insertions
-                            )
-                    ):
-                        yield Match(cand.start, index + 1, cand.l_dist + 1)

-                # try skipping subsequence chars
-                for n_skipped in xrange(1, min(max_deletions - cand.n_dels, max_l_dist - cand.l_dist) + 1):
-                    # if skipping n_dels sub-sequence chars reaches the end
-                    # of the sub-sequence, yield a match
-                    if cand.subseq_index + n_skipped == _subseq_len:
-                        yield Match(cand.start, index + 1,
-                                    cand.l_dist + n_skipped)
-                        break
-                    # otherwise, if skipping n_skipped sub-sequence chars
-                    # reaches a sub-sequence char identical to this sequence
-                    # char ...
-                    elif subsequence[cand.subseq_index + n_skipped] == char:
-                        # if this is the last char of the sub-sequence, yield
-                        # a match
-                        if cand.subseq_index + n_skipped + 1 == _subseq_len:
-                            yield Match(cand.start, index + 1,
-                                        cand.l_dist + n_skipped)
-                        # otherwise add a candidate skipping n_skipped
-                        # subsequence chars
-                        else:
+                    if cand.subseq_index + 1 < _subseq_len:
+                        if cand.n_subs < c_max_substitutions:
+                            # add a candidate skipping both a sequence char and a
+                            # subsequence char
                            new_candidates[n_new_candidates] = GenericSearchCandidate(
-                                cand.start, cand.subseq_index + 1 + n_skipped,
-                                cand.l_dist + n_skipped, cand.n_subs,
-                                cand.n_ins, cand.n_dels + n_skipped,
+                                cand.start, cand.subseq_index + 1,
+                                cand.l_dist + 1, cand.n_subs + 1,
+                                cand.n_ins, cand.n_dels,
                            )
                            n_new_candidates += 1
-                        break
-                # note: if the above loop ends without a break, that means that
-                # no candidate could be added / yielded by skipping sub-sequence
-                # chars
+                        elif cand.n_dels < c_max_deletions and cand.n_ins < c_max_insertions:
+                            # add a candidate skipping both a sequence char and a
+                            # subsequence char
+                            new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                cand.start, cand.subseq_index + 1,
+                                cand.l_dist + 1, cand.n_subs,
+                                cand.n_ins + 1, cand.n_dels + 1,
+                            )
+                            n_new_candidates += 1
+                    else:
+                        # cand.subseq_index == _subseq_len - 1
+                        if (
+                                cand.n_subs < c_max_substitutions or
+                                (
+                                    cand.n_dels < c_max_deletions and
+                                    cand.n_ins < c_max_insertions
+                                )
+                        ):
+                            yield Match(cand.start, index + 1, cand.l_dist + 1)

-        # new_candidates = candidates; candidates = []
-        _tmp = candidates
-        candidates = new_candidates
-        new_candidates = _tmp
-        n_candidates = n_new_candidates
-        n_new_candidates = 0
+                    # try skipping subsequence chars
+                    for n_skipped in xrange(1, min(c_max_deletions - cand.n_dels, c_max_l_dist - cand.l_dist) + 1):
+                        # if skipping n_dels sub-sequence chars reaches the end
+                        # of the sub-sequence, yield a match
+                        if cand.subseq_index + n_skipped == _subseq_len:
+                            yield Match(cand.start, index + 1,
+                                        cand.l_dist + n_skipped)
+                            break
+                        # otherwise, if skipping n_skipped sub-sequence chars
+                        # reaches a sub-sequence char identical to this sequence
+                        # char ...
+                        elif char == c_subsequence[cand.subseq_index + n_skipped]:
+                            # if this is the last char of the sub-sequence, yield
+                            # a match
+                            if cand.subseq_index + n_skipped + 1 == _subseq_len:
+                                yield Match(cand.start, index + 1,
+                                            cand.l_dist + n_skipped)
+                            # otherwise add a candidate skipping n_skipped
+                            # subsequence chars
+                            else:
+                                new_candidates[n_new_candidates] = GenericSearchCandidate(
+                                    cand.start, cand.subseq_index + 1 + n_skipped,
+                                    cand.l_dist + n_skipped, cand.n_subs,
+                                    cand.n_ins, cand.n_dels + n_skipped,
+                                )
+                                n_new_candidates += 1
+                            break
+                    # note: if the above loop ends without a break, that means that
+                    # no candidate could be added / yielded by skipping sub-sequence
+                    # chars

-    for n_cand in xrange(n_candidates):
-        cand = candidates[n_cand]
-        # note: index + 1 == length(sequence)
-        n_skipped = _subseq_len - cand.subseq_index
-        if cand.n_dels + n_skipped <= max_deletions and \
-           cand.l_dist + n_skipped <= max_l_dist:
-            yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
+            # new_candidates = candidates; candidates = []
+            _tmp = candidates
+            candidates = new_candidates
+            new_candidates = _tmp
+            n_candidates = n_new_candidates
+            n_new_candidates = 0
+
+            if have_realloced:
+                have_realloced = False
+                _tmp = <GenericSearchCandidate *>realloc(new_candidates, alloc_size * sizeof(GenericSearchCandidate))
+                if _tmp is NULL:
+                    raise MemoryError()
+                new_candidates = _tmp
+
+        for n_cand in xrange(n_candidates):
+            cand = candidates[n_cand]
+            # note: index + 1 == length(sequence)
+            n_skipped = _subseq_len - cand.subseq_index
+            if cand.n_dels + n_skipped <= c_max_deletions and \
+               cand.l_dist + n_skipped <= c_max_l_dist:
+                yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
+
+    finally:
+        free(candidates)
+        free(new_candidates)