first working Cython version of generic search
This commit is contained in:
parent
8ba0cc8dc3
commit
2863b57236
|
@ -0,0 +1,156 @@
|
|||
from fuzzysearch.common import Match
|
||||
|
||||
cdef struct GenericSearchCandidate:
    # NOTE: field order matters, the search code builds candidates
    # positionally as (start, subseq_index, l_dist, n_subs, n_ins, n_dels).

    # index in the sequence at which this candidate match begins
    int start
    # index of the next sub-sequence item this candidate expects to see
    int subseq_index
    # distance accumulated so far (reported as the `dist` of a match)
    int l_dist
    # number of substitutions, insertions and deletions used so far
    int n_subs
    int n_ins
    int n_dels
|
||||
|
||||
|
||||
cdef inline int _require_free_slot(int n_used, int capacity) except -1:
    # Guard for the fixed-size candidate buffers: writing past their end
    # would silently corrupt memory, so fail loudly instead.
    if n_used >= capacity:
        raise RuntimeError(
            'Too many simultaneous match candidates (limit: %d)' % capacity)
    return 0


def find_near_matches_generic_linear_programming(subsequence, sequence,
                                                 max_substitutions,
                                                 max_insertions,
                                                 max_deletions,
                                                 max_l_dist=None):
    """Search for near-matches of subsequence in sequence.

    This is a generator. The nearly-matching parts of the sequence must
    meet all of the following limitations (relative to the subsequence):

    * max_substitutions: the maximum allowed number of substitutions
    * max_insertions: the maximum allowed number of new items inserted
    * max_deletions: the maximum allowed number of deleted items
    * max_l_dist: the maximum allowed total number of substitutions,
      insertions and deletions

    Any of the three per-operation limits may be None, which is treated
    as 0.  If max_l_dist is None, or is not smaller than the sum of the
    three per-operation limits, that sum is used instead.

    Yields Match(start, end, dist) objects, where `end` is one past the
    index of the last matched sequence item.

    Raises ValueError if subsequence is empty, and RuntimeError if more
    candidates are alive at once than the fixed-size buffers can hold.
    """
    # Two fixed-size buffers used in a ping-pong fashion: `candidates`
    # holds the partial matches alive before the current sequence item,
    # `new_candidates` collects those still alive after it.
    cdef GenericSearchCandidate[1000] _candidates1
    cdef GenericSearchCandidate[1000] _candidates2
    cdef GenericSearchCandidate* candidates = _candidates1
    cdef GenericSearchCandidate* new_candidates = _candidates2
    cdef GenericSearchCandidate* _tmp
    cdef GenericSearchCandidate cand
    # must equal the declared size of the two buffers above
    cdef int capacity = 1000
    cdef int n_candidates = 0
    cdef int n_new_candidates = 0
    cdef int n_cand

    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # optimization: prepare some often used things in advance
    _subseq_len = len(subsequence)

    # Normalize None limits to 0 once, so that the comparisons and the
    # subtraction below never operate on None.
    if max_substitutions is None:
        max_substitutions = 0
    if max_insertions is None:
        max_insertions = 0
    if max_deletions is None:
        max_deletions = 0

    maxes_sum = max_substitutions + max_insertions + max_deletions
    if max_l_dist is None or max_l_dist >= maxes_sum:
        max_l_dist = maxes_sum

    for index, char in enumerate(sequence):
        # every sequence position may be the start of a new match
        _require_free_slot(n_candidates, capacity)
        candidates[n_candidates] = GenericSearchCandidate(index, 0, 0, 0, 0, 0)
        n_candidates += 1

        for n_cand in range(n_candidates):
            cand = candidates[n_cand]

            # if this sequence char is the candidate's next expected char
            if char == subsequence[cand.subseq_index]:
                # if reached the end of the subsequence, return a match
                if cand.subseq_index + 1 == _subseq_len:
                    yield Match(cand.start, index + 1, cand.l_dist)
                # otherwise, update the candidate's subseq_index and keep it
                else:
                    _require_free_slot(n_new_candidates, capacity)
                    new_candidates[n_new_candidates] = GenericSearchCandidate(
                        cand.start, cand.subseq_index + 1,
                        cand.l_dist, cand.n_subs,
                        cand.n_ins, cand.n_dels,
                    )
                    n_new_candidates += 1

            # if this sequence char is *not* the candidate's next expected char
            else:
                # we can try skipping a sequence or sub-sequence char (or
                # both), unless this candidate has already skipped the
                # maximum allowed number of characters
                if cand.l_dist == max_l_dist:
                    continue

                if cand.n_ins < max_insertions:
                    # add a candidate skipping a sequence char
                    _require_free_slot(n_new_candidates, capacity)
                    new_candidates[n_new_candidates] = GenericSearchCandidate(
                        cand.start, cand.subseq_index,
                        cand.l_dist + 1, cand.n_subs,
                        cand.n_ins + 1, cand.n_dels,
                    )
                    n_new_candidates += 1

                if cand.subseq_index + 1 < _subseq_len:
                    if cand.n_subs < max_substitutions:
                        # add a candidate skipping both a sequence char and
                        # a subsequence char, counted as a substitution
                        _require_free_slot(n_new_candidates, capacity)
                        new_candidates[n_new_candidates] = GenericSearchCandidate(
                            cand.start, cand.subseq_index + 1,
                            cand.l_dist + 1, cand.n_subs + 1,
                            cand.n_ins, cand.n_dels,
                        )
                        n_new_candidates += 1
                    elif (cand.n_dels < max_deletions and
                          cand.n_ins < max_insertions):
                        # add a candidate skipping both a sequence char and
                        # a subsequence char, counted as one insertion plus
                        # one deletion (the distance still grows by one)
                        _require_free_slot(n_new_candidates, capacity)
                        new_candidates[n_new_candidates] = GenericSearchCandidate(
                            cand.start, cand.subseq_index + 1,
                            cand.l_dist + 1, cand.n_subs,
                            cand.n_ins + 1, cand.n_dels + 1,
                        )
                        n_new_candidates += 1
                else:
                    # cand.subseq_index == _subseq_len - 1
                    if (
                            cand.n_subs < max_substitutions or
                            (
                                cand.n_dels < max_deletions and
                                cand.n_ins < max_insertions
                            )
                    ):
                        yield Match(cand.start, index + 1, cand.l_dist + 1)

                # try skipping subsequence chars
                max_skippable = min(max_deletions - cand.n_dels,
                                    max_l_dist - cand.l_dist)
                for n_skipped in range(1, max_skippable + 1):
                    # if skipping n_skipped sub-sequence chars reaches the
                    # end of the sub-sequence, yield a match
                    if cand.subseq_index + n_skipped == _subseq_len:
                        yield Match(cand.start, index + 1,
                                    cand.l_dist + n_skipped)
                        break
                    # otherwise, if skipping n_skipped sub-sequence chars
                    # reaches a sub-sequence char identical to this sequence
                    # char ...
                    elif subsequence[cand.subseq_index + n_skipped] == char:
                        # if this is the last char of the sub-sequence,
                        # yield a match
                        if cand.subseq_index + n_skipped + 1 == _subseq_len:
                            yield Match(cand.start, index + 1,
                                        cand.l_dist + n_skipped)
                        # otherwise add a candidate skipping n_skipped
                        # subsequence chars
                        else:
                            _require_free_slot(n_new_candidates, capacity)
                            new_candidates[n_new_candidates] = GenericSearchCandidate(
                                cand.start, cand.subseq_index + 1 + n_skipped,
                                cand.l_dist + n_skipped, cand.n_subs,
                                cand.n_ins, cand.n_dels + n_skipped,
                            )
                            n_new_candidates += 1
                        break
                # note: if the above loop ends without a break, that means
                # that no candidate could be added / yielded by skipping
                # sub-sequence chars

        # swap the buffers: the new candidates become the current ones and
        # the old buffer is reused (logically emptied) for the next item
        _tmp = candidates
        candidates = new_candidates
        new_candidates = _tmp
        n_candidates = n_new_candidates
        n_new_candidates = 0

    # the sequence is exhausted: a remaining candidate still matches if all
    # of its not-yet-matched sub-sequence chars can be counted as deletions
    for n_cand in range(n_candidates):
        cand = candidates[n_cand]
        # note: index + 1 == length(sequence)
        n_skipped = _subseq_len - cand.subseq_index
        if cand.n_dels + n_skipped <= max_deletions and \
           cand.l_dist + n_skipped <= max_l_dist:
            yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
|
6
setup.py
6
setup.py
|
@ -3,7 +3,7 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from Cython.Build import cythonize
|
||||
|
||||
try:
|
||||
from setuptools import setup
|
||||
|
@ -29,9 +29,9 @@ setup(
|
|||
'fuzzysearch',
|
||||
],
|
||||
package_dir={'fuzzysearch': 'fuzzysearch'},
|
||||
ext_modules=cythonize("fuzzysearch/_generic_search.pyx"),
|
||||
include_package_data=True,
|
||||
install_requires=[
|
||||
],
|
||||
install_requires=[],
|
||||
use_2to3=True,
|
||||
license="MIT",
|
||||
zip_safe=False,
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from tests.compat import unittest
|
||||
from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
|
||||
from fuzzysearch.common import Match, get_best_match_in_group, group_matches
|
||||
from tests.test_substitutions_only import TestSubstitionsOnlyBase
|
||||
from fuzzysearch.generic_search import \
|
||||
find_near_matches_generic_linear_programming as fnm_generic_lp
|
||||
from tests.test_substitutions_only import TestSubstitionsOnlyBase
|
||||
|
||||
|
||||
class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
from tests.compat import unittest
|
||||
from tests.test_levenshtein import TestFindNearMatchesLevenshteinBase
|
||||
from fuzzysearch.common import Match, get_best_match_in_group, group_matches
|
||||
from tests.test_substitutions_only import TestSubstitionsOnlyBase
|
||||
|
||||
import pyximport
|
||||
pyimporter, pyximporter = pyximport.install()
|
||||
from fuzzysearch._generic_search import \
|
||||
find_near_matches_generic_linear_programming as fnm_generic_lp
|
||||
pyximport.uninstall(pyimporter, pyximporter)
|
||||
|
||||
|
||||
class TestGenericSearchAsLevenshtein(TestFindNearMatchesLevenshteinBase,
                                     unittest.TestCase):
    """Exercise the generic search through the Levenshtein base test class.

    NOTE(review): presumably the base class supplies test cases which call
    self.search() -- confirm against tests.test_levenshtein.
    """

    def search(self, subsequence, sequence, max_l_dist):
        """Return the best match of each group of generic-search matches.

        The same max_l_dist value is used for every one of the search's
        limits (substitutions, insertions, deletions and total distance).
        """
        limits = (max_l_dist,) * 4
        all_matches = fnm_generic_lp(subsequence, sequence, *limits)
        return list(map(get_best_match_in_group, group_matches(all_matches)))
|
||||
|
||||
|
||||
class TestGenericSearchAsSubstitutionsOnly(TestSubstitionsOnlyBase,
                                           unittest.TestCase):
    """Exercise the generic search through the substitutions-only base class.

    NOTE(review): presumably the base class supplies test cases which call
    self.search() -- confirm against tests.test_substitutions_only.
    """

    def search(self, subsequence, sequence, max_subs):
        """Return all matches found when only substitutions are allowed."""
        # insertions and deletions are both limited to zero
        no_insertions = 0
        no_deletions = 0
        found = fnm_generic_lp(subsequence, sequence, max_subs,
                               no_insertions, no_deletions, max_subs)
        return list(found)
|
||||
|
||||
|
||||
class TestGenericSearch(unittest.TestCase):
    """Direct tests of the generic search with separate per-operation limits."""

    def search(self, pattern, sequence, max_subs, max_ins, max_dels,
               max_l_dist=None):
        """Return all matches yielded by the generic search, as a list."""
        found = fnm_generic_lp(pattern, sequence, max_subs,
                               max_ins, max_dels, max_l_dist)
        return list(found)

    def test_empty_sequence(self):
        found = self.search('PATTERN', '', 0, 0, 0)
        self.assertEqual([], found)

    def test_empty_subsequence_exeption(self):
        # an empty pattern must be rejected with a ValueError
        self.assertRaises(ValueError, self.search, '', 'TEXT', 0, 0, 0)

    def test_match_identical_sequence(self):
        pattern = 'PATTERN'
        self.assertEqual(
            [Match(start=0, end=len(pattern), dist=0)],
            self.search(pattern, pattern, 0, 0, 0),
        )

    def test_substring(self):
        self.assertEqual(
            [Match(start=10, end=17, dist=0)],
            self.search('PATTERN', 'aaaaaaaaaaPATTERNaaaaaaaaa', 0, 0, 0),
        )

    def test_double_first_item(self):
        # the sequence contains the pattern's first item twice in a row
        pattern = 'def'
        sequence = 'abcddefg'
        exact_match = Match(start=4, end=7, dist=0)

        # no differences allowed
        self.assertEqual(
            [exact_match],
            self.search(pattern, sequence, 0, 0, 0),
        )

        # one substitution allowed
        self.assertEqual(
            [exact_match],
            self.search(pattern, sequence, 1, 0, 0),
        )

        # one insertion allowed
        self.assertListEqual(
            [Match(start=3, end=7, dist=1), exact_match],
            self.search(pattern, sequence, 0, 1, 0),
        )

        # one deletion allowed
        self.assertIn(
            exact_match,
            self.search(pattern, sequence, 0, 0, 1),
        )

        # one insertion allowed, but a total distance of zero
        self.assertEqual(
            [exact_match],
            self.search(pattern, sequence, 0, 1, 0, 0),
        )

    def test_missing_second_item(self):
        # the sequence has an extra item between the pattern's first and
        # second items
        pattern = 'bde'
        sequence = 'abcdefg'
        via_insertion = Match(start=1, end=5, dist=1)
        via_substitution = Match(start=2, end=5, dist=1)
        via_deletion = Match(start=3, end=5, dist=1)

        self.assertEqual(
            self.search(pattern, sequence, 0, 1, 0),
            [via_insertion],
        )

        self.assertEqual(
            self.search(pattern, sequence, 0, 0, 0),
            [],
        )

        self.assertEqual(
            self.search(pattern, sequence, 1, 0, 0),
            [via_substitution],
        )

        self.assertEqual(
            self.search(pattern, sequence, 0, 0, 1),
            [via_deletion],
        )

        self.assertListEqual(
            self.search(pattern, sequence, 1, 1, 1, 1),
            [via_insertion, via_substitution, via_deletion],
        )

        # with a larger total distance, at least these must be found
        required = set([
            via_insertion,
            via_substitution,
            via_deletion,
            Match(start=2, end=5, dist=3),
        ])
        self.assertTrue(
            required.issubset(set(
                self.search(pattern, sequence, 1, 1, 1, 3),
            ))
        )

    def test_argument_handling(self):
        # check that no exception is raised when some values are None
        limit_combinations = [
            (0, None, None, None),
            (None, 0, None, None),
            (None, None, 0, None),
            (None, None, None, 0),
        ]
        for limits in limit_combinations:
            self.assertEqual(
                self.search('a', 'b', *limits),
                [],
            )
|
Loading…
Reference in New Issue