improved result filtering and refactored tests
This commit is contained in:
parent
cf5bbd6269
commit
a2f10110bc
|
@ -175,11 +175,48 @@ def find_near_matches_with_ngrams(subsequence, sequence, max_l_dist):
|
|||
if dist_right is None:
|
||||
continue
|
||||
assert dist_left + dist_right <= max_l_dist
|
||||
#matches.append(_get_best_match(subsequence, sequence[index - ngram.start - max_l_dist:index - ngram.start + len(subsequence) + max_l_dist], max_l_dist))
|
||||
|
||||
matches.append(Match(
|
||||
start=index - left_expand_size,
|
||||
end=index + ngram_len + right_expand_size,
|
||||
dist=dist_left + dist_right,
|
||||
))
|
||||
|
||||
return sorted(set(matches))
|
||||
# don't return overlapping matches; instead, group overlapping matches
|
||||
# together and return the best match from each group
|
||||
match_groups = group_matches(matches)
|
||||
best_matches = [get_best_match_in_group(group) for group in match_groups]
|
||||
return sorted(best_matches)
|
||||
|
||||
|
||||
class Group(object):
    """A collection of overlapping matches together with the span they cover.

    ``start`` is the smallest ``start`` and ``end`` the largest ``end`` of
    all matches added to the group so far.
    """

    def __init__(self, match):
        """Create a group whose only member is ``match``."""
        self.start = match.start
        self.end = match.end
        self.matches = set([match])

    def is_match_in_group(self, match):
        """Return True if ``match`` is a member of, or overlaps, this group.

        A match that merely touches the group's boundary (its ``end`` equals
        the group's ``start``, or its ``start`` equals the group's ``end``)
        is not considered overlapping.
        """
        # The explicit membership test covers matches the span comparison
        # alone would reject, e.g. a zero-length match identical to one
        # already in the group.
        return match in self.matches or \
            not (match.end <= self.start or match.start >= self.end)

    def add_match(self, match):
        """Add ``match`` to the group, widening the span to cover it."""
        self.matches.add(match)
        self.start = min(self.start, match.start)
        self.end = max(self.end, match.end)

    def merge(self, other):
        """Absorb every match of the group ``other`` into this group."""
        self.matches.update(other.matches)
        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)


def group_matches(matches):
    """Partition ``matches`` into groups of overlapping matches.

    Each match is placed in the group it overlaps. When a match overlaps
    several existing groups, it connects them, so those groups are merged
    into a single one; otherwise the returned groups could still overlap
    each other.

    :param matches: iterable of hashable objects with ``start`` and ``end``
        attributes.
    :return: a list of sets of matches, one set per group. A merged group
        keeps the list position of the earliest-created group it absorbed.
    """
    groups = []
    for match in matches:
        overlapping = [
            group for group in groups if group.is_match_in_group(match)
        ]
        if not overlapping:
            groups.append(Group(match))
            continue

        target = overlapping[0]
        target.add_match(match)
        # The match bridges all the remaining overlapping groups to the
        # first one, so fold them into it and drop them from the list.
        for absorbed in overlapping[1:]:
            target.merge(absorbed)
            groups.remove(absorbed)
    return [group.matches for group in groups]
|
||||
|
||||
|
||||
def get_best_match_in_group(group):
    """Return the best match from a group of overlapping matches.

    The best match is the one with the smallest distance; among those, the
    longest one; and among those, the one starting earliest.

    :param group: non-empty iterable of objects with ``start``, ``end`` and
        ``dist`` attributes.
    :return: the selected match.
    :raises ValueError: if ``group`` is empty.
    """
    # The start index is the final tie-breaker so that the choice does not
    # depend on the iteration order of ``group`` (groups are sets).
    return min(
        group,
        key=lambda match: (
            match.dist,
            -(match.end - match.start),
            match.start,
        ),
    )
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from tests.compat import unittest
|
||||
from fuzzysearch.fuzzysearch import find_near_matches, Match, _expand,\
|
||||
find_near_matches_with_ngrams
|
||||
find_near_matches_with_ngrams, get_best_match_in_group, group_matches
|
||||
|
||||
|
||||
class TestFuzzySearch(unittest.TestCase):
|
||||
|
@ -74,7 +74,23 @@ class TestExpand(unittest.TestCase):
|
|||
self.assertEquals((1, 3), _expand('abcd', 'abd', 2))
|
||||
|
||||
|
||||
class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
||||
class TestFuzzySearchBase(object):
|
||||
def search(self, subsequence, sequence, max_l_dist):
|
||||
raise NotImplementedError
|
||||
|
||||
def test_empty_sequence(self):
|
||||
self.assertEquals([], self.search('PATTERN', '', max_l_dist=0))
|
||||
|
||||
def test_empty_subsequence_exeption(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self.search('', 'TEXT', max_l_dist=0)
|
||||
|
||||
def test_match_identical_sequence(self):
|
||||
self.assertEquals(
|
||||
[Match(start=0, end=len('PATTERN'), dist=0)],
|
||||
self.search('PATTERN', 'PATTERN', max_l_dist=0),
|
||||
)
|
||||
|
||||
def test_substring(self):
|
||||
substring = 'PATTERN'
|
||||
text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
|
||||
|
@ -83,15 +99,51 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_double_first_item(self):
|
||||
self.assertEquals(
|
||||
[Match(start=4, end=7, dist=0)],
|
||||
self.search('def', 'abcddefg', max_l_dist=1),
|
||||
)
|
||||
|
||||
def test_double_last_item(self):
|
||||
self.assertEquals(
|
||||
[Match(start=3, end=6, dist=0)],
|
||||
self.search('def', 'abcdeffg', max_l_dist=1),
|
||||
)
|
||||
|
||||
def test_double_first_items(self):
|
||||
self.assertEquals(
|
||||
[Match(start=5, end=10, dist=0)],
|
||||
self.search('defgh', 'abcdedefghi', max_l_dist=3),
|
||||
)
|
||||
|
||||
def test_double_last_items(self):
|
||||
self.assertEquals(
|
||||
[Match(start=3, end=8, dist=0)],
|
||||
self.search('defgh', 'abcdefghghi', max_l_dist=3),
|
||||
)
|
||||
|
||||
def test_missing_second_item(self):
|
||||
self.assertEquals(
|
||||
[Match(start=1, end=5, dist=1)],
|
||||
self.search('bde', 'abcdefg', max_l_dist=1),
|
||||
)
|
||||
|
||||
def test_missing_second_to_last_item(self):
|
||||
self.assertEquals(
|
||||
[Match(start=1, end=5, dist=1)],
|
||||
self.search('bce', 'abcdefg', max_l_dist=1),
|
||||
)
|
||||
|
||||
def test_one_missing_in_middle(self):
|
||||
|
@ -101,15 +153,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_one_changed_in_middle(self):
|
||||
|
@ -119,15 +171,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_one_extra_in_middle(self):
|
||||
|
@ -137,15 +189,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_one_extra_repeating_in_middle(self):
|
||||
|
@ -155,15 +207,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_one_extra_repeating_at_end(self):
|
||||
|
@ -173,13 +225,43 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase):
|
|||
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=0)
|
||||
self.search(substring, text, max_l_dist=0)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=1)
|
||||
self.search(substring, text, max_l_dist=1)
|
||||
)
|
||||
self.assertEquals(
|
||||
[expected_match],
|
||||
find_near_matches_with_ngrams(substring, text, max_l_dist=2)
|
||||
self.search(substring, text, max_l_dist=2)
|
||||
)
|
||||
|
||||
def test_dna_search(self):
|
||||
# see: http://stackoverflow.com/questions/19725127/
|
||||
text = ''.join('''\
|
||||
GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
|
||||
CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
|
||||
ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
|
||||
TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
|
||||
'''.split())
|
||||
pattern = 'TGCACTGTAGGGATAACAAT'
|
||||
|
||||
self.assertEquals(
|
||||
[Match(start=3, end=24, dist=1)],
|
||||
self.search(pattern, text, max_l_dist=2),
|
||||
)
|
||||
|
||||
|
||||
class TestFindNearMatchesWithNgrams(TestFuzzySearchBase, unittest.TestCase):
    """Run the shared TestFuzzySearchBase cases on the n-gram based search."""

    def search(self, subsequence, sequence, max_l_dist):
        """Return the result of find_near_matches_with_ngrams() unchanged."""
        found = find_near_matches_with_ngrams(
            subsequence, sequence, max_l_dist)
        return found
|
||||
|
||||
|
||||
class TestFindNearMatches(TestFuzzySearchBase, unittest.TestCase):
    """Run the shared TestFuzzySearchBase cases on find_near_matches()."""

    def search(self, subsequence, sequence, max_l_dist):
        """Search with find_near_matches() and keep one match per group.

        The raw matches are grouped with group_matches() and reduced to the
        best match of each group with get_best_match_in_group().
        """
        raw_matches = find_near_matches(subsequence, sequence, max_l_dist)
        best_matches = []
        for overlapping in group_matches(raw_matches):
            best_matches.append(get_best_match_in_group(overlapping))
        return best_matches
|
||||
|
|
Loading…
Reference in New Issue