class Group(object):
    """A set of matches plus the half-open span ``[start, end)`` they cover.

    ``start`` is the smallest ``match.start`` and ``end`` the largest
    ``match.end`` over every match added to the group so far.
    """

    def __init__(self, match):
        """Create a group containing only ``match``."""
        self.start = match.start
        self.end = match.end
        self.matches = set([match])

    def is_match_in_group(self, match):
        """Return True if ``match`` is already in the group or overlaps it.

        Spans are treated as half-open, so a match that merely touches the
        group's boundary (``match.end == self.start`` or
        ``match.start == self.end``) does not overlap.
        """
        if match in self.matches:
            return True
        return match.start < self.end and match.end > self.start

    def add_match(self, match):
        """Add ``match`` to the group and widen the span to include it."""
        self.matches.add(match)
        self.start = min(self.start, match.start)
        self.end = max(self.end, match.end)


def group_matches(matches):
    """Partition ``matches`` into groups of transitively overlapping matches.

    Each item must be hashable and expose ``start`` and ``end`` attributes.
    Returns a list of sets, ordered by the position of each group in the
    sequence; no two returned groups overlap each other.

    The matches are de-duplicated and processed in order of position.  This
    guarantees that a match can only ever overlap the most recently created
    group, so a match which bridges two earlier matches joins them into one
    group instead of leaving two overlapping groups behind (which is what
    happens when matches are assigned greedily in arbitrary order).
    """
    groups = []
    ordered = sorted(set(matches), key=lambda match: (match.start, match.end))
    for match in ordered:
        if groups and groups[-1].is_match_in_group(match):
            groups[-1].add_match(match)
        else:
            groups.append(Group(match))
    return [group.matches for group in groups]


def get_best_match_in_group(group):
    """Return the best match from ``group``, an iterable of matches.

    The best match is the longest one amongst those with the smallest
    distance.  Remaining ties are broken by the earliest start so that the
    result does not depend on the iteration order of ``group``.

    Raises ValueError if ``group`` is empty.
    """
    return min(
        group,
        key=lambda match: (
            match.dist,
            -(match.end - match.start),
            match.start,
        ),
    )
fuzzysearch.fuzzysearch import find_near_matches, Match, _expand,\ - find_near_matches_with_ngrams + find_near_matches_with_ngrams, get_best_match_in_group, group_matches class TestFuzzySearch(unittest.TestCase): @@ -74,7 +74,23 @@ class TestExpand(unittest.TestCase): self.assertEquals((1, 3), _expand('abcd', 'abd', 2)) -class TestFindNearMatchesWithNgrams(unittest.TestCase): +class TestFuzzySearchBase(object): + def search(self, subsequence, sequence, max_l_dist): + raise NotImplementedError + + def test_empty_sequence(self): + self.assertEquals([], self.search('PATTERN', '', max_l_dist=0)) + + def test_empty_subsequence_exeption(self): + with self.assertRaises(ValueError): + self.search('', 'TEXT', max_l_dist=0) + + def test_match_identical_sequence(self): + self.assertEquals( + [Match(start=0, end=len('PATTERN'), dist=0)], + self.search('PATTERN', 'PATTERN', max_l_dist=0), + ) + def test_substring(self): substring = 'PATTERN' text = 'aaaaaaaaaaPATTERNaaaaaaaaa' @@ -83,15 +99,51 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + self.search(substring, text, max_l_dist=2) + ) + + def test_double_first_item(self): + self.assertEquals( + [Match(start=4, end=7, dist=0)], + self.search('def', 'abcddefg', max_l_dist=1), + ) + + def test_double_last_item(self): + self.assertEquals( + [Match(start=3, end=6, dist=0)], + self.search('def', 'abcdeffg', max_l_dist=1), + ) + + def test_double_first_items(self): + self.assertEquals( + [Match(start=5, end=10, dist=0)], + self.search('defgh', 'abcdedefghi', max_l_dist=3), + ) + + def test_double_last_items(self): + 
self.assertEquals( + [Match(start=3, end=8, dist=0)], + self.search('defgh', 'abcdefghghi', max_l_dist=3), + ) + + def test_missing_second_item(self): + self.assertEquals( + [Match(start=1, end=5, dist=1)], + self.search('bde', 'abcdefg', max_l_dist=1), + ) + + def test_missing_second_to_last_item(self): + self.assertEquals( + [Match(start=1, end=5, dist=1)], + self.search('bce', 'abcdefg', max_l_dist=1), ) def test_one_missing_in_middle(self): @@ -101,15 +153,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + self.search(substring, text, max_l_dist=2) ) def test_one_changed_in_middle(self): @@ -119,15 +171,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + self.search(substring, text, max_l_dist=2) ) def test_one_extra_in_middle(self): @@ -137,15 +189,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + 
self.search(substring, text, max_l_dist=2) ) def test_one_extra_repeating_in_middle(self): @@ -155,15 +207,15 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + self.search(substring, text, max_l_dist=2) ) def test_one_extra_repeating_at_end(self): @@ -173,13 +225,43 @@ class TestFindNearMatchesWithNgrams(unittest.TestCase): self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=0) + self.search(substring, text, max_l_dist=0) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=1) + self.search(substring, text, max_l_dist=1) ) self.assertEquals( [expected_match], - find_near_matches_with_ngrams(substring, text, max_l_dist=2) + self.search(substring, text, max_l_dist=2) ) + + def test_dna_search(self): + # see: http://stackoverflow.com/questions/19725127/ + text = ''.join('''\ + GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT + CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC + ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC + TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG + '''.split()) + pattern = 'TGCACTGTAGGGATAACAAT' + + self.assertEquals( + [Match(start=3, end=24, dist=1)], + self.search(pattern, text, max_l_dist=2), + ) + + +class TestFindNearMatchesWithNgrams(TestFuzzySearchBase, unittest.TestCase): + def search(self, subsequence, sequence, max_l_dist): + return find_near_matches_with_ngrams(subsequence, sequence, max_l_dist) + + +class TestFindNearMatches(TestFuzzySearchBase, unittest.TestCase): + def search(self, subsequence, sequence, 
max_l_dist): + return [ + get_best_match_in_group(group) + for group in group_matches( + find_near_matches(subsequence, sequence, max_l_dist) + ) + ]