356 lines
13 KiB
Python
356 lines
13 KiB
Python
import re
|
|
|
|
from tests.compat import unittest
|
|
|
|
from fuzzysearch.common import Match, get_best_match_in_group, group_matches, consolidate_overlapping_matches
|
|
from fuzzysearch.levenshtein import find_near_matches_levenshtein, \
|
|
find_near_matches_levenshtein_linear_programming as fnm_levenshtein_lp
|
|
from fuzzysearch.levenshtein_ngram import \
|
|
_expand, _py_expand_short, _expand_long, \
|
|
find_near_matches_levenshtein_ngrams as fnm_levenshtein_ngrams
|
|
|
|
|
|
def longstr(string):
    """Return ``string`` with every run of whitespace removed.

    This lets long test sequences be written as multi-line literals and
    then collapsed into a single unbroken string.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', string)
|
|
|
|
|
|
class TestFuzzySearch(unittest.TestCase):
    """Direct tests of ``fnm_levenshtein_lp`` on small hand-written inputs."""

    def test_empty_sequence(self):
        # Searching an empty sequence yields no matches.
        self.assertEqual(
            list(fnm_levenshtein_lp('PATTERN', '', max_l_dist=0)),
            [],
        )

    def test_empty_subsequence_exeption(self):
        # An empty subsequence is expected to raise ValueError.
        with self.assertRaises(ValueError):
            list(fnm_levenshtein_lp('', 'TEXT', max_l_dist=0))

    def test_match_identical_sequence(self):
        matches = list(fnm_levenshtein_lp('PATTERN', 'PATTERN', max_l_dist=0))
        self.assertEqual(
            matches,
            [Match(start=0, end=len('PATTERN'), dist=0, matched='PATTERN')],
        )

    def test_double_first_item(self):
        sequence = 'abcddefg'
        pattern = 'def'
        matches = list(fnm_levenshtein_lp(pattern, sequence, max_l_dist=1))
        self.assertIn(Match(start=4, end=7, dist=0, matched=pattern), matches)

    def test_missing_second_item(self):
        sequence = 'abcdefg'
        pattern = 'bde'
        matches = list(fnm_levenshtein_lp(pattern, sequence, max_l_dist=1))
        self.assertIn(Match(start=1, end=5, dist=1, matched='bcde'), matches)

    def test_dna_search(self):
        # see: http://stackoverflow.com/questions/19725127/
        # The literal is split on whitespace and re-joined, so its line
        # breaks and indentation do not end up in ``text``.
        text = ''.join('''\
            GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
            CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
            ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
            TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
            '''.split())
        pattern = 'TGCACTGTAGGGATAACAAT'

        matches = list(fnm_levenshtein_lp(pattern, text, max_l_dist=2))

        # assertGreater reports the actual length on failure, unlike
        # assertTrue(len(matches) > 0).
        self.assertGreater(len(matches), 0)
        self.assertIn(Match(start=3, end=24, dist=1, matched=text[3:24]),
                      matches)
|
|
|
|
|
|
class TestExpandBase(object):
    """Shared test cases for the ``expand`` implementations.

    Concrete test classes combine this mixin with ``unittest.TestCase`` and
    set ``expand`` to the function under test.  Every case calls
    ``self.expand(subsequence, sequence, max_l_dist)`` and compares the
    result with an expected 2-tuple.

    NOTE(review): the expected tuples look like (distance, matched length),
    with (None, None) meaning "no match within max_l_dist" -- confirm
    against the implementations.
    """

    expand = None  # override in sub-classes!

    def test_both_empty(self):
        result = self.expand('', '', 0)
        self.assertEqual(result, (0, 0))

    def test_empty_subsequence(self):
        result = self.expand('', 'TEXT', 0)
        self.assertEqual(result, (0, 0))

    def test_empty_sequence(self):
        expectations = [
            (0, (None, None)),
            (6, (None, None)),
            (7, (7, 0)),
            (8, (7, 0)),
        ]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('PATTERN', '', max_l_dist), expected)

    def test_identical(self):
        for max_l_dist in (0, 1, 2):
            self.assertEqual(self.expand('abc', 'abc', max_l_dist), (0, 3))

    def test_first_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'bcd', max_l_dist), expected)

    def test_second_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'acd', max_l_dist), expected)

    def test_second_before_last_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'abd', max_l_dist), expected)

    def test_last_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'abc', max_l_dist), expected)

    def test_completely_different(self):
        result = self.expand('abc', 'def', 0)
        self.assertEqual(result, (None, None))

    def test_startswith(self):
        for max_l_dist in (0, 1, 2):
            self.assertEqual(self.expand('abc', 'abcd', max_l_dist), (0, 3))

    def test_missing_at_start_middle_and_end(self):
        expectations = [
            (0, (None, None)),
            (1, (None, None)),
            (2, (2, 6)),
            (3, (2, 6)),
        ]
        for max_l_dist, expected in expectations:
            self.assertEqual(
                self.expand('abcd', '-ab-cd-', max_l_dist),
                expected,
            )

    def test_no_common_chars(self):
        self.assertEqual(self.expand('abc', 'de', 2), (None, None))
        # For larger distances only the first element of the result is pinned.
        for max_l_dist in (3, 4):
            self.assertEqual(self.expand('abc', 'de', max_l_dist)[0], 3)

    def test_long_needle(self):
        needle = 'abcdefghijklmnop'
        # (sequence, max_l_dist, expected result), checked in this order
        expectations = [
            ('abcdefg-hijk-mnopqrst', 0, (None, None)),
            ('abcdefg-hijk-mnopqrst', 1, (None, None)),
            ('abcdefg-hijk-mnopqrst', 2, (2, 17)),
            ('abcdefg-hijk-mnopqrst', 3, (2, 17)),
            ('abcdefg-hijk-mnop', 3, (2, 17)),
            ('-bcdefg-hijk-mnop', 3, (3, 17)),
            ('-abcdefg-hijk-mnop', 3, (3, 18)),
            ('abc---defg-hijk-mnopqrst', 8, (5, 20)),
        ]
        for sequence, max_l_dist, expected in expectations:
            self.assertEqual(
                self.expand(needle, sequence, max_l_dist),
                expected,
            )
|
|
|
|
|
|
class TestExpand(TestExpandBase, unittest.TestCase):
    """Run the shared expand test cases against ``_expand``."""

    # staticmethod() keeps the function from being bound as a method, so
    # self.expand(...) is called without ``self``.
    expand = staticmethod(_expand)
|
|
|
|
|
|
class TestPyExpandShort(TestExpandBase, unittest.TestCase):
    """Run the shared expand test cases against ``_py_expand_short``."""

    # staticmethod() keeps the function from being bound as a method, so
    # self.expand(...) is called without ``self``.
    expand = staticmethod(_py_expand_short)
|
|
|
|
|
|
try:
    from fuzzysearch._levenshtein_ngrams import c_expand_short
except ImportError:
    # If the module cannot be imported, no test class is defined for it.
    pass
else:
    class TestCExpandShort(TestExpandBase, unittest.TestCase):
        """Run the shared expand test cases against ``c_expand_short``."""

        # staticmethod() keeps the function from being bound as a method.
        expand = staticmethod(c_expand_short)
|
|
|
|
|
|
class TestExpandLong(TestExpandBase, unittest.TestCase):
    """Run the shared expand test cases against ``_expand_long``."""

    # staticmethod() keeps the function from being bound as a method, so
    # self.expand(...) is called without ``self``.
    expand = staticmethod(_expand_long)
|
|
|
|
|
|
class TestFindNearMatchesLevenshteinBase(object):
    """Shared test cases for the Levenshtein near-match search functions.

    Concrete test classes combine this mixin with ``unittest.TestCase`` and
    implement ``search()`` to call the function under test.  The comparisons
    below require ``search()`` to return a list of ``Match`` objects.
    """

    def search(self, subsequence, sequence, max_l_dist):
        # Must be overridden by each concrete test class.
        raise NotImplementedError

    test_cases_data = {
        # name: (needle, haystack, [
        #     (max_l_dist, [(start, end, dist), ...]),
        # ])
        'identical sequence': ('PATTERN', 'PATTERN', [
            (0, [(0, 7, 0)]),
        ]),
        'substring': ('PATTERN', '----------PATTERN---------', [
            (0, [(10, 17, 0)]),
            (1, [(10, 17, 0)]),
            (2, [(10, 17, 0)]),
        ]),
        'double first item': ('def', 'abcddefg', [
            (1, [(4, 7, 0)]),
        ]),
        'double last item': ('def', 'abcdeffg', [
            (1, [(3, 6, 0)]),
        ]),
        'double first items': ('defgh', 'abcdedefghi', [
            (3, [(5, 10, 0)]),
        ]),
        'double last items': ('cdefgh', 'abcdefghghi', [
            (3, [(2, 8, 0)]),
        ]),
        'missing second item': ('bde', 'abcdefg', [
            (1, [(1, 5, 1)]),
        ]),
        'missing second to last item': ('bce', 'abcdefg', [
            (1, [(1, 5, 1)]),
            (2, [(1, 5, 1)]),
        ]),
        'one missing in middle': ('PATTERN', '----------PATERN---------', [
            (0, []),
            (1, [(10, 16, 1)]),
            (2, [(10, 16, 1)]),
        ]),
        'one changed in middle': ('PATTERN', '----------PAT-ERN---------', [
            (0, []),
            (1, [(10, 17, 1)]),
            (2, [(10, 17, 1)]),
        ]),
        'one extra in middle': ('PATTERN', '----------PATT-ERN---------', [
            (0, []),
            (1, [(10, 18, 1)]),
            (2, [(10, 18, 1)]),
        ]),
        'one extra repeating in middle': ('PATTERN', '----------PATTTERN---------', [
            (0, []),
            (1, [(10, 18, 1)]),
            (2, [(10, 18, 1)]),
        ]),
        'one extra repeating at end': ('PATTERN', '----------PATTERNN---------', [
            (0, [(10, 17, 0)]),
            (1, [(10, 17, 0)]),
            (2, [(10, 17, 0)]),
        ]),
        'one missing at end': ('defg', 'abcdef', [
            (1, [(3, 6, 1)]),
        ]),
        'highly repetetive': ('a' * 9, 'a' * 7 + 'xx', [
            (1, []),
            (2, [(0, 9, 2)]),
        ]),
        'DNA search': (
            'TGCACTGTAGGGATAACAAT',
            longstr('''
                GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTG
                GTCACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAAC
                ACACATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGAC
                TGATACTCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
            '''),
            [
                (2, [(3, 24, 1)]),
            ]
        ),
        # see:
        # * BioPython archives from March 14th, 2014
        #   http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
        # * https://github.com/taleinat/fuzzysearch/issues/3
        'protein search 1': (
            'GGGTTLTTSS',
            longstr('''
                XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTLTTSSAAAAAAAAAAA
                AAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''),
            [
                (0, [(42, 52, 0), (99, 109, 0)]),
                (1, [(19, 29, 1), (42, 52, 0), (99, 109, 0)]),
                (2, [(19, 29, 1), (42, 52, 0), (99, 109, 0)]),
            ]
        ),
        'protein search 2': (
            'GGGTTLTTSS',
            longstr('''
                XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTVTTSSAAAAAAAAAAA
                AAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''),
            [
                (0, [(99, 109, 0)]),
                (1, [(19, 29, 1), (42, 52, 1), (99, 109, 0)]),
                (2, [(19, 29, 1), (42, 52, 1), (99, 109, 0)]),
            ]
        ),
        'list of words': (
            "over a lazy dog".split(),
            "the big brown fox jumped over the lazy dog".split(),
            [
                (0, []),
                (1, [(5, 9, 1)]),
                (2, [(5, 9, 1)]),
            ]
        ),
    }

    def test_cases(self):
        # Each named entry of test_cases_data runs in its own sub-test; all
        # of its max_l_dist values are checked inside that sub-test.
        for name, (needle, haystack, per_dist) in self.test_cases_data.items():
            with self.subTest(name=name):
                for max_l_dist, spans in per_dist:
                    found = self.search(needle, haystack,
                                        max_l_dist=max_l_dist)
                    # ``matched`` is the slice of the haystack that each
                    # expected (start, end) pair covers.
                    wanted = [
                        Match(*span, matched=haystack[span[0]:span[1]])
                        for span in spans
                    ]
                    self.assertEqual(found, wanted)

    def test_empty_sequence(self):
        found = self.search('PATTERN', '', max_l_dist=0)
        self.assertEqual(found, [])

    def test_empty_subsequence_exeption(self):
        # An empty subsequence is expected to raise ValueError.
        with self.assertRaises(ValueError):
            self.search('', 'TEXT', max_l_dist=0)

    def test_all_different(self):
        # No match is expected while max_l_dist is below the needle length.
        for max_l_dist in range(4):
            found = self.search('AAAA', 'ZZZZ', max_l_dist)
            self.assertEqual(found, [])

        # At max_l_dist=4 at least one match is expected, all at distance 4.
        matches = self.search('AAAA', 'ZZZZ', max_l_dist=4)
        self.assertGreater(len(matches), 0)
        self.assertTrue(all(m.dist == 4 for m in matches))
|
|
|
|
|
|
class TestFindNearMatchesLevenshteinNgrams(TestFindNearMatchesLevenshteinBase,
                                           unittest.TestCase):
    """Run the shared search test cases against ``fnm_levenshtein_ngrams``."""

    def search(self, subsequence, sequence, max_l_dist):
        # Cases where max_l_dist is at least the subsequence length are
        # skipped instead of being run through the n-gram search.
        if len(subsequence) <= max_l_dist:
            self.skipTest(
                'skipping ngram search with max_l_dist >= len(subsequence)')
        raw_matches = fnm_levenshtein_ngrams(subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(raw_matches)
|
|
|
|
|
|
class TestFindNearMatchesLevenshteinLP(TestFindNearMatchesLevenshteinBase,
                                       unittest.TestCase):
    """Run the shared search test cases against ``fnm_levenshtein_lp``."""

    def search(self, subsequence, sequence, max_l_dist):
        # The raw results are passed through consolidate_overlapping_matches
        # before being compared with the expected matches.
        raw_matches = fnm_levenshtein_lp(subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(raw_matches)
|
|
|
|
|
|
class TestFindNearMatchesLevenshtein(TestFindNearMatchesLevenshteinBase,
                                     unittest.TestCase):
    """Run the shared search test cases against
    ``find_near_matches_levenshtein``."""

    def search(self, subsequence, sequence, max_l_dist):
        # The raw results are passed through consolidate_overlapping_matches
        # before being compared with the expected matches.
        raw_matches = find_near_matches_levenshtein(
            subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(raw_matches)
|