# fuzzysearch/tests/test_levenshtein.py
#
# 356 lines
# 13 KiB
# Python

import re
from tests.compat import unittest
from fuzzysearch.common import Match, get_best_match_in_group, group_matches, consolidate_overlapping_matches
from fuzzysearch.levenshtein import find_near_matches_levenshtein, \
find_near_matches_levenshtein_linear_programming as fnm_levenshtein_lp
from fuzzysearch.levenshtein_ngram import \
_expand, _py_expand_short, _expand_long, \
find_near_matches_levenshtein_ngrams as fnm_levenshtein_ngrams
def longstr(string):
    """Return ``string`` with every run of whitespace removed.

    This lets long test sequences be written as multi-line literals and
    then collapsed into a single contiguous string.
    """
    whitespace_runs = re.compile(r'\s+')
    return whitespace_runs.sub('', string)
class TestFuzzySearch(unittest.TestCase):
    """Direct tests of ``find_near_matches_levenshtein_linear_programming``.

    The function is imported in this module under the alias
    ``fnm_levenshtein_lp``; its results are materialized with ``list()``
    before being inspected.
    """

    def test_empty_sequence(self):
        """Searching in an empty sequence yields no matches."""
        self.assertEqual(
            list(fnm_levenshtein_lp('PATTERN', '', max_l_dist=0)),
            [],
        )

    def test_empty_subsequence_exeption(self):
        """Searching for an empty subsequence raises ``ValueError``."""
        with self.assertRaises(ValueError):
            list(fnm_levenshtein_lp('', 'TEXT', max_l_dist=0))

    def test_match_identical_sequence(self):
        """A sequence identical to the pattern is one exact match."""
        matches = list(fnm_levenshtein_lp('PATTERN', 'PATTERN', max_l_dist=0))
        self.assertEqual(
            matches,
            [Match(start=0, end=len('PATTERN'), dist=0, matched='PATTERN')],
        )

    def test_double_first_item(self):
        """The exact occurrence is reported even after a doubled first item."""
        sequence = 'abcddefg'
        pattern = 'def'
        matches = list(fnm_levenshtein_lp(pattern, sequence, max_l_dist=1))
        self.assertIn(Match(start=4, end=7, dist=0, matched=pattern), matches)

    def test_missing_second_item(self):
        """A pattern missing one item of the text matches at distance 1."""
        sequence = 'abcdefg'
        pattern = 'bde'
        matches = list(fnm_levenshtein_lp(pattern, sequence, max_l_dist=1))
        self.assertIn(Match(start=1, end=5, dist=1, matched='bcde'), matches)

    def test_dna_search(self):
        """Find a DNA pattern inside a longer sequence at distance <= 2."""
        # see: http://stackoverflow.com/questions/19725127/
        # The literal is laid out over several lines for readability;
        # longstr() strips all whitespace to rebuild the contiguous text.
        text = longstr('''
            GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
            CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
            ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
            TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
        ''')
        pattern = 'TGCACTGTAGGGATAACAAT'
        matches = list(fnm_levenshtein_lp(pattern, text, max_l_dist=2))
        # assertGreater reports the actual length on failure, unlike
        # assertTrue(len(matches) > 0) which only says "False is not true".
        self.assertGreater(len(matches), 0)
        self.assertIn(
            Match(start=3, end=24, dist=1, matched=text[3:24]),
            matches,
        )
class TestExpandBase(object):
    """Shared test-suite for the ``expand`` family of functions.

    Concrete sub-classes mix this in with ``unittest.TestCase`` and set
    ``expand`` to the implementation under test.  Every test calls
    ``self.expand(subsequence, sequence, max_l_dist)`` and checks the
    returned pair, which is ``(None, None)`` whenever no result is expected
    within ``max_l_dist``.
    NOTE(review): the pair looks like (distance, matched length) -- confirm
    against the implementations.
    """
    expand = None  # override in sub-classes!

    def test_both_empty(self):
        result = self.expand('', '', 0)
        self.assertEqual(result, (0, 0))

    def test_empty_subsequence(self):
        result = self.expand('', 'TEXT', 0)
        self.assertEqual(result, (0, 0))

    def test_empty_sequence(self):
        expectations = [
            (0, (None, None)),
            (6, (None, None)),
            (7, (7, 0)),
            (8, (7, 0)),
        ]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('PATTERN', '', max_l_dist), expected)

    def test_identical(self):
        for max_l_dist in (0, 1, 2):
            self.assertEqual(self.expand('abc', 'abc', max_l_dist), (0, 3))

    def test_first_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'bcd', max_l_dist), expected)

    def test_second_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'acd', max_l_dist), expected)

    def test_second_before_last_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'abd', max_l_dist), expected)

    def test_last_item_missing(self):
        expectations = [(0, (None, None)), (1, (1, 3)), (2, (1, 3))]
        for max_l_dist, expected in expectations:
            self.assertEqual(self.expand('abcd', 'abc', max_l_dist), expected)

    def test_completely_different(self):
        result = self.expand('abc', 'def', 0)
        self.assertEqual(result, (None, None))

    def test_startswith(self):
        for max_l_dist in (0, 1, 2):
            self.assertEqual(self.expand('abc', 'abcd', max_l_dist), (0, 3))

    def test_missing_at_start_middle_and_end(self):
        expectations = [
            (0, (None, None)),
            (1, (None, None)),
            (2, (2, 6)),
            (3, (2, 6)),
        ]
        for max_l_dist, expected in expectations:
            self.assertEqual(
                self.expand('abcd', '-ab-cd-', max_l_dist),
                expected,
            )

    def test_no_common_chars(self):
        self.assertEqual(self.expand('abc', 'de', 2), (None, None))
        # Above the threshold only the first element of the pair is pinned.
        for max_l_dist in (3, 4):
            self.assertEqual(self.expand('abc', 'de', max_l_dist)[0], 3)

    def test_long_needle(self):
        needle = 'abcdefghijklmnop'
        # (sequence, max_l_dist, expected result), checked in this order.
        scenarios = [
            ('abcdefg-hijk-mnopqrst', 0, (None, None)),
            ('abcdefg-hijk-mnopqrst', 1, (None, None)),
            ('abcdefg-hijk-mnopqrst', 2, (2, 17)),
            ('abcdefg-hijk-mnopqrst', 3, (2, 17)),
            ('abcdefg-hijk-mnop', 3, (2, 17)),
            ('-bcdefg-hijk-mnop', 3, (3, 17)),
            ('-abcdefg-hijk-mnop', 3, (3, 18)),
            ('abc---defg-hijk-mnopqrst', 8, (5, 20)),
        ]
        for haystack, max_l_dist, expected in scenarios:
            self.assertEqual(
                self.expand(needle, haystack, max_l_dist),
                expected,
            )
class TestExpand(TestExpandBase, unittest.TestCase):
    """Run the shared ``TestExpandBase`` tests against ``_expand``."""
    # staticmethod() keeps the plain function from being bound as a method
    # when the tests call it via ``self.expand(...)``.
    expand = staticmethod(_expand)
class TestPyExpandShort(TestExpandBase, unittest.TestCase):
    """Run the shared ``TestExpandBase`` tests against ``_py_expand_short``."""
    # staticmethod() keeps the plain function from being bound as a method
    # when the tests call it via ``self.expand(...)``.
    expand = staticmethod(_py_expand_short)
try:
    # ``c_expand_short`` may be unavailable (presumably an optional compiled
    # extension -- confirm); in that case its test class is not defined.
    from fuzzysearch._levenshtein_ngrams import c_expand_short
except ImportError:
    pass
else:
    class TestCExpandShort(TestExpandBase, unittest.TestCase):
        """Run the shared ``TestExpandBase`` tests against ``c_expand_short``."""
        expand = staticmethod(c_expand_short)
class TestExpandLong(TestExpandBase, unittest.TestCase):
    """Run the shared ``TestExpandBase`` tests against ``_expand_long``."""
    # staticmethod() keeps the plain function from being bound as a method
    # when the tests call it via ``self.expand(...)``.
    expand = staticmethod(_expand_long)
class TestFindNearMatchesLevenshteinBase(object):
    """Shared, data-driven tests for Levenshtein near-match searches.

    Concrete sub-classes mix this in with ``unittest.TestCase`` and
    implement ``search()`` to call one specific search function and return
    its matches as a list.
    """

    def search(self, subsequence, sequence, max_l_dist):
        """Return the list of matches; must be overridden by sub-classes."""
        raise NotImplementedError

    test_cases_data = {
        # name: (needle, haystack, [
        #   (max_l_dist, [(start, end, dist), ...]),
        # ])
        'identical sequence': ('PATTERN', 'PATTERN', [
            (0, [(0, 7, 0)]),
        ]),
        'substring': ('PATTERN', '----------PATTERN---------', [
            (0, [(10, 17, 0)]),
            (1, [(10, 17, 0)]),
            (2, [(10, 17, 0)]),
        ]),
        'double first item': ('def', 'abcddefg', [
            (1, [(4, 7, 0)]),
        ]),
        'double last item': ('def', 'abcdeffg', [
            (1, [(3, 6, 0)]),
        ]),
        'double first items': ('defgh', 'abcdedefghi', [
            (3, [(5, 10, 0)]),
        ]),
        'double last items': ('cdefgh', 'abcdefghghi', [
            (3, [(2, 8, 0)]),
        ]),
        'missing second item': ('bde', 'abcdefg', [
            (1, [(1, 5, 1)]),
        ]),
        'missing second to last item': ('bce', 'abcdefg', [
            (1, [(1, 5, 1)]),
            (2, [(1, 5, 1)]),
        ]),
        'one missing in middle': ('PATTERN', '----------PATERN---------', [
            (0, []),
            (1, [(10, 16, 1)]),
            (2, [(10, 16, 1)]),
        ]),
        'one changed in middle': ('PATTERN', '----------PAT-ERN---------', [
            (0, []),
            (1, [(10, 17, 1)]),
            (2, [(10, 17, 1)]),
        ]),
        'one extra in middle': ('PATTERN', '----------PATT-ERN---------', [
            (0, []),
            (1, [(10, 18, 1)]),
            (2, [(10, 18, 1)]),
        ]),
        'one extra repeating in middle': ('PATTERN', '----------PATTTERN---------', [
            (0, []),
            (1, [(10, 18, 1)]),
            (2, [(10, 18, 1)]),
        ]),
        'one extra repeating at end': ('PATTERN', '----------PATTERNN---------', [
            (0, [(10, 17, 0)]),
            (1, [(10, 17, 0)]),
            (2, [(10, 17, 0)]),
        ]),
        'one missing at end': ('defg', 'abcdef', [
            (1, [(3, 6, 1)]),
        ]),
        'highly repetetive': ('a' * 9, 'a' * 7 + 'xx', [
            (1, []),
            (2, [(0, 9, 2)]),
        ]),
        'DNA search': (
            'TGCACTGTAGGGATAACAAT',
            longstr('''
                GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTG
                GTCACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAAC
                ACACATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGAC
                TGATACTCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
            '''),
            [
                (2, [(3, 24, 1)]),
            ]
        ),
        # see:
        # * BioPython archives from March 14th, 2014
        #   http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
        # * https://github.com/taleinat/fuzzysearch/issues/3
        'protein search 1': (
            'GGGTTLTTSS',
            longstr('''
                XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTLTTSSAAAAAAAAAAA
                AAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''),
            [
                (0, [(42, 52, 0), (99, 109, 0)]),
                (1, [(19, 29, 1), (42, 52, 0), (99, 109, 0)]),
                (2, [(19, 29, 1), (42, 52, 0), (99, 109, 0)]),
            ]
        ),
        'protein search 2': (
            'GGGTTLTTSS',
            longstr('''
                XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTVTTSSAAAAAAAAAAA
                AAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''),
            [
                (0, [(99, 109, 0)]),
                (1, [(19, 29, 1), (42, 52, 1), (99, 109, 0)]),
                (2, [(19, 29, 1), (42, 52, 1), (99, 109, 0)]),
            ]
        ),
        'list of words': (
            "over a lazy dog".split(),
            "the big brown fox jumped over the lazy dog".split(),
            [
                (0, []),
                (1, [(5, 9, 1)]),
                (2, [(5, 9, 1)]),
            ]
        ),
    }

    def test_cases(self):
        """Check every (case, max_l_dist) pair listed in ``test_cases_data``."""
        for name, data in self.test_cases_data.items():
            substring, text, max_l_dist2expected_matches = data
            for max_l_dist, expected_matches in max_l_dist2expected_matches:
                # One sub-test per distance: a failure or skip at one
                # max_l_dist no longer hides the remaining distances of the
                # same case, and the report names the distance involved.
                with self.subTest(name=name, max_l_dist=max_l_dist):
                    self.assertEqual(
                        self.search(substring, text, max_l_dist=max_l_dist),
                        [Match(start, end, dist, matched=text[start:end])
                         for start, end, dist in expected_matches],
                    )

    def test_empty_sequence(self):
        """Searching in an empty sequence returns no matches."""
        self.assertEqual(self.search('PATTERN', '', max_l_dist=0), [])

    def test_empty_subsequence_exeption(self):
        """Searching for an empty subsequence raises ``ValueError``."""
        with self.assertRaises(ValueError):
            self.search('', 'TEXT', max_l_dist=0)

    def test_all_different(self):
        """Nothing matches below distance 4; at 4 all matches have dist 4."""
        for max_l_dist in [0, 1, 2, 3]:
            self.assertEqual(
                self.search('AAAA', 'ZZZZ', max_l_dist),
                [],
            )

        matches = self.search('AAAA', 'ZZZZ', max_l_dist=4)
        self.assertGreater(len(matches), 0)
        self.assertTrue(all(match.dist == 4 for match in matches))
class TestFindNearMatchesLevenshteinNgrams(TestFindNearMatchesLevenshteinBase,
                                           unittest.TestCase):
    """Run the shared search tests against the n-gram based search."""

    def search(self, subsequence, sequence, max_l_dist):
        """Return consolidated n-gram search matches.

        The calling test is skipped when ``max_l_dist`` is not smaller than
        the length of ``subsequence``.
        """
        distance_too_large = max_l_dist >= len(subsequence)
        if distance_too_large:
            self.skipTest(
                'skipping ngram search with max_l_dist >= len(subsequence)')

        candidates = fnm_levenshtein_ngrams(subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(candidates)
class TestFindNearMatchesLevenshteinLP(TestFindNearMatchesLevenshteinBase,
                                       unittest.TestCase):
    """Run the shared search tests against the linear-programming search."""

    def search(self, subsequence, sequence, max_l_dist):
        """Return consolidated matches from ``fnm_levenshtein_lp``."""
        candidates = fnm_levenshtein_lp(subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(candidates)
class TestFindNearMatchesLevenshtein(TestFindNearMatchesLevenshteinBase,
                                     unittest.TestCase):
    """Run the shared search tests against ``find_near_matches_levenshtein``."""

    def search(self, subsequence, sequence, max_l_dist):
        """Return consolidated matches from the top-level search function."""
        candidates = find_near_matches_levenshtein(
            subsequence, sequence, max_l_dist)
        return consolidate_overlapping_matches(candidates)