2014-04-12 15:20:10 +00:00
|
|
|
from fuzzysearch.substitutions_only import \
|
2014-03-15 17:36:13 +00:00
|
|
|
find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
|
2014-03-29 10:57:17 +00:00
|
|
|
find_near_matches_substitutions_ngrams as fnm_subs_ngrams, \
|
|
|
|
has_near_match_substitutions_ngrams
|
2014-04-12 10:10:34 +00:00
|
|
|
from fuzzysearch._substitutions_only import \
|
|
|
|
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
|
|
|
|
|
2014-03-15 17:36:13 +00:00
|
|
|
from tests.compat import unittest
|
|
|
|
|
2014-03-29 10:57:17 +00:00
|
|
|
import textwrap
|
2014-03-15 17:36:13 +00:00
|
|
|
from fuzzysearch.common import Match
|
|
|
|
|
|
|
|
|
|
|
|
class TestSubstitionsOnlyBase(object):
    """Shared test cases for searches that allow substitutions only.

    This class is a mixin: it relies on ``unittest.TestCase`` assertion
    methods being available on ``self`` and on ``search()`` being overridden
    to return a list of ``Match`` objects.
    """

    def search(self, subsequence, sequence, max_subs):
        """Run the search under test; concrete test classes override this."""
        raise NotImplementedError

    def test_empty_sequence(self):
        # Nothing can be found in an empty text.
        found = self.search('PATTERN', '', max_subs=0)
        self.assertEqual(found, [])

    def test_empty_subsequence_exeption(self):
        # Searching for an empty pattern is expected to raise ValueError.
        with self.assertRaises(ValueError):
            self.search('', 'TEXT', max_subs=0)

    def test_match_identical_sequence(self):
        # Pattern and text are the same string: one exact, full-length match.
        self.assertEqual(
            self.search('PATTERN', 'PATTERN', max_subs=0),
            [Match(start=0, end=len('PATTERN'), dist=0)],
        )

    def test_substring(self):
        # An exact occurrence is reported with dist=0 whatever the budget is.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTERNaaaaaaaaa'
        exact_hit = Match(start=10, end=17, dist=0)

        for max_subs in (0, 1, 2):
            self.assertEqual(
                self.search(needle, haystack, max_subs=max_subs),
                [exact_hit],
            )

    def test_double_first_item(self):
        # The text repeats the pattern's first character right before the
        # exact occurrence ('...ddef...').
        haystack = 'abcddefg'

        self.assertListEqual(
            self.search('def', haystack, max_subs=1),
            [Match(start=4, end=7, dist=0)],
        )

        self.assertListEqual(
            self.search('def', haystack, max_subs=2),
            [Match(start=3, end=6, dist=2),
             Match(start=4, end=7, dist=0)],
        )

    def test_two_identical(self):
        # Two exact occurrences, first adjacent and then separated by 'X'.
        cases = (
            ('abcabc',
             [Match(start=0, end=3, dist=0), Match(start=3, end=6, dist=0)]),
            ('abcXabc',
             [Match(start=0, end=3, dist=0), Match(start=4, end=7, dist=0)]),
        )

        for haystack, expected in cases:
            self.assertEqual(
                self.search('abc', haystack, max_subs=1),
                expected,
            )

    def test_one_changed_in_middle(self):
        # One differing character: no match at max_subs=0, a dist=1 match
        # once at least one substitution is allowed.
        subsequence = 'abcdefg'
        sequence = 'abcXefg'
        hit = Match(start=0, end=7, dist=1)

        for max_subs, expected in ((0, []), (1, [hit]), (2, [hit])):
            self.assertEqual(
                self.search(subsequence, sequence, max_subs=max_subs),
                expected,
            )

    def test_one_missing_in_middle(self):
        # The text lacks one character of the pattern (a deletion), which
        # substitutions alone are expected not to bridge.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATERNaaaaaaaaa'

        for budget in (0, 1, 2):
            found = self.search(needle, haystack, max_subs=budget)
            self.assertEqual(found, [])

    def test_one_changed_in_middle2(self):
        # Same idea as test_one_changed_in_middle, with the near-match
        # embedded inside a longer text.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATtERNaaaaaaaaa'
        hit = Match(start=10, end=17, dist=1)

        for max_subs, expected in ((0, []), (1, [hit]), (2, [hit])):
            self.assertEqual(
                self.search(needle, haystack, max_subs=max_subs),
                expected,
            )

    def test_one_extra_in_middle(self):
        # The text has one extra character inside the pattern (an
        # insertion); no match is expected for any tested budget.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTXERNaaaaaaaaa'

        for budget in (0, 1, 2):
            found = self.search(needle, haystack, max_subs=budget)
            self.assertEqual(found, [])

    def test_dna_search(self):
        # see: http://stackoverflow.com/questions/19725127/
        # split() + join() strips the line breaks and indentation of the
        # literal, leaving one continuous sequence.
        text = ''.join('''\
            GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
            CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
            ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
            TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
            '''.split())
        pattern = 'TGCACTGTAGGGATAACAAT'

        self.assertEqual(
            self.search(pattern, text, max_subs=2),
            [Match(start=4, end=24, dist=1)],
        )

    def test_protein_search1(self):
        # see:
        # * BioPython archives from March 14th, 2014
        #   http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
        # * https://github.com/taleinat/fuzzysearch/issues/3
        text = ''.join('''\
            XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTLTTSSAAAAAAAAAAAA
            AAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''.split())
        pattern = "GGGTTLTTSS"

        exact_hits = [
            Match(start=42, end=52, dist=0),
            Match(start=99, end=109, dist=0),
        ]
        # The first occurrence in the text differs from the pattern by one
        # character, so it only shows up once substitutions are allowed.
        with_near_hit = [Match(start=19, end=29, dist=1)] + exact_hits

        for max_subs, expected in (
                (0, exact_hits),
                (1, with_near_hit),
                (2, with_near_hit),
        ):
            self.assertListEqual(
                self.search(pattern, text, max_subs=max_subs),
                expected,
            )

    def test_protein_search2(self):
        # see:
        # * BioPython archives from March 14th, 2014
        #   http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
        # * https://github.com/taleinat/fuzzysearch/issues/3
        text = ''.join('''\
            XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTVTTSSAAAAAAAAAAA
            AAAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
            '''.split())
        pattern = "GGGTTLTTSS"

        exact_hits = [Match(start=99, end=109, dist=0)]
        # Here the first two occurrences each differ by one character.
        with_near_hits = [
            Match(start=19, end=29, dist=1),
            Match(start=42, end=52, dist=1),
        ] + exact_hits

        for max_subs, expected in (
                (0, exact_hits),
                (1, with_near_hits),
                (2, with_near_hits),
        ):
            self.assertListEqual(
                self.search(pattern, text, max_subs=max_subs),
                expected,
            )

    def test_missing_at_beginning(self):
        # "TEST" is present at the very start of the text, but the leading
        # "AT" of the pattern has no room before it; no match is expected.
        found = self.search("ATTEST", "TESTOSTERONE", max_subs=2)
        self.assertEqual(found, [])
|
2014-03-15 17:36:13 +00:00
|
|
|
|
2014-04-12 10:10:34 +00:00
|
|
|
|
2014-03-15 17:36:13 +00:00
|
|
|
class TestFindNearMatchesSubstitionsLinearProgramming(
        TestSubstitionsOnlyBase,
        unittest.TestCase
):
    """Runs the shared substitution-only test cases against
    ``find_near_matches_substitutions_linear_programming``.
    """

    def search(self, subsequence, sequence, max_subs):
        """Return the function's matches collected into a list."""
        matches = fnm_subs_lp(subsequence, sequence, max_subs)
        return list(matches)
|
|
|
|
|
|
|
|
|
|
|
|
class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase,
                                           unittest.TestCase):
    """Runs the shared substitution-only test cases against
    ``find_near_matches_substitutions_ngrams``.
    """

    def search(self, subsequence, sequence, max_subs):
        """Return the function's matches collected into a list.

        The shared test cases compare results with ``assertListEqual`` and
        against list literals, so the result is materialised with ``list()``.
        This keeps the helper correct for any iterable the function returns
        and matches how the linear-programming test class builds its result.
        """
        return list(fnm_subs_ngrams(subsequence, sequence, max_subs))
|
2014-04-12 10:10:34 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TestHasNearMatchSubstitionsOnlyBase(object):
    """Shared test cases for boolean "has a near match" searches that allow
    substitutions only.

    This class is a mixin: it relies on ``unittest.TestCase`` assertion
    methods being available on ``self`` and on ``search()`` being overridden
    to return a truthy value when a match exists and a falsy one otherwise.
    """

    def search(self, subsequence, sequence, max_subs):
        """Run the search under test; concrete test classes override this."""
        raise NotImplementedError

    def test_empty_sequence(self):
        # Nothing can be found in an empty text.
        found = self.search('PATTERN', '', max_subs=0)
        self.assertFalse(found)

    def test_empty_subsequence_exeption(self):
        # Searching for an empty pattern is expected to raise ValueError.
        with self.assertRaises(ValueError):
            self.search('', 'TEXT', max_subs=0)

    def test_match_identical_sequence(self):
        # Pattern and text are the same string.
        found = self.search('PATTERN', 'PATTERN', max_subs=0)
        self.assertTrue(found)

    def test_substring(self):
        # An exact occurrence is found whatever the substitution budget is.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTERNaaaaaaaaa'
        for budget in (0, 1, 2):
            self.assertTrue(self.search(needle, haystack, budget))

    def test_double_first_item(self):
        # The text repeats the pattern's first character right before the
        # exact occurrence ('...ddef...').
        for budget in (0, 1, 2):
            found = self.search('def', 'abcddefg', budget)
            self.assertTrue(found)

    def test_two_identical(self):
        # Two exact occurrences, first adjacent and then separated by 'X'.
        for budget in (0, 1, 2):
            for haystack in ('abcabc', 'abcXabc'):
                self.assertTrue(self.search('abc', haystack, budget))

    def test_one_changed_in_middle(self):
        # One differing character: not found with no substitutions allowed,
        # found once at least one is allowed.
        self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
        for budget in (1, 2):
            self.assertTrue(self.search('abcdefg', 'abcXefg', budget))

    def test_one_missing_in_middle(self):
        # The text lacks one character of the pattern (a deletion); no match
        # is expected for any tested budget.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATERNaaaaaaaaa'

        for budget in (0, 1, 2):
            found = self.search(needle, haystack, max_subs=budget)
            self.assertFalse(found)

    def test_one_changed_in_middle2(self):
        # Same idea as test_one_changed_in_middle, with the near-match
        # embedded inside a longer text.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATtERNaaaaaaaaa'

        self.assertFalse(self.search(needle, haystack, max_subs=0))
        for budget in (1, 2):
            self.assertTrue(self.search(needle, haystack, max_subs=budget))

    def test_one_extra_in_middle(self):
        # The text has one extra character inside the pattern (an
        # insertion); no match is expected for any tested budget.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTXERNaaaaaaaaa'

        for budget in (0, 1, 2):
            found = self.search(needle, haystack, max_subs=budget)
            self.assertFalse(found)

    def test_dna_search(self):
        # see: http://stackoverflow.com/questions/19725127/
        # split() + join() strips the line breaks of the literal, leaving
        # one continuous sequence.
        raw = textwrap.dedent('''\
            GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
            CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
            ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
            TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
            ''')
        text = ''.join(raw.split())
        pattern = 'TGCACTGTAGGGATAACAAT'

        self.assertTrue(self.search(pattern, text, max_subs=2))

    def test_missing_at_beginning(self):
        # "TEST" is present at the very start of the text, but the leading
        # "AT" of the pattern has no room before it; no match is expected.
        found = self.search("ATTEST", "TESTOSTERONE", max_subs=2)
        self.assertFalse(found)
|
|
|
|
|
|
|
|
|
2014-03-29 10:57:17 +00:00
|
|
|
class TestHasNearMatchSubstitionsOnly(
        TestHasNearMatchSubstitionsOnlyBase,
        unittest.TestCase
):
    """Runs the shared "has near match" test cases against
    ``has_near_match_substitutions_ngrams``.
    """

    def search(self, subsequence, sequence, max_subs):
        """Return the function's result unchanged."""
        found = has_near_match_substitutions_ngrams(
            subsequence, sequence, max_subs)
        return found
|
|
|
|
|
|
|
|
|
|
|
|
class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase,
                                              unittest.TestCase):
    """Runs the shared "has near match" test cases against
    ``substitutions_only_has_near_matches_byteslike`` from the
    ``fuzzysearch._substitutions_only`` module.
    """

    def search(self, subsequence, sequence, max_subs):
        """Return the function's result unchanged."""
        found = hnm_subs_byteslike(subsequence, sequence, max_subs)
        return found

    def test_empty_subsequence_exeption(self):
        # Deliberate no-op override: it replaces the inherited test that
        # expects ValueError for an empty pattern, so that check is not run
        # for this implementation.
        # NOTE(review): presumably the byteslike function does not raise
        # ValueError here -- confirm, and consider an explicit skip instead.
        pass
|