From dccf4a47c655c3935e9b23d273d207a319a8349c Mon Sep 17 00:00:00 2001
From: Tal Einat
Date: Sat, 12 Apr 2014 13:10:34 +0300
Subject: [PATCH] added C implementation of has_near_matches_substitutions_only

---
Review notes: the C function now parses lengths as Py_ssize_t
(PY_SSIZE_T_CLEAN), raises ValueError for an empty subsequence and
returns False for a negative max_substitutions. The blob ids on the
"index" lines below predate these changes.

 fuzzysearch/_substitutions_only.c | 106 ++++++++++++++++++++++++++++++
 tests/test_substitutions_only.py  |  88 +++++++++++++++++++++++-
 2 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 fuzzysearch/_substitutions_only.c

diff --git a/fuzzysearch/_substitutions_only.c b/fuzzysearch/_substitutions_only.c
new file mode 100644
index 0000000..9898cf6
--- /dev/null
+++ b/fuzzysearch/_substitutions_only.c
@@ -0,0 +1,106 @@
+#define PY_SSIZE_T_CLEAN  /* make "s#" report lengths as Py_ssize_t */
+#include <Python.h>
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+
+/*
+ * substitutions_only_has_near_matches_byteslike(subsequence, sequence,
+ *                                               max_substitutions)
+ *
+ * Return True if some window of `sequence` with the same length as
+ * `subsequence` differs from it in at most `max_substitutions` byte
+ * positions.  Return False otherwise, including when `sequence` is
+ * shorter than `subsequence` or `max_substitutions` is negative.
+ *
+ * Raises ValueError if `subsequence` is empty.  Argument errors are
+ * reported by PyArg_ParseTuple().
+ */
+static PyObject *
+substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
+{
+    const char *subsequence;
+    const char *sequence;
+    Py_ssize_t subseq_len, seq_len;
+    Py_ssize_t start, last_start, subseq_idx, n_substitutions;
+    int max_substitutions;
+
+    if (!PyArg_ParseTuple(
+        args, "s#s#i",
+        &subsequence, &subseq_len,
+        &sequence, &seq_len,
+        &max_substitutions
+    )) {
+        return NULL;
+    }
+
+    if (subseq_len == 0) {
+        PyErr_SetString(PyExc_ValueError, "subsequence must not be empty");
+        return NULL;
+    }
+
+    /* A negative budget can never be met, and a sequence shorter than the
+     * subsequence has no window to compare against. */
+    if (max_substitutions < 0 || seq_len < subseq_len) {
+        Py_RETURN_FALSE;
+    }
+
+    last_start = seq_len - subseq_len;
+    for (start = 0; start <= last_start; ++start) {
+        n_substitutions = 0;
+        for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
+            if (subsequence[subseq_idx] != sequence[start + subseq_idx]) {
+                /* Give up on this window once it exceeds the budget. */
+                if (++n_substitutions > max_substitutions) {
+                    break;
+                }
+            }
+        }
+        if (n_substitutions <= max_substitutions) {
+            Py_RETURN_TRUE;
+        }
+    }
+
+    Py_RETURN_FALSE;
+}
+
+static PyMethodDef substitutions_only_methods[] = {
+    {"substitutions_only_has_near_matches_byteslike",
+     substitutions_only_has_near_matches_byteslike,
+     METH_VARARGS,
+     "substitutions_only_has_near_matches_byteslike("
+     "subsequence, sequence, max_substitutions)\n\n"
+     "Return True if sequence contains a window matching subsequence\n"
+     "with at most max_substitutions substituted bytes."},
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+
+
+#ifdef IS_PY3K
+
+static struct PyModuleDef substitutions_only_module = {
+    PyModuleDef_HEAD_INIT,
+    "_substitutions_only",   /* name of module */
+    NULL,                    /* module documentation, may be NULL */
+    -1,                      /* size of per-interpreter state of the module,
+                                or -1 if the module keeps state in global variables. */
+    substitutions_only_methods
+};
+
+PyMODINIT_FUNC
+PyInit__substitutions_only(void)
+{
+    return PyModule_Create(&substitutions_only_module);
+}
+
+#else
+
+PyMODINIT_FUNC
+init_substitutions_only(void)
+{
+    (void) Py_InitModule("_substitutions_only", substitutions_only_methods);
+}
+
+#endif
diff --git a/tests/test_substitutions_only.py b/tests/test_substitutions_only.py
index f59bc50..2258a3c 100644
--- a/tests/test_substitutions_only.py
+++ b/tests/test_substitutions_only.py
@@ -1,6 +1,9 @@
 from fuzzysearch.susbstitutions_only import \
     find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
     find_near_matches_substitutions_ngrams as fnm_subs_ngrams
+from fuzzysearch._substitutions_only import \
+    substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
+
 from tests.compat import unittest
 
 from fuzzysearch.common import Match
@@ -200,10 +203,11 @@ class TestSubstitionsOnlyBase(object):
 
     def test_missing_at_beginning(self):
         self.assertEqual(
-            self.search("ATTEST","TESTOSTERONE", max_subs=2),
+            self.search("ATTEST", "TESTOSTERONE", max_subs=2),
             [],
         )
 
+
 class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
     def search(self, subsequence, sequence, max_subs):
         return list(fnm_subs_lp(subsequence, sequence, max_subs))
@@ -212,3 +216,85 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
 class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
     def search(self, subsequence, sequence, max_subs):
         return fnm_subs_ngrams(subsequence, sequence, max_subs)
+
+
+class TestHasNearMatchSubstitionsOnlyBase(object):
+    """Common has-near-match tests; subclasses implement search()."""
+    def search(self, subsequence, sequence, max_subs):
+        raise NotImplementedError
+
+    def test_empty_sequence(self):
+        self.assertFalse(self.search('PATTERN', '', max_subs=0))
+
+    def test_empty_subsequence_exception(self):
+        with self.assertRaises(ValueError):
+            self.search('', 'TEXT', max_subs=0)
+
+    def test_negative_max_subs(self):
+        self.assertFalse(self.search('PATTERN', 'PATTERN', max_subs=-1))
+
+    def test_match_identical_sequence(self):
+        self.assertTrue(self.search('PATTERN', 'PATTERN', max_subs=0))
+
+    def test_substring(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search(substring, text, max_subs))
+
+    def test_double_first_item(self):
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search('def', 'abcddefg', max_subs))
+
+    def test_two_identical(self):
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search('abc', 'abcabc', max_subs))
+            self.assertTrue(self.search('abc', 'abcXabc', max_subs))
+
+    def test_one_changed_in_middle(self):
+        self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
+        self.assertTrue(self.search('abcdefg', 'abcXefg', 1))
+        self.assertTrue(self.search('abcdefg', 'abcXefg', 2))
+
+    def test_one_missing_in_middle(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATERNaaaaaaaaa'
+
+        for max_subs in [0, 1, 2]:
+            self.assertFalse(self.search(substring, text, max_subs=max_subs))
+
+    def test_one_changed_in_middle2(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATtERNaaaaaaaaa'
+
+        self.assertFalse(self.search(substring, text, max_subs=0))
+        self.assertTrue(self.search(substring, text, max_subs=1))
+        self.assertTrue(self.search(substring, text, max_subs=2))
+
+    def test_one_extra_in_middle(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
+
+        for max_subs in [0, 1, 2]:
+            self.assertFalse(self.search(substring, text, max_subs=max_subs))
+
+    def test_dna_search(self):
+        # see: http://stackoverflow.com/questions/19725127/
+        text = ''.join('''\
+GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
+CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
+ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
+TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
+'''.split())
+        pattern = 'TGCACTGTAGGGATAACAAT'
+
+        self.assertTrue(self.search(pattern, text, max_subs=2))
+
+    def test_missing_at_beginning(self):
+        self.assertFalse(self.search("ATTEST", "TESTOSTERONE", max_subs=2))
+
+
+class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
+    """Run the shared tests against the C implementation."""
+    def search(self, subsequence, sequence, max_subs):
+        return hnm_subs_byteslike(subsequence, sequence, max_subs)