added C implementation of has_near_matches_substitutions_only

2014-04-12 13:10:34 +03:00 · 2014-04-12 13:10:34 +03:00 · dccf4a47c6
parent 54e166392e
commit dccf4a47c6
2 changed files with 192 additions and 1 deletions
--- a/fuzzysearch/_substitutions_only.c
+++ b/fuzzysearch/_substitutions_only.c
@ -0,0 +1,107 @@
+#include <Python.h>
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+
+/*
+ * substitutions_only_has_near_matches_byteslike(subsequence, sequence,
+ *                                               max_substitutions) -> bool
+ *
+ * Return True if some window of `sequence` having the same length as
+ * `subsequence` differs from it in at most `max_substitutions` byte
+ * positions (substitutions only, no insertions or deletions); otherwise
+ * return False.
+ *
+ * Both inputs are parsed with the "s*" format (str or bytes-like object),
+ * so lengths are Py_ssize_t on every CPython version and no
+ * PY_SSIZE_T_CLEAN definition is required.
+ *
+ * Raises ValueError if `subsequence` is empty.  A negative
+ * `max_substitutions` can never be satisfied and yields False.
+ *
+ * Implementation: sub_counts is a ring buffer of subseq_len counters.
+ * Slot (k % subseq_len) accumulates the number of mismatches of the
+ * alignment whose first byte is sequence[k].  Each byte of `sequence` is
+ * compared against every byte of `subsequence`, updating the alignment
+ * that pairs those two positions.  Time is O(seq_len * subseq_len).
+ */
+static PyObject *
+substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
+{
+    Py_buffer subseq_buf, seq_buf;
+    const char *subsequence;
+    const char *sequence;
+    Py_ssize_t subseq_len, seq_len;
+    Py_ssize_t seq_idx, subseq_idx, count_idx;
+    Py_ssize_t *sub_counts = NULL;
+    int max_substitutions;
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "s*s*i",
+        &subseq_buf,
+        &seq_buf,
+        &max_substitutions
+    )) {
+        return NULL;
+    }
+
+    subsequence = (const char *) subseq_buf.buf;
+    subseq_len = subseq_buf.len;
+    sequence = (const char *) seq_buf.buf;
+    seq_len = seq_buf.len;
+
+    /* An empty pattern would make the ring buffer zero-sized and the
+     * warm-up bound (subseq_len - 1) negative; reject it explicitly. */
+    if (subseq_len == 0) {
+        PyErr_SetString(PyExc_ValueError, "Given subsequence is empty!");
+        goto done;
+    }
+
+    /* No window fits, or the substitution budget is unsatisfiable. */
+    if (seq_len < subseq_len || max_substitutions < 0) {
+        result = Py_False;
+        goto done;
+    }
+
+    /* calloc zero-fills the counters and guards against size overflow. */
+    sub_counts = (Py_ssize_t *) calloc((size_t) subseq_len,
+                                       sizeof(Py_ssize_t));
+    if (sub_counts == NULL) {
+        PyErr_NoMemory();
+        goto done;
+    }
+
+    /* Warm-up: the first subseq_len - 1 bytes cannot complete any
+     * alignment; only the alignments starting at or before seq_idx are
+     * affected, so no wrap-around is needed yet. */
+    for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) {
+        for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) {
+            sub_counts[seq_idx - subseq_idx] +=
+                subsequence[subseq_idx] != sequence[seq_idx];
+        }
+    }
+
+    result = Py_False;
+    for (seq_idx = subseq_len - 1; seq_idx < seq_len; ++seq_idx) {
+        for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
+            sub_counts[(seq_idx - subseq_idx) % subseq_len] +=
+                subsequence[subseq_idx] != sequence[seq_idx];
+        }
+
+        /* The alignment starting at seq_idx - subseq_len + 1 is now
+         * complete; it lives in the same slot as (seq_idx + 1). */
+        count_idx = (seq_idx + 1) % subseq_len;
+        if (sub_counts[count_idx] <= max_substitutions) {
+            result = Py_True;
+            break;
+        }
+        /* Recycle the slot for the alignment starting at seq_idx + 1. */
+        sub_counts[count_idx] = 0;
+    }
+
+done:
+    free(sub_counts);
+    PyBuffer_Release(&subseq_buf);
+    PyBuffer_Release(&seq_buf);
+    Py_XINCREF(result);
+    return result;
+}
+
+/* Method table: the functions exported by the _substitutions_only module. */
+static PyMethodDef substitutions_only_methods[] = {
+    {
+        "substitutions_only_has_near_matches_byteslike",
+        (PyCFunction) substitutions_only_has_near_matches_byteslike,
+        METH_VARARGS,
+        "substitutions_only_has_near_matches_byteslike("
+        "subsequence, sequence, max_substitutions)\n"
+        "\n"
+        "Return True if sequence contains a window of the same length as\n"
+        "subsequence that differs from it in at most max_substitutions\n"
+        "positions (substitutions only), otherwise False."
+    },
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+
+
+#ifdef IS_PY3K
+
+/* Python 3 module definition for the _substitutions_only extension. */
+static struct PyModuleDef substitutions_only_module = {
+    PyModuleDef_HEAD_INIT,
+    "_substitutions_only",       /* m_name: name of the module */
+    NULL,                        /* m_doc: no module docstring */
+    -1,                          /* m_size: -1, the module keeps any state
+                                    in global variables rather than in
+                                    per-interpreter storage */
+    substitutions_only_methods   /* m_methods: exported function table */
+};
+
+/* Python 3 entry point: build the module object from its definition.
+ * Returns whatever PyModule_Create() returns (NULL on failure). */
+PyMODINIT_FUNC
+PyInit__substitutions_only(void)
+{
+    PyObject *module = PyModule_Create(&substitutions_only_module);
+
+    return module;
+}
+
+#else
+
+/* Python 2 entry point: register the module and its method table.
+ * The module object returned by Py_InitModule() is not needed here. */
+PyMODINIT_FUNC
+init_substitutions_only(void)
+{
+    PyObject *module = Py_InitModule("_substitutions_only",
+                                     substitutions_only_methods);
+
+    (void) module;
+}
+
+#endif
--- a/tests/test_substitutions_only.py
+++ b/tests/test_substitutions_only.py
@ -1,6 +1,9 @@
 from fuzzysearch.susbstitutions_only import \
    find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
    find_near_matches_substitutions_ngrams as fnm_subs_ngrams
+from fuzzysearch._substitutions_only import \
+    substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
+
 from tests.compat import unittest

 from fuzzysearch.common import Match
@ -200,10 +203,11 @@ class TestSubstitionsOnlyBase(object):

    def test_missing_at_beginning(self):
        self.assertEqual(
-            self.search("ATTEST","TESTOSTERONE", max_subs=2),
+            self.search("ATTEST", "TESTOSTERONE", max_subs=2),
            [],
        )

+
 class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
+    """Run the shared TestSubstitionsOnlyBase cases against fnm_subs_lp
+    (find_near_matches_substitutions_linear_programming)."""
+
+    # The result is materialized with list() so it can be compared with
+    # list literals; presumably fnm_subs_lp returns a lazy iterable --
+    # TODO confirm.
    def search(self, subsequence, sequence, max_subs):
        return list(fnm_subs_lp(subsequence, sequence, max_subs))
@ -212,3 +216,83 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
 class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
+    """Run the shared TestSubstitionsOnlyBase cases against fnm_subs_ngrams
+    (find_near_matches_substitutions_ngrams)."""
+
+    # The return value of fnm_subs_ngrams is passed through unchanged.
    def search(self, subsequence, sequence, max_subs):
        return fnm_subs_ngrams(subsequence, sequence, max_subs)
+
+
+class TestHasNearMatchSubstitionsOnlyBase(object):
+    def search(self, subsequence, sequence, max_subs):
+        raise NotImplementedError
+
+    def test_empty_sequence(self):
+        self.assertFalse(self.search('PATTERN', '', max_subs=0))
+
+    def test_empty_subsequence_exeption(self):
+        with self.assertRaises(ValueError):
+            self.search('', 'TEXT', max_subs=0)
+
+    def test_match_identical_sequence(self):
+        self.assertTrue(self.search('PATTERN', 'PATTERN', max_subs=0))
+
+    def test_substring(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search(substring, text, max_subs))
+
+    def test_double_first_item(self):
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search('def', 'abcddefg', max_subs))
+
+    def test_two_identical(self):
+        for max_subs in [0, 1, 2]:
+            self.assertTrue(self.search('abc', 'abcabc', max_subs))
+            self.assertTrue(self.search('abc', 'abcXabc', max_subs))
+
+    def test_one_changed_in_middle(self):
+        self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
+        self.assertTrue(self.search('abcdefg', 'abcXefg', 1))
+        self.assertTrue(self.search('abcdefg', 'abcXefg', 2))
+
+    def test_one_missing_in_middle(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATERNaaaaaaaaa'
+
+        for max_subs in [0, 1, 2]:
+            self.assertFalse(self.search(substring, text, max_subs=max_subs))
+
+    def test_one_changed_in_middle2(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATtERNaaaaaaaaa'
+
+        self.assertFalse(self.search(substring, text, max_subs=0))
+        self.assertTrue(self.search(substring, text, max_subs=1))
+        self.assertTrue(self.search(substring, text, max_subs=2))
+
+    def test_one_extra_in_middle(self):
+        substring = 'PATTERN'
+        text = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
+
+        for max_subs in [0, 1, 2]:
+            self.assertFalse(self.search(substring, text, max_subs=max_subs))
+
+    def test_dna_search(self):
+        # see: http://stackoverflow.com/questions/19725127/
+        text = ''.join('''\
+GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
+CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
+ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
+TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
+'''.split())
+        pattern = 'TGCACTGTAGGGATAACAAT'
+
+        self.assertTrue(self.search(pattern, text, max_subs=2))
+
+    def test_missing_at_beginning(self):
+        self.assertFalse(self.search("ATTEST", "TESTOSTERONE", max_subs=2))
+
+
+class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
+    def search(self, subsequence, sequence, max_subs):
+        return hnm_subs_byteslike(subsequence, sequence, max_subs)
+
+    def test_empty_subsequence_exeption(self):
+        pass