added C implementation of has_near_matches_substitutions_only

2014-04-12 13:10:34 +03:00 · 2014-04-12 13:10:34 +03:00 · dccf4a47c6
parent 54e166392e
commit dccf4a47c6
2 changed files with 192 additions and 1 deletions
--- a/fuzzysearch/_substitutions_only.c
+++ b/fuzzysearch/_substitutions_only.c
@ -0,0 +1,107 @@
 #include <Python.h>
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
 #endif
 static PyObject *
 substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
 {
    const char *subsequence;
    const char *sequence;
    int subseq_len, seq_len, max_substitutions;
    unsigned int *sub_counts;
    unsigned int seq_idx, subseq_idx, count_idx;
    if (!PyArg_ParseTuple(
        args, "s#s#i",
        &subsequence, &subseq_len,
        &sequence, &seq_len,
        &max_substitutions
    )) {
        return NULL;
    }
    if (seq_len < subseq_len) {
        Py_RETURN_FALSE;
    }
    sub_counts = (unsigned int *) malloc (sizeof(unsigned int) * subseq_len);
    if (sub_counts == NULL) {
        return PyErr_NoMemory();
    }
    for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) {
        sub_counts[seq_idx] = 0;
        for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) {
            sub_counts[seq_idx - subseq_idx] +=
                subsequence[subseq_idx] != sequence[seq_idx];
        }
 //        for(count_idx = 0; count_idx <= seq_idx; ++count_idx) {
 //            printf("%d ", sub_counts[count_idx]);
 //        }
 //        printf("\n");
    }
    sub_counts[seq_idx] = 0;
    for (seq_idx = subseq_len-1; seq_idx < seq_len;) {
        for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
            sub_counts[(seq_idx - subseq_idx) % subseq_len] +=
                subsequence[subseq_idx] != sequence[seq_idx];
        }
 //        for(count_idx = 0; count_idx < subseq_len; ++count_idx) {
 //            printf("%d ", sub_counts[count_idx]);
 //        }
 //        printf("\n");
        ++seq_idx;
        count_idx = seq_idx % subseq_len;
        if (sub_counts[count_idx] <= max_substitutions) {
            free(sub_counts);
            Py_RETURN_TRUE;
        }
        sub_counts[count_idx] = 0;
    }
    free(sub_counts);
    Py_RETURN_FALSE;
 }
 /* Method table of the _substitutions_only module; terminated by an
  * all-NULL sentinel entry. */
 static PyMethodDef substitutions_only_methods[] = {
    {"substitutions_only_has_near_matches_byteslike",
     substitutions_only_has_near_matches_byteslike,
     METH_VARARGS,
     /* Exposed to Python as the function's __doc__. */
     "substitutions_only_has_near_matches_byteslike("
     "subsequence, sequence, max_substitutions)\n"
     "\n"
     "Return True if some stretch of `sequence` with the same length as\n"
     "`subsequence` differs from it in at most `max_substitutions`\n"
     "positions, otherwise return False."},
    {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 #ifdef IS_PY3K
 /* Python 3: the module is described by a PyModuleDef and created by the
  * PyInit_<name> entry point. */
 static struct PyModuleDef substitutions_only_module = {
    PyModuleDef_HEAD_INIT,
    "_substitutions_only",          /* module name */
    NULL,                           /* no module documentation */
    -1,                             /* per-interpreter state size; -1 means
                                       the module keeps state in global
                                       variables */
    substitutions_only_methods      /* method table */
 };

 PyMODINIT_FUNC
 PyInit__substitutions_only(void)
 {
    PyObject *module = PyModule_Create(&substitutions_only_module);
    return module;
 }
 #else
 /* Python 2: the init<name> entry point registers the method table; the
  * module object returned by Py_InitModule is not needed here. */
 PyMODINIT_FUNC
 init_substitutions_only(void)
 {
    Py_InitModule("_substitutions_only", substitutions_only_methods);
 }
 #endif
--- a/tests/test_substitutions_only.py
+++ b/tests/test_substitutions_only.py
@ -1,6 +1,9 @@
 from fuzzysearch.susbstitutions_only import \
    find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
    find_near_matches_substitutions_ngrams as fnm_subs_ngrams
 from fuzzysearch._substitutions_only import \
    substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
 from tests.compat import unittest
 from fuzzysearch.common import Match
@ -204,6 +207,7 @@ class TestSubstitionsOnlyBase(object):
            [],
        )
 class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
    """Bind the test cases inherited from TestSubstitionsOnlyBase to fnm_subs_lp."""

    def search(self, subsequence, sequence, max_subs):
        # The result of fnm_subs_lp is materialized as a list before it is
        # handed back to the inherited test cases.
        matches = fnm_subs_lp(subsequence, sequence, max_subs)
        return list(matches)
@ -212,3 +216,83 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
 class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
    """Bind the test cases inherited from TestSubstitionsOnlyBase to fnm_subs_ngrams."""

    def search(self, subsequence, sequence, max_subs):
        # The return value of fnm_subs_ngrams is passed through unchanged.
        matches = fnm_subs_ngrams(subsequence, sequence, max_subs)
        return matches
 class TestHasNearMatchSubstitionsOnlyBase(object):
    def search(self, subsequence, sequence, max_subs):
        raise NotImplementedError
    def test_empty_sequence(self):
        self.assertFalse(self.search('PATTERN', '', max_subs=0))
    def test_empty_subsequence_exeption(self):
        with self.assertRaises(ValueError):
            self.search('', 'TEXT', max_subs=0)
    def test_match_identical_sequence(self):
        self.assertTrue(self.search('PATTERN', 'PATTERN', max_subs=0))
    def test_substring(self):
        substring = 'PATTERN'
        text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
        for max_subs in [0, 1, 2]:
            self.assertTrue(self.search(substring, text, max_subs))
    def test_double_first_item(self):
        for max_subs in [0, 1, 2]:
            self.assertTrue(self.search('def', 'abcddefg', max_subs))
    def test_two_identical(self):
        for max_subs in [0, 1, 2]:
            self.assertTrue(self.search('abc', 'abcabc', max_subs))
            self.assertTrue(self.search('abc', 'abcXabc', max_subs))
    def test_one_changed_in_middle(self):
        self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
        self.assertTrue(self.search('abcdefg', 'abcXefg', 1))
        self.assertTrue(self.search('abcdefg', 'abcXefg', 2))
    def test_one_missing_in_middle(self):
        substring = 'PATTERN'
        text = 'aaaaaaaaaaPATERNaaaaaaaaa'
        for max_subs in [0, 1, 2]:
            self.assertFalse(self.search(substring, text, max_subs=max_subs))
    def test_one_changed_in_middle2(self):
        substring = 'PATTERN'
        text = 'aaaaaaaaaaPATtERNaaaaaaaaa'
        self.assertFalse(self.search(substring, text, max_subs=0))
        self.assertTrue(self.search(substring, text, max_subs=1))
        self.assertTrue(self.search(substring, text, max_subs=2))
    def test_one_extra_in_middle(self):
        substring = 'PATTERN'
        text = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
        for max_subs in [0, 1, 2]:
            self.assertFalse(self.search(substring, text, max_subs=max_subs))
    def test_dna_search(self):
        # see: http://stackoverflow.com/questions/19725127/
        text = ''.join('''\
 GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
 CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
 ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
 TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
 '''.split())
        pattern = 'TGCACTGTAGGGATAACAAT'
        self.assertTrue(self.search(pattern, text, max_subs=2))
    def test_missing_at_beginning(self):
        self.assertFalse(self.search("ATTEST", "TESTOSTERONE", max_subs=2))
 class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
    """Bind the has-near-match test cases to hnm_subs_byteslike."""

    def search(self, subsequence, sequence, max_subs):
        found = hnm_subs_byteslike(subsequence, sequence, max_subs)
        return found

    def test_empty_subsequence_exeption(self):
        # Deliberate no-op: replaces the inherited ValueError check, so this
        # class never calls hnm_subs_byteslike with an empty subsequence.
        return None