added C implementation of has_near_matches_substitutions_only

This commit is contained in:
Tal Einat 2014-04-12 13:10:34 +03:00
parent 54e166392e
commit dccf4a47c6
2 changed files with 192 additions and 1 deletions

View File

@ -0,0 +1,107 @@
/* PY_SSIZE_T_CLEAN must be defined before Python.h is included: it makes the
 * "#" variants of the PyArg_ParseTuple format units report lengths as
 * Py_ssize_t, and Python 3.10+ refuses to parse "s#" without it. */
#define PY_SSIZE_T_CLEAN
#include <Python.h>
/* IS_PY3K is defined when building against Python 3 or later; the
 * module-initialisation code further down is selected with #ifdef IS_PY3K. */
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
/*
 * Python signature:
 *   substitutions_only_has_near_matches_byteslike(subsequence, sequence,
 *                                                 max_substitutions) -> bool
 *
 * Report whether some window of `sequence` that is exactly as long as
 * `subsequence` differs from `subsequence` in at most `max_substitutions`
 * positions (substitutions only; no insertions or deletions).
 *
 * Arguments are parsed with the "s#s#i" format: two buffers with explicit
 * lengths followed by an int.
 *
 * Returns a new reference to True or False, or NULL with an exception set:
 *   - whatever PyArg_ParseTuple raises for unsuitable arguments,
 *   - ValueError if `subsequence` is empty,
 *   - MemoryError if the counter buffer cannot be allocated.
 * A negative `max_substitutions` can never be satisfied and yields False.
 */
static PyObject *
substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
{
    const char *subsequence;
    const char *sequence;
    /* The C type PyArg_ParseTuple writes for "#" lengths depends on whether
     * PY_SSIZE_T_CLEAN was defined before Python.h was included, so the
     * receiving variables must follow that setting. */
#ifdef PY_SSIZE_T_CLEAN
    Py_ssize_t parsed_subseq_len, parsed_seq_len;
#else
    int parsed_subseq_len, parsed_seq_len;
#endif
    Py_ssize_t subseq_len, seq_len;
    int max_substitutions;
    Py_ssize_t *sub_counts;
    Py_ssize_t seq_idx, subseq_idx, count_idx;
    int found = 0;

    if (!PyArg_ParseTuple(
        args, "s#s#i",
        &subsequence, &parsed_subseq_len,
        &sequence, &parsed_seq_len,
        &max_substitutions
    )) {
        return NULL;
    }
    subseq_len = parsed_subseq_len;
    seq_len = parsed_seq_len;

    /* An empty subsequence would mean a zero-sized counter buffer and a
     * modulo by zero below, so reject it explicitly. */
    if (subseq_len == 0) {
        PyErr_SetString(PyExc_ValueError, "Given subsequence is empty!");
        return NULL;
    }

    /* A mismatch count is never negative, and a window of subseq_len items
     * cannot fit into a shorter sequence. */
    if (max_substitutions < 0 || seq_len < subseq_len) {
        Py_RETURN_FALSE;
    }

    /* sub_counts is a ring buffer: slot (p % subseq_len) holds the running
     * mismatch count of the window that starts at sequence position p.
     * calloc() checks the size multiplication and zero-fills the slots. */
    sub_counts = calloc((size_t) subseq_len, sizeof *sub_counts);
    if (sub_counts == NULL) {
        return PyErr_NoMemory();
    }

    /* Warm-up: the first subseq_len - 1 items cannot complete any window;
     * each one only contributes to the windows that started at or before
     * it. */
    for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) {
        for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) {
            sub_counts[seq_idx - subseq_idx] +=
                subsequence[subseq_idx] != sequence[seq_idx];
        }
    }

    /* Steady state: every further item contributes to subseq_len windows
     * and completes exactly one of them. */
    for (seq_idx = subseq_len - 1; seq_idx < seq_len; ++seq_idx) {
        for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
            sub_counts[(seq_idx - subseq_idx) % subseq_len] +=
                subsequence[subseq_idx] != sequence[seq_idx];
        }

        /* The window starting at seq_idx - subseq_len + 1 is now complete;
         * its slot is congruent to seq_idx + 1 modulo subseq_len. */
        count_idx = (seq_idx + 1) % subseq_len;
        if (sub_counts[count_idx] <= max_substitutions) {
            found = 1;
            break;
        }

        /* Recycle the slot for the window starting at seq_idx + 1. */
        sub_counts[count_idx] = 0;
    }

    free(sub_counts);
    if (found) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
}
/* Method table of the _substitutions_only module: a single function taking
 * positional arguments only, followed by the all-NULL sentinel entry. */
static PyMethodDef substitutions_only_methods[] = {
    {"substitutions_only_has_near_matches_byteslike",
     substitutions_only_has_near_matches_byteslike,
     METH_VARARGS,
     /* Replaces the former placeholder text shown by help(). */
     "substitutions_only_has_near_matches_byteslike("
     "subsequence, sequence, max_substitutions) -> bool\n"
     "\n"
     "Return True if sequence contains a stretch of len(subsequence) items\n"
     "that differs from subsequence in at most max_substitutions positions,\n"
     "and False otherwise."},
    {NULL, NULL, 0, NULL}  /* Sentinel */
};
#ifndef IS_PY3K

/* Python 2 entry point: registers the module together with its method
 * table. The return value of Py_InitModule is deliberately discarded. */
PyMODINIT_FUNC
init_substitutions_only(void)
{
    (void) Py_InitModule("_substitutions_only", substitutions_only_methods);
}

#else

/* Python 3 module definition. */
static struct PyModuleDef substitutions_only_module = {
    PyModuleDef_HEAD_INIT,
    "_substitutions_only",      /* name of module */
    NULL,                       /* module documentation, may be NULL */
    -1,                         /* size of per-interpreter state of the
                                   module, or -1 if the module keeps state
                                   in global variables */
    substitutions_only_methods  /* method table */
};

/* Python 3 entry point: creates the module object from the definition above
 * and hands it (or NULL on failure) back to the import machinery. */
PyMODINIT_FUNC
PyInit__substitutions_only(void)
{
    PyObject *module = PyModule_Create(&substitutions_only_module);
    return module;
}

#endif

View File

@ -1,6 +1,9 @@
from fuzzysearch.susbstitutions_only import \
find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
find_near_matches_substitutions_ngrams as fnm_subs_ngrams
from fuzzysearch._substitutions_only import \
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
from tests.compat import unittest
from fuzzysearch.common import Match
@ -200,10 +203,11 @@ class TestSubstitionsOnlyBase(object):
def test_missing_at_beginning(self):
self.assertEqual(
self.search("ATTEST","TESTOSTERONE", max_subs=2),
self.search("ATTEST", "TESTOSTERONE", max_subs=2),
[],
)
class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
    """Runs the shared substitutions-only tests against ``fnm_subs_lp``."""

    def search(self, subsequence, sequence, max_subs):
        """Call ``fnm_subs_lp`` and materialise whatever it returns as a list."""
        matches = fnm_subs_lp(subsequence, sequence, max_subs)
        return list(matches)
@ -212,3 +216,83 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
    """Runs the shared substitutions-only tests against ``fnm_subs_ngrams``."""

    def search(self, subsequence, sequence, max_subs):
        """Return the result of ``fnm_subs_ngrams`` unchanged."""
        matches = fnm_subs_ngrams(subsequence, sequence, max_subs)
        return matches
class TestHasNearMatchSubstitionsOnlyBase(object):
def search(self, subsequence, sequence, max_subs):
raise NotImplementedError
def test_empty_sequence(self):
self.assertFalse(self.search('PATTERN', '', max_subs=0))
def test_empty_subsequence_exeption(self):
with self.assertRaises(ValueError):
self.search('', 'TEXT', max_subs=0)
def test_match_identical_sequence(self):
self.assertTrue(self.search('PATTERN', 'PATTERN', max_subs=0))
def test_substring(self):
substring = 'PATTERN'
text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
for max_subs in [0, 1, 2]:
self.assertTrue(self.search(substring, text, max_subs))
def test_double_first_item(self):
for max_subs in [0, 1, 2]:
self.assertTrue(self.search('def', 'abcddefg', max_subs))
def test_two_identical(self):
for max_subs in [0, 1, 2]:
self.assertTrue(self.search('abc', 'abcabc', max_subs))
self.assertTrue(self.search('abc', 'abcXabc', max_subs))
def test_one_changed_in_middle(self):
self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
self.assertTrue(self.search('abcdefg', 'abcXefg', 1))
self.assertTrue(self.search('abcdefg', 'abcXefg', 2))
def test_one_missing_in_middle(self):
substring = 'PATTERN'
text = 'aaaaaaaaaaPATERNaaaaaaaaa'
for max_subs in [0, 1, 2]:
self.assertFalse(self.search(substring, text, max_subs=max_subs))
def test_one_changed_in_middle2(self):
substring = 'PATTERN'
text = 'aaaaaaaaaaPATtERNaaaaaaaaa'
self.assertFalse(self.search(substring, text, max_subs=0))
self.assertTrue(self.search(substring, text, max_subs=1))
self.assertTrue(self.search(substring, text, max_subs=2))
def test_one_extra_in_middle(self):
substring = 'PATTERN'
text = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
for max_subs in [0, 1, 2]:
self.assertFalse(self.search(substring, text, max_subs=max_subs))
def test_dna_search(self):
# see: http://stackoverflow.com/questions/19725127/
text = ''.join('''\
GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
'''.split())
pattern = 'TGCACTGTAGGGATAACAAT'
self.assertTrue(self.search(pattern, text, max_subs=2))
def test_missing_at_beginning(self):
self.assertFalse(self.search("ATTEST", "TESTOSTERONE", max_subs=2))
class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
    """Runs the shared "has near match" tests against ``hnm_subs_byteslike``."""

    def search(self, subsequence, sequence, max_subs):
        """Return the result of ``hnm_subs_byteslike`` unchanged."""
        found = hnm_subs_byteslike(subsequence, sequence, max_subs)
        return found

    def test_empty_subsequence_exeption(self):
        # Overrides the inherited empty-subsequence test with a no-op, so
        # that case is not exercised for this implementation.
        # NOTE(review): presumably the C function does not raise ValueError
        # for an empty subsequence - confirm before re-enabling.
        return None