added C implementation of has_near_matches_substitutions_only
This commit is contained in:
parent
54e166392e
commit
dccf4a47c6
|
@ -0,0 +1,107 @@
|
|||
#include <Python.h>
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
#endif
|
||||
|
||||
|
||||
static PyObject *
|
||||
substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
|
||||
{
|
||||
const char *subsequence;
|
||||
const char *sequence;
|
||||
int subseq_len, seq_len, max_substitutions;
|
||||
unsigned int *sub_counts;
|
||||
unsigned int seq_idx, subseq_idx, count_idx;
|
||||
|
||||
if (!PyArg_ParseTuple(
|
||||
args, "s#s#i",
|
||||
&subsequence, &subseq_len,
|
||||
&sequence, &seq_len,
|
||||
&max_substitutions
|
||||
)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (seq_len < subseq_len) {
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
|
||||
sub_counts = (unsigned int *) malloc (sizeof(unsigned int) * subseq_len);
|
||||
if (sub_counts == NULL) {
|
||||
return PyErr_NoMemory();
|
||||
}
|
||||
|
||||
for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) {
|
||||
sub_counts[seq_idx] = 0;
|
||||
for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) {
|
||||
sub_counts[seq_idx - subseq_idx] +=
|
||||
subsequence[subseq_idx] != sequence[seq_idx];
|
||||
}
|
||||
// for(count_idx = 0; count_idx <= seq_idx; ++count_idx) {
|
||||
// printf("%d ", sub_counts[count_idx]);
|
||||
// }
|
||||
// printf("\n");
|
||||
}
|
||||
sub_counts[seq_idx] = 0;
|
||||
|
||||
for (seq_idx = subseq_len-1; seq_idx < seq_len;) {
|
||||
for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
|
||||
sub_counts[(seq_idx - subseq_idx) % subseq_len] +=
|
||||
subsequence[subseq_idx] != sequence[seq_idx];
|
||||
}
|
||||
|
||||
// for(count_idx = 0; count_idx < subseq_len; ++count_idx) {
|
||||
// printf("%d ", sub_counts[count_idx]);
|
||||
// }
|
||||
// printf("\n");
|
||||
|
||||
++seq_idx;
|
||||
count_idx = seq_idx % subseq_len;
|
||||
|
||||
if (sub_counts[count_idx] <= max_substitutions) {
|
||||
free(sub_counts);
|
||||
Py_RETURN_TRUE;
|
||||
}
|
||||
sub_counts[count_idx] = 0;
|
||||
}
|
||||
|
||||
free(sub_counts);
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
|
||||
static PyMethodDef substitutions_only_methods[] = {
|
||||
{"substitutions_only_has_near_matches_byteslike",
|
||||
substitutions_only_has_near_matches_byteslike,
|
||||
METH_VARARGS,
|
||||
"DOCSTRING."},
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
|
||||
static struct PyModuleDef substitutions_only_module = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_substitutions_only", /* name of module */
|
||||
NULL, /* module documentation, may be NULL */
|
||||
-1, /* size of per-interpreter state of the module,
|
||||
or -1 if the module keeps state in global variables. */
|
||||
substitutions_only_methods
|
||||
};
|
||||
|
||||
PyMODINIT_FUNC
|
||||
PyInit__substitutions_only(void)
|
||||
{
|
||||
return PyModule_Create(&substitutions_only_module);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
PyMODINIT_FUNC
|
||||
init_substitutions_only(void)
|
||||
{
|
||||
(void) Py_InitModule("_substitutions_only", substitutions_only_methods);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,6 +1,9 @@
|
|||
from fuzzysearch.susbstitutions_only import \
|
||||
find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
|
||||
find_near_matches_substitutions_ngrams as fnm_subs_ngrams
|
||||
from fuzzysearch._substitutions_only import \
|
||||
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
|
||||
|
||||
from tests.compat import unittest
|
||||
|
||||
from fuzzysearch.common import Match
|
||||
|
@ -200,10 +203,11 @@ class TestSubstitionsOnlyBase(object):
|
|||
|
||||
def test_missing_at_beginning(self):
|
||||
self.assertEqual(
|
||||
self.search("ATTEST","TESTOSTERONE", max_subs=2),
|
||||
self.search("ATTEST", "TESTOSTERONE", max_subs=2),
|
||||
[],
|
||||
)
|
||||
|
||||
|
||||
class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return list(fnm_subs_lp(subsequence, sequence, max_subs))
|
||||
|
@ -212,3 +216,83 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
|
|||
class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return fnm_subs_ngrams(subsequence, sequence, max_subs)
|
||||
|
||||
|
||||
class TestHasNearMatchSubstitionsOnlyBase(object):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
raise NotImplementedError
|
||||
|
||||
def test_empty_sequence(self):
|
||||
self.assertFalse(self.search('PATTERN', '', max_subs=0))
|
||||
|
||||
def test_empty_subsequence_exeption(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self.search('', 'TEXT', max_subs=0)
|
||||
|
||||
def test_match_identical_sequence(self):
|
||||
self.assertTrue(self.search('PATTERN', 'PATTERN', max_subs=0))
|
||||
|
||||
def test_substring(self):
|
||||
substring = 'PATTERN'
|
||||
text = 'aaaaaaaaaaPATTERNaaaaaaaaa'
|
||||
for max_subs in [0, 1, 2]:
|
||||
self.assertTrue(self.search(substring, text, max_subs))
|
||||
|
||||
def test_double_first_item(self):
|
||||
for max_subs in [0, 1, 2]:
|
||||
self.assertTrue(self.search('def', 'abcddefg', max_subs))
|
||||
|
||||
def test_two_identical(self):
|
||||
for max_subs in [0, 1, 2]:
|
||||
self.assertTrue(self.search('abc', 'abcabc', max_subs))
|
||||
self.assertTrue(self.search('abc', 'abcXabc', max_subs))
|
||||
|
||||
def test_one_changed_in_middle(self):
|
||||
self.assertFalse(self.search('abcdefg', 'abcXefg', 0))
|
||||
self.assertTrue(self.search('abcdefg', 'abcXefg', 1))
|
||||
self.assertTrue(self.search('abcdefg', 'abcXefg', 2))
|
||||
|
||||
def test_one_missing_in_middle(self):
|
||||
substring = 'PATTERN'
|
||||
text = 'aaaaaaaaaaPATERNaaaaaaaaa'
|
||||
|
||||
for max_subs in [0, 1, 2]:
|
||||
self.assertFalse(self.search(substring, text, max_subs=max_subs))
|
||||
|
||||
def test_one_changed_in_middle2(self):
|
||||
substring = 'PATTERN'
|
||||
text = 'aaaaaaaaaaPATtERNaaaaaaaaa'
|
||||
|
||||
self.assertFalse(self.search(substring, text, max_subs=0))
|
||||
self.assertTrue(self.search(substring, text, max_subs=1))
|
||||
self.assertTrue(self.search(substring, text, max_subs=2))
|
||||
|
||||
def test_one_extra_in_middle(self):
|
||||
substring = 'PATTERN'
|
||||
text = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
|
||||
|
||||
for max_subs in [0, 1, 2]:
|
||||
self.assertFalse(self.search(substring, text, max_subs=max_subs))
|
||||
|
||||
def test_dna_search(self):
|
||||
# see: http://stackoverflow.com/questions/19725127/
|
||||
text = ''.join('''\
|
||||
GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
|
||||
CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
|
||||
ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
|
||||
TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
|
||||
'''.split())
|
||||
pattern = 'TGCACTGTAGGGATAACAAT'
|
||||
|
||||
self.assertTrue(self.search(pattern, text, max_subs=2))
|
||||
|
||||
def test_missing_at_beginning(self):
|
||||
self.assertFalse(self.search("ATTEST", "TESTOSTERONE", max_subs=2))
|
||||
|
||||
|
||||
class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
|
||||
def search(self, subsequence, sequence, max_subs):
|
||||
return hnm_subs_byteslike(subsequence, sequence, max_subs)
|
||||
|
||||
def test_empty_subsequence_exeption(self):
|
||||
pass
|
||||
|
|
Loading…
Reference in New Issue