added C implementation of has_near_matches_substitutions_only
This commit is contained in:
parent
54e166392e
commit
dccf4a47c6
|
@ -0,0 +1,107 @@
|
||||||
|
#include <Python.h>
|
||||||
|
|
||||||
|
/*
 * IS_PY3K is defined when compiling against Python 3 or newer.  It is tested
 * with #ifdef further down to choose between the Python 3 and Python 2
 * module-initialisation code.
 */
#if PY_MAJOR_VERSION >= 3
#  define IS_PY3K
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
substitutions_only_has_near_matches_byteslike(PyObject *self, PyObject *args)
|
||||||
|
{
|
||||||
|
const char *subsequence;
|
||||||
|
const char *sequence;
|
||||||
|
int subseq_len, seq_len, max_substitutions;
|
||||||
|
unsigned int *sub_counts;
|
||||||
|
unsigned int seq_idx, subseq_idx, count_idx;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(
|
||||||
|
args, "s#s#i",
|
||||||
|
&subsequence, &subseq_len,
|
||||||
|
&sequence, &seq_len,
|
||||||
|
&max_substitutions
|
||||||
|
)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seq_len < subseq_len) {
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
sub_counts = (unsigned int *) malloc (sizeof(unsigned int) * subseq_len);
|
||||||
|
if (sub_counts == NULL) {
|
||||||
|
return PyErr_NoMemory();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (seq_idx = 0; seq_idx < subseq_len - 1; ++seq_idx) {
|
||||||
|
sub_counts[seq_idx] = 0;
|
||||||
|
for (subseq_idx = 0; subseq_idx <= seq_idx; ++subseq_idx) {
|
||||||
|
sub_counts[seq_idx - subseq_idx] +=
|
||||||
|
subsequence[subseq_idx] != sequence[seq_idx];
|
||||||
|
}
|
||||||
|
// for(count_idx = 0; count_idx <= seq_idx; ++count_idx) {
|
||||||
|
// printf("%d ", sub_counts[count_idx]);
|
||||||
|
// }
|
||||||
|
// printf("\n");
|
||||||
|
}
|
||||||
|
sub_counts[seq_idx] = 0;
|
||||||
|
|
||||||
|
for (seq_idx = subseq_len-1; seq_idx < seq_len;) {
|
||||||
|
for (subseq_idx = 0; subseq_idx < subseq_len; ++subseq_idx) {
|
||||||
|
sub_counts[(seq_idx - subseq_idx) % subseq_len] +=
|
||||||
|
subsequence[subseq_idx] != sequence[seq_idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
// for(count_idx = 0; count_idx < subseq_len; ++count_idx) {
|
||||||
|
// printf("%d ", sub_counts[count_idx]);
|
||||||
|
// }
|
||||||
|
// printf("\n");
|
||||||
|
|
||||||
|
++seq_idx;
|
||||||
|
count_idx = seq_idx % subseq_len;
|
||||||
|
|
||||||
|
if (sub_counts[count_idx] <= max_substitutions) {
|
||||||
|
free(sub_counts);
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
sub_counts[count_idx] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(sub_counts);
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyMethodDef substitutions_only_methods[] = {
|
||||||
|
{"substitutions_only_has_near_matches_byteslike",
|
||||||
|
substitutions_only_has_near_matches_byteslike,
|
||||||
|
METH_VARARGS,
|
||||||
|
"DOCSTRING."},
|
||||||
|
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef IS_PY3K
|
||||||
|
|
||||||
|
static struct PyModuleDef substitutions_only_module = {
|
||||||
|
PyModuleDef_HEAD_INIT,
|
||||||
|
"_substitutions_only", /* name of module */
|
||||||
|
NULL, /* module documentation, may be NULL */
|
||||||
|
-1, /* size of per-interpreter state of the module,
|
||||||
|
or -1 if the module keeps state in global variables. */
|
||||||
|
substitutions_only_methods
|
||||||
|
};
|
||||||
|
|
||||||
|
PyMODINIT_FUNC
|
||||||
|
PyInit__substitutions_only(void)
|
||||||
|
{
|
||||||
|
return PyModule_Create(&substitutions_only_module);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
PyMODINIT_FUNC
|
||||||
|
init_substitutions_only(void)
|
||||||
|
{
|
||||||
|
(void) Py_InitModule("_substitutions_only", substitutions_only_methods);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -1,6 +1,9 @@
|
||||||
from fuzzysearch.susbstitutions_only import \
|
from fuzzysearch.susbstitutions_only import \
|
||||||
find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
|
find_near_matches_substitutions_linear_programming as fnm_subs_lp, \
|
||||||
find_near_matches_substitutions_ngrams as fnm_subs_ngrams
|
find_near_matches_substitutions_ngrams as fnm_subs_ngrams
|
||||||
|
from fuzzysearch._substitutions_only import \
|
||||||
|
substitutions_only_has_near_matches_byteslike as hnm_subs_byteslike
|
||||||
|
|
||||||
from tests.compat import unittest
|
from tests.compat import unittest
|
||||||
|
|
||||||
from fuzzysearch.common import Match
|
from fuzzysearch.common import Match
|
||||||
|
@ -204,6 +207,7 @@ class TestSubstitionsOnlyBase(object):
|
||||||
[],
|
[],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, unittest.TestCase):
    """Run the shared substitutions-only tests against ``fnm_subs_lp``."""

    def search(self, subsequence, sequence, max_subs):
        """Return the results of ``fnm_subs_lp`` collected into a list."""
        matches = fnm_subs_lp(subsequence, sequence, max_subs)
        return list(matches)
|
||||||
|
@ -212,3 +216,83 @@ class TestFindNearMatchesSubstitionsLinearProgramming(TestSubstitionsOnlyBase, u
|
||||||
class TestFindNearMatchesSubstitionsNgrams(TestSubstitionsOnlyBase, unittest.TestCase):
    """Run the shared substitutions-only tests against ``fnm_subs_ngrams``."""

    def search(self, subsequence, sequence, max_subs):
        """Return whatever ``fnm_subs_ngrams`` returns, unmodified."""
        result = fnm_subs_ngrams(subsequence, sequence, max_subs)
        return result
|
||||||
|
|
||||||
|
|
||||||
|
class TestHasNearMatchSubstitionsOnlyBase(object):
    """Shared checks for boolean "has a near match" searches (substitutions only).

    This is a mixin: a concrete test case combines it with
    ``unittest.TestCase`` and overrides :meth:`search`.
    """

    def search(self, subsequence, sequence, max_subs):
        """Search ``sequence`` for ``subsequence`` with up to ``max_subs`` substitutions.

        Subclasses must override this; the tests below only rely on the
        truthiness of the returned value.
        """
        raise NotImplementedError

    def test_empty_sequence(self):
        found = self.search('PATTERN', '', max_subs=0)
        self.assertFalse(found)

    def test_empty_subsequence_exeption(self):
        # An empty subsequence is expected to be rejected with ValueError.
        self.assertRaises(ValueError, self.search, '', 'TEXT', max_subs=0)

    def test_match_identical_sequence(self):
        found = self.search('PATTERN', 'PATTERN', max_subs=0)
        self.assertTrue(found)

    def test_substring(self):
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTERNaaaaaaaaa'
        for max_subs in range(3):
            self.assertTrue(self.search(needle, haystack, max_subs))

    def test_double_first_item(self):
        for max_subs in range(3):
            found = self.search('def', 'abcddefg', max_subs)
            self.assertTrue(found)

    def test_two_identical(self):
        for max_subs in range(3):
            adjacent = self.search('abc', 'abcabc', max_subs)
            self.assertTrue(adjacent)
            separated = self.search('abc', 'abcXabc', max_subs)
            self.assertTrue(separated)

    def test_one_changed_in_middle(self):
        needle, haystack = 'abcdefg', 'abcXefg'
        self.assertFalse(self.search(needle, haystack, 0))
        self.assertTrue(self.search(needle, haystack, 1))
        self.assertTrue(self.search(needle, haystack, 2))

    def test_one_missing_in_middle(self):
        # A deleted character cannot be expressed with substitutions alone.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATERNaaaaaaaaa'
        for max_subs in range(3):
            self.assertFalse(self.search(needle, haystack, max_subs=max_subs))

    def test_one_changed_in_middle2(self):
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATtERNaaaaaaaaa'
        self.assertFalse(self.search(needle, haystack, max_subs=0))
        self.assertTrue(self.search(needle, haystack, max_subs=1))
        self.assertTrue(self.search(needle, haystack, max_subs=2))

    def test_one_extra_in_middle(self):
        # An inserted character cannot be expressed with substitutions alone.
        needle = 'PATTERN'
        haystack = 'aaaaaaaaaaPATTXERNaaaaaaaaa'
        for max_subs in range(3):
            self.assertFalse(self.search(needle, haystack, max_subs=max_subs))

    def test_dna_search(self):
        # see: http://stackoverflow.com/questions/19725127/
        raw = '''\
GACTAGCACTGTAGGGATAACAATTTCACACAGGTGGACAATTACATTGAAAATCACAGATTGGT
CACACACACATTGGACATACATAGAAACACACACACATACATTAGATACGAACATAGAAACACAC
ATTAGACGCGTACATAGACACAAACACATTGACAGGCAGTTCAGATGATGACGCCCGACTGATAC
TCGCGTAGTCGTGGGAGGCAAGGCACACAGGGGATAGG
'''
        # Strip all whitespace so the text is one uninterrupted sequence.
        text = ''.join(raw.split())
        pattern = 'TGCACTGTAGGGATAACAAT'

        self.assertTrue(self.search(pattern, text, max_subs=2))

    def test_missing_at_beginning(self):
        found = self.search("ATTEST", "TESTOSTERONE", max_subs=2)
        self.assertFalse(found)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindNearMatchesSubstitionsByteslike(TestHasNearMatchSubstitionsOnlyBase, unittest.TestCase):
    """Run the shared has-near-match checks against ``hnm_subs_byteslike``."""

    def search(self, subsequence, sequence, max_subs):
        """Call ``hnm_subs_byteslike`` with all three arguments passed positionally."""
        return hnm_subs_byteslike(subsequence, sequence, max_subs)

    def test_empty_subsequence_exeption(self):
        """Opt out of the inherited empty-subsequence check for this implementation."""
        # Report the opt-out as a skip rather than a silent pass, so test
        # output shows that this case is not being verified here.
        self.skipTest(
            'empty-subsequence check is disabled for the byteslike implementation'
        )
|
||||||
|
|
Loading…
Reference in New Issue