/* strfry: Python C extension module exposing string-comparison and phonetic-encoding functions. */
#include <Python.h>
|
|
#include "strfry.h"
|
|
|
|
struct strfry_state {
|
|
PyObject *unicodedata_normalize;
|
|
};
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define GETSTATE(m) ((struct strfry_state*)PyModule_GetState(m))
|
|
#else
|
|
#define GETSTATE(m) (&_state)
|
|
static struct strfry_state _state;
|
|
#endif
|
|
|
|
/*
 * UTF8_BYTES(s) yields the char buffer of a byte-string object `s`:
 * a PyString on Python 2, a PyBytes on Python 3.
 */
#if PY_MAJOR_VERSION < 3
#define UTF8_BYTES(s) (PyString_AS_STRING(s))
#else
#define UTF8_BYTES(s) (PyBytes_AsString(s))
#endif
|
|
|
|
/* Returns a new reference to a PyString (python < 3) or
|
|
* PyBytes (python >= 3.0).
|
|
*
|
|
* If passed a PyUnicode, the returned object will be NFKD UTF-8.
|
|
* If passed a PyString or PyBytes no conversion is done.
|
|
*/
|
|
static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
|
|
PyObject *unicodedata_normalize;
|
|
PyObject *normalized;
|
|
PyObject *utf8;
|
|
|
|
#if PY_MAJOR_VERSION < 3
|
|
if (PyString_Check(pystr)) {
|
|
Py_INCREF(pystr);
|
|
return pystr;
|
|
}
|
|
#else
|
|
if (PyBytes_Check(pystr)) {
|
|
Py_INCREF(pystr);
|
|
return pystr;
|
|
}
|
|
#endif
|
|
|
|
if (PyUnicode_Check(pystr)) {
|
|
unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;
|
|
normalized = PyObject_CallFunction(unicodedata_normalize,
|
|
"sO", "NFKD", pystr);
|
|
if (!normalized) {
|
|
return NULL;
|
|
}
|
|
utf8 = PyUnicode_AsUTF8String(normalized);
|
|
Py_DECREF(normalized);
|
|
return utf8;
|
|
}
|
|
|
|
PyErr_SetString(PyExc_TypeError, "expected str or unicode");
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Python binding: jaro_winkler(string1, string2) -> float
 *
 * Accepts two string arguments, positionally or by the keywords
 * "string1" / "string2", forwards them to jaro_winkler() with a third
 * argument of `false`, and returns the resulting double as a Python float.
 * Returns NULL (exception set by the argument parser) on bad arguments.
 *
 * NOTE(review): the meaning of the third jaro_winkler() argument is
 * defined outside this file -- confirm against strfry.h.
 */
static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
                                      PyObject *keywds)
{
    static char *keyword_names[] = {"string1", "string2", NULL};
    const char *left;
    const char *right;

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss", keyword_names,
                                     &left, &right)) {
        return NULL;
    }

    return Py_BuildValue("d", jaro_winkler(left, right, false));
}
|
|
|
|
/*
 * Python binding: jaro_distance(string1, string2) -> float
 *
 * Accepts two string arguments, positionally or by the keywords
 * "string1" / "string2", and returns the double computed by
 * jaro_distance() as a Python float.  Returns NULL (exception set by the
 * argument parser) on bad arguments.
 */
static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
                                       PyObject *keywds)
{
    const char *s1, *s2;
    double result;

    static char *kwlist[] = {"string1", "string2", NULL};

    /* The format must describe exactly the arguments named in kwlist and
     * backed by output pointers: two required strings.  A trailing "|B"
     * would declare an optional argument with no keyword name and no
     * destination. */
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss", kwlist, &s1, &s2)) {
        return NULL;
    }

    result = jaro_distance(s1, s2);

    return Py_BuildValue("d", result);
}
|
|
|
|
/*
 * Python binding: hamming_distance(string1, string2, ignore_case=True) -> int
 *
 * Accepts two string arguments plus an optional integer flag, positionally
 * or by keyword, and returns the unsigned value computed by
 * hamming_distance() as a Python integer.  Returns NULL (exception set by
 * the argument parser) on bad arguments.
 */
static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
                                          PyObject *keywds)
{
    const char *s1, *s2;
    unsigned result;
    /* The "B" format unit writes an unsigned char.  Parse into a variable
     * of exactly that type and convert to bool afterwards, rather than
     * letting the parser store an arbitrary byte value into a bool. */
    unsigned char ignore_case_flag = 1;

    static char *kwlist[] = {"string1", "string2", "ignore_case", NULL};

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss|B", kwlist, &s1, &s2,
                                     &ignore_case_flag)) {
        return NULL;
    }

    result = hamming_distance(s1, s2, ignore_case_flag != 0);

    return Py_BuildValue("I", result);
}
|
|
|
|
/*
 * Python binding: levenshtein_distance(string1, string2) -> int
 *
 * Accepts exactly two positional string arguments and returns the unsigned
 * value computed by levenshtein_distance() as a Python integer.  Returns
 * NULL (exception set by the argument parser) on bad arguments.
 */
static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
{
    const char *left;
    const char *right;
    unsigned distance;

    if (!PyArg_ParseTuple(args, "ss", &left, &right)) {
        return NULL;
    }

    distance = levenshtein_distance(left, right);
    return Py_BuildValue("I", distance);
}
|
|
|
|
/*
 * Python binding: soundex(string) -> str
 *
 * Takes one positional argument, converts it to a byte string with
 * normalize(), passes the bytes to soundex() and returns the resulting
 * C string as a Python string.  The buffer returned by soundex() is
 * released with free() once the Python object has been built.
 *
 * Returns NULL with an exception set when argument parsing fails or when
 * normalize() rejects the argument (e.g. not a string type).
 */
static PyObject* strfry_soundex(PyObject *self, PyObject *args)
{
    PyObject *pystr;
    PyObject *normalized;
    PyObject* ret;
    char *result;

    if (!PyArg_ParseTuple(args, "O", &pystr)) {
        return NULL;
    }

    normalized = normalize(self, pystr);
    if (!normalized) {
        /* normalize() has already set the exception; dereferencing the
         * NULL result below would crash the interpreter. */
        return NULL;
    }

    result = soundex(UTF8_BYTES(normalized));
    /* With the "s" format a NULL pointer is converted to None. */
    ret = Py_BuildValue("s", result);
    free(result);
    Py_DECREF(normalized);

    return ret;
}
|
|
|
|
/*
 * Python binding: metaphone(string) -> str
 *
 * Takes one positional argument, converts it to a byte string with
 * normalize(), passes the bytes to metaphone() and returns the resulting
 * C string as a Python string.  The buffer returned by metaphone() is
 * released with free() once the Python object has been built.
 *
 * Returns NULL with an exception set when argument parsing fails or when
 * normalize() rejects the argument (e.g. not a string type).
 */
static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
{
    PyObject *pystr;
    PyObject *normalized;
    PyObject *ret;
    char *result;

    if (!PyArg_ParseTuple(args, "O", &pystr)) {
        return NULL;
    }

    normalized = normalize(self, pystr);
    if (!normalized) {
        /* normalize() has already set the exception; dereferencing the
         * NULL result below would crash the interpreter. */
        return NULL;
    }

    result = metaphone((const char*)UTF8_BYTES(normalized));
    /* With the "s" format a NULL pointer is converted to None. */
    ret = Py_BuildValue("s", result);
    free(result);
    Py_DECREF(normalized);

    return ret;
}
|
|
|
|
static PyMethodDef strfry_methods[] = {
|
|
{"jaro_winkler", strfry_jaro_winkler, METH_VARARGS | METH_KEYWORDS,
|
|
"jaro_winkler(string1, string2, ignore_case=True)\n\n"
|
|
"Do a Jaro-Winkler string comparison between string1 and string2."},
|
|
|
|
{"jaro_distance", strfry_jaro_distance, METH_VARARGS | METH_KEYWORDS,
|
|
"jaro_distance(string1, string2, ignore_case=True)\n\n"
|
|
"Get a Jaro string distance metric for string1 and string2."},
|
|
|
|
{"hamming_distance", strfry_hamming_distance, METH_VARARGS | METH_KEYWORDS,
|
|
"hamming_distance(string1, string2, ignore_case=True)\n\n"
|
|
"Compute the Hamming distance between string1 and string2."},
|
|
|
|
{"levenshtein_distance", strfry_levenshtein_distance, METH_VARARGS,
|
|
"levenshtein_distance(string1, string2)\n\n"
|
|
"Compute the Levenshtein distance between string1 and string2."},
|
|
|
|
{"soundex", strfry_soundex, METH_VARARGS,
|
|
"soundex(string)\n\n"
|
|
"Calculate the soundex code for a given name."},
|
|
|
|
{"metaphone", strfry_metaphone, METH_VARARGS,
|
|
"metaphone(string)\n\n"
|
|
"Calculate the metaphone representation of a given string."},
|
|
|
|
{NULL, NULL, 0, NULL}
|
|
};
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define INITERROR return NULL
|
|
|
|
static struct PyModuleDef moduledef = {
|
|
PyModuleDef_HEAD_INIT,
|
|
"strfry",
|
|
NULL,
|
|
sizeof(struct strfry_state),
|
|
strfry_methods,
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
PyObject* PyInit_strfry(void)
|
|
#else
|
|
|
|
#define INITERROR return
|
|
|
|
PyMODINIT_FUNC initstrfry(void)
|
|
#endif
|
|
{
|
|
PyObject *unicodedata;
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
PyObject *module = PyModule_Create(&moduledef);
|
|
#else
|
|
PyObject *module = Py_InitModule("strfry", strfry_methods);
|
|
#endif
|
|
|
|
if (module == NULL) {
|
|
INITERROR;
|
|
}
|
|
|
|
unicodedata = PyImport_ImportModule("unicodedata");
|
|
if (!unicodedata) {
|
|
INITERROR;
|
|
}
|
|
|
|
GETSTATE(module)->unicodedata_normalize =
|
|
PyObject_GetAttrString(unicodedata, "normalize");
|
|
Py_DECREF(unicodedata);
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
return module;
|
|
#endif
|
|
}
|