jellyfish/strfrymodule.c

243 lines
5.8 KiB
C

#include <Python.h>
#include "strfry.h"
struct strfry_state {
PyObject *unicodedata_normalize;
};
#if PY_MAJOR_VERSION >= 3
#define GETSTATE(m) ((struct strfry_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct strfry_state _state;
#endif
#if PY_MAJOR_VERSION >= 3
#define UTF8_BYTES(s) (PyBytes_AsString(s))
#else
#define UTF8_BYTES(s) (PyString_AS_STRING(s))
#endif
/* Returns a new reference to a PyString (python < 3) or
* PyBytes (python >= 3.0).
*
* If passed a PyUnicode, the returned object will be NFKD UTF-8.
* If passed a PyString or PyBytes no conversion is done.
*/
static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
PyObject *unicodedata_normalize;
PyObject *normalized;
PyObject *utf8;
#if PY_MAJOR_VERSION < 3
if (PyString_Check(pystr)) {
Py_INCREF(pystr);
return pystr;
}
#else
if (PyBytes_Check(pystr)) {
Py_INCREF(pystr);
return pystr;
}
#endif
if (PyUnicode_Check(pystr)) {
unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;
normalized = PyObject_CallFunction(unicodedata_normalize,
"sO", "NFKD", pystr);
if (!normalized) {
return NULL;
}
utf8 = PyUnicode_AsUTF8String(normalized);
Py_DECREF(normalized);
return utf8;
}
PyErr_SetString(PyExc_TypeError, "expected str or unicode");
return NULL;
}
static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
double result;
static char *kwlist[] = {"string1", "string2", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss", kwlist, &s1, &s2)) {
return NULL;
}
result = jaro_winkler(s1, s2, false);
return Py_BuildValue("d", result);
}
static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
double result;
static char *kwlist[] = {"string1", "string2", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss|B", kwlist, &s1, &s2)) {
return NULL;
}
result = jaro_distance(s1, s2);
return Py_BuildValue("d", result);
}
static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
unsigned result;
bool ignore_case = true;
static char *kwlist[] = {"string1", "string2", "ignore_case", NULL};
if (!PyArg_ParseTupleAndKeywords(args, keywds, "ss|B", kwlist, &s1, &s2,
&ignore_case)) {
return NULL;
}
result = hamming_distance(s1, s2, ignore_case);
return Py_BuildValue("I", result);
}
static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
{
const char *s1, *s2;
unsigned result;
if (!PyArg_ParseTuple(args, "ss", &s1, &s2)) {
return NULL;
}
result = levenshtein_distance(s1, s2);
return Py_BuildValue("I", result);
}
static PyObject* strfry_soundex(PyObject *self, PyObject *args)
{
PyObject *pystr;
PyObject *normalized;
PyObject* ret;
char *result;
if (!PyArg_ParseTuple(args, "O", &pystr)) {
return NULL;
}
normalized = normalize(self, pystr);
result = soundex(UTF8_BYTES(normalized));
ret = Py_BuildValue("s", result);
free(result);
Py_DECREF(normalized);
return ret;
}
static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
{
PyObject *pystr;
PyObject *normalized;
PyObject *ret;
char *result;
if (!PyArg_ParseTuple(args, "O", &pystr)) {
return NULL;
}
normalized = normalize(self, pystr);
result = metaphone((const char*)UTF8_BYTES(normalized));
ret = Py_BuildValue("s", result);
free(result);
Py_DECREF(normalized);
return ret;
}
static PyMethodDef strfry_methods[] = {
{"jaro_winkler", strfry_jaro_winkler, METH_VARARGS | METH_KEYWORDS,
"jaro_winkler(string1, string2, ignore_case=True)\n\n"
"Do a Jaro-Winkler string comparison between string1 and string2."},
{"jaro_distance", strfry_jaro_distance, METH_VARARGS | METH_KEYWORDS,
"jaro_distance(string1, string2, ignore_case=True)\n\n"
"Get a Jaro string distance metric for string1 and string2."},
{"hamming_distance", strfry_hamming_distance, METH_VARARGS | METH_KEYWORDS,
"hamming_distance(string1, string2, ignore_case=True)\n\n"
"Compute the Hamming distance between string1 and string2."},
{"levenshtein_distance", strfry_levenshtein_distance, METH_VARARGS,
"levenshtein_distance(string1, string2)\n\n"
"Compute the Levenshtein distance between string1 and string2."},
{"soundex", strfry_soundex, METH_VARARGS,
"soundex(string)\n\n"
"Calculate the soundex code for a given name."},
{"metaphone", strfry_metaphone, METH_VARARGS,
"metaphone(string)\n\n"
"Calculate the metaphone representation of a given string."},
{NULL, NULL, 0, NULL}
};
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"strfry",
NULL,
sizeof(struct strfry_state),
strfry_methods,
NULL,
NULL,
NULL,
NULL
};
PyObject* PyInit_strfry(void)
#else
#define INITERROR return
PyMODINIT_FUNC initstrfry(void)
#endif
{
PyObject *unicodedata;
#if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef);
#else
PyObject *module = Py_InitModule("strfry", strfry_methods);
#endif
if (module == NULL) {
INITERROR;
}
unicodedata = PyImport_ImportModule("unicodedata");
if (!unicodedata) {
INITERROR;
}
GETSTATE(module)->unicodedata_normalize =
PyObject_GetAttrString(unicodedata, "normalize");
Py_DECREF(unicodedata);
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}