jellyfish/cjellyfish/jellyfishmodule.c

444 lines
10 KiB
C

#include <Python.h>
#include <math.h>
#include "jellyfish.h"
/* Module state. Accessed through GETSTATE(); the single field is filled in
 * by the module init function with the "normalize" attribute of the
 * unicodedata module and is read by normalize(). */
struct jellyfish_state {
/* Callable used to NFKD-normalize unicode input (a Python object reference). */
PyObject *unicodedata_normalize;
};
/* GETSTATE(m) evaluates to a pointer to the struct jellyfish_state that
 * belongs to module object m. */
#if PY_MAJOR_VERSION >= 3
/* Python 3: the state is whatever PyModule_GetState() returns for m. */
#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
#else
/* Python 2: the argument is ignored and one file-static instance is used. */
#define GETSTATE(m) (&_state)
static struct jellyfish_state _state;
#endif
/* UTF8_BYTES(s) yields the char buffer of a byte-string object s, using the
 * unchecked accessor macro for PyBytes (Python 3) or PyString (Python 2).
 * It is applied in this file to the object returned by normalize(). */
#if PY_MAJOR_VERSION >= 3
#define UTF8_BYTES(s) (PyBytes_AS_STRING(s))
#else
#define UTF8_BYTES(s) (PyString_AS_STRING(s))
#endif
/* Turn a Python text object into a byte string for the C routines.
 *
 * mod   - module object, used only to look up the cached
 *         unicodedata.normalize callable through GETSTATE().
 * pystr - the object to convert.
 *
 * Returns a NEW reference to a PyString (Python 2) or PyBytes (Python 3):
 *   - a byte string is handed back unchanged (its refcount is bumped);
 *   - a unicode object is NFKD-normalized and then encoded as UTF-8.
 * Returns NULL with an exception set when normalization or encoding fails,
 * or with TypeError when pystr is neither kind of string.
 */
static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
    PyObject *nfkd;
    PyObject *encoded;
#if PY_MAJOR_VERSION < 3
    const int is_bytes = PyString_Check(pystr);
#else
    const int is_bytes = PyBytes_Check(pystr);
#endif

    /* Byte strings need no conversion. */
    if (is_bytes) {
        Py_INCREF(pystr);
        return pystr;
    }

    if (!PyUnicode_Check(pystr)) {
        PyErr_SetString(PyExc_TypeError, "expected str or unicode");
        return NULL;
    }

    nfkd = PyObject_CallFunction(GETSTATE(mod)->unicodedata_normalize,
                                 "sO", "NFKD", pystr);
    if (nfkd == NULL) {
        return NULL;
    }

    /* encoded may be NULL here; the pending exception is passed through. */
    encoded = PyUnicode_AsUTF8String(nfkd);
    Py_DECREF(nfkd);
    return encoded;
}
/* Python wrapper: jaro_winkler(string1, string2) -> float.
 *
 * Parses two "s" arguments and forwards them to the C jaro_winkler()
 * routine (third argument fixed to false). A NaN result is reported to
 * Python as MemoryError.
 */
static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args)
{
    const char *left, *right;
    double score;

    if (!PyArg_ParseTuple(args, "ss", &left, &right)) {
        return NULL;
    }

    score = jaro_winkler(left, right, false);
    if (!isnan(score)) {
        return Py_BuildValue("d", score);
    }

    PyErr_NoMemory();
    return NULL;
}
/* Python wrapper: jaro_distance(string1, string2) -> float.
 *
 * Parses two "s" arguments and forwards them to the C jaro_distance()
 * routine. A NaN result is reported to Python as MemoryError.
 */
static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args)
{
    const char *left, *right;
    double score;

    if (!PyArg_ParseTuple(args, "ss", &left, &right)) {
        return NULL;
    }

    score = jaro_distance(left, right);
    if (!isnan(score)) {
        return Py_BuildValue("d", score);
    }

    PyErr_NoMemory();
    return NULL;
}
/* Python wrapper: hamming_distance(string1, string2) -> int.
 *
 * Parses two "s" arguments, calls the C hamming_distance() routine and
 * returns its result converted from an unsigned int.
 */
static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args)
{
    const char *left, *right;
    unsigned distance;

    if (PyArg_ParseTuple(args, "ss", &left, &right)) {
        distance = hamming_distance(left, right);
        return Py_BuildValue("I", distance);
    }
    return NULL;
}
/* Python wrapper: levenshtein_distance(string1, string2) -> int.
 *
 * Parses two "s" arguments and calls the C levenshtein_distance() routine.
 * A result of -1 is reported to Python as MemoryError.
 */
static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
{
    const char *left, *right;
    int distance;

    if (!PyArg_ParseTuple(args, "ss", &left, &right)) {
        return NULL;
    }

    distance = levenshtein_distance(left, right);
    if (distance != -1) {
        return Py_BuildValue("i", distance);
    }

    /* Per the original note, -1 is only produced when an allocation
     * inside levenshtein_distance fails. */
    PyErr_NoMemory();
    return NULL;
}
/*
 * Utility for damerau_levenshtein that treats unicode and bytes similarly.
 *
 * Returns a BORROWED pointer to the character data of obj; the pointer is
 * owned by obj and must not be freed by the caller. Returns NULL when obj
 * is neither a bytes nor a unicode object (no exception is set in that
 * case), or when the conversion itself fails (an exception is set).
 *
 * Python 3: the pointer is the unicode object's internal buffer, whose
 * element width depends on the string kind (1, 2 or 4 bytes per character).
 * NOTE(review): the caller in this file passes the pointer on as a plain
 * const char* without a length; 2- and 4-byte buffers can contain zero
 * bytes inside the text, so confirm the consumer copes with that.
 *
 * Python 2: the previous code returned the result of
 * PyUnicode_AsUTF8String(), which is a new PyObject reference rather than
 * character data, so the caller read object memory and the reference was
 * leaked. PyString_AsString() instead returns the buffer of the unicode
 * object's default-encoded form, which is the same conversion the "s"
 * format of PyArg_ParseTuple performs for the other wrappers in this file.
 */
static void* _strdata(PyObject *obj) {
    if (PyBytes_Check(obj)) {
        return PyBytes_AsString(obj);
    }
    if (PyUnicode_Check(obj)) {
#if PY_MAJOR_VERSION >= 3
        switch (PyUnicode_KIND(obj)) {
        case PyUnicode_1BYTE_KIND:
            return PyUnicode_1BYTE_DATA(obj);
        case PyUnicode_2BYTE_KIND:
            return PyUnicode_2BYTE_DATA(obj);
        case PyUnicode_4BYTE_KIND:
            return PyUnicode_4BYTE_DATA(obj);
        default:
            /* Unknown kind: report "no data" like any unsupported input. */
            return NULL;
        }
#else
        /* Borrowed buffer; NULL with an exception set if encoding fails. */
        return PyString_AsString(obj);
#endif
    }
    return NULL;
}
/* Python wrapper: damerau_levenshtein_distance(string1, string2) -> int.
 *
 * Accepts two arbitrary objects, obtains their character data through
 * _strdata() and calls the C damerau_levenshtein_distance() routine.
 *
 * Raises TypeError when either argument yields no character data
 * (_strdata() returns NULL for anything that is not bytes/unicode), and
 * MemoryError when the C routine returns -1.
 */
static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
                                                        PyObject *args)
{
    PyObject *o1, *o2;
    const char *s1, *s2;
    int result;

    if (!PyArg_ParseTuple(args, "OO", &o1, &o2)) {
        return NULL;
    }

    s1 = _strdata(o1);
    /* Skip the second lookup once the first has already failed. */
    s2 = s1 ? _strdata(o2) : NULL;
    if (!s1 || !s2) {
        /* Never hand a NULL pointer to the C routine. Keep any exception
         * _strdata() may have raised; otherwise report a type error. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_TypeError, "expected str or unicode");
        }
        return NULL;
    }

    result = damerau_levenshtein_distance(s1, s2);
    if (result == -1) {
        PyErr_NoMemory();
        return NULL;
    }
    return Py_BuildValue("i", result);
}
/* Python wrapper: soundex(string) -> str.
 *
 * The argument is converted with normalize() (bytes pass through, unicode
 * becomes NFKD UTF-8) and handed to the C soundex() routine. The returned
 * heap string is copied into a Python string and then freed.
 * A NULL result from soundex() is reported as MemoryError.
 */
static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *bytes;
    PyObject *py_code = NULL;
    char *code;

    if (!PyArg_ParseTuple(args, "O", &input)) {
        return NULL;
    }

    bytes = normalize(self, input);
    if (bytes == NULL) {
        return NULL;
    }

    code = soundex(UTF8_BYTES(bytes));
    /* The byte buffer is no longer needed once soundex() has returned. */
    Py_DECREF(bytes);

    if (code != NULL) {
        py_code = Py_BuildValue("s", code);
        free(code);
    } else {
        /* Per the original note, soundex only fails on bad malloc. */
        PyErr_NoMemory();
    }
    return py_code;
}
/* Python wrapper: metaphone(string) -> str.
 *
 * The argument is converted with normalize() (bytes pass through, unicode
 * becomes NFKD UTF-8) and handed to the C metaphone() routine. The returned
 * heap string is copied into a Python string and then freed.
 * A NULL result from metaphone() is reported as MemoryError.
 */
static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *bytes;
    PyObject *py_code = NULL;
    char *code;

    if (!PyArg_ParseTuple(args, "O", &input)) {
        return NULL;
    }

    bytes = normalize(self, input);
    if (bytes == NULL) {
        return NULL;
    }

    code = metaphone((const char*)UTF8_BYTES(bytes));
    /* The byte buffer is no longer needed once metaphone() has returned. */
    Py_DECREF(bytes);

    if (code != NULL) {
        py_code = Py_BuildValue("s", code);
        free(code);
    } else {
        /* Per the original note, metaphone only fails on bad malloc. */
        PyErr_NoMemory();
    }
    return py_code;
}
/* Python wrapper: match_rating_codex(string) -> str.
 *
 * Parses one "s" argument and calls the C match_rating_codex() routine.
 * The returned heap string is copied into a Python string and then freed.
 * A NULL result is reported as MemoryError.
 */
static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
{
    const char *input;
    char *codex;
    PyObject *py_codex = NULL;

    if (!PyArg_ParseTuple(args, "s", &input)) {
        return NULL;
    }

    codex = match_rating_codex(input);
    if (codex != NULL) {
        py_codex = Py_BuildValue("s", codex);
        free(codex);
    } else {
        PyErr_NoMemory();
    }
    return py_codex;
}
/* Python wrapper: match_rating_comparison(string1, string2) -> bool.
 *
 * Parses two "s" arguments and calls the C match_rating_comparison()
 * routine. A result of -1 is reported as MemoryError; 0 maps to False and
 * any other value to True.
 */
static PyObject* jellyfish_match_rating_comparison(PyObject *self,
                                                   PyObject *args)
{
    const char *left, *right;
    int outcome;

    if (!PyArg_ParseTuple(args, "ss", &left, &right)) {
        return NULL;
    }

    outcome = match_rating_comparison(left, right);
    if (outcome == -1) {
        PyErr_NoMemory();
        return NULL;
    }

    if (outcome == 0) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
/* Python wrapper: nysiis(string) -> str.
 *
 * Parses one "s" argument and calls the C nysiis() routine. The returned
 * heap string is copied into a Python string and then freed.
 * A NULL result is reported as MemoryError.
 */
static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
{
    const char *input;
    char *code;
    PyObject *py_code = NULL;

    if (!PyArg_ParseTuple(args, "s", &input)) {
        return NULL;
    }

    code = nysiis(input);
    if (code != NULL) {
        py_code = Py_BuildValue("s", code);
        free(code);
    } else {
        PyErr_NoMemory();
    }
    return py_code;
}
/* Python wrapper: porter_stem(string) -> str.
 *
 * Parses one "s" argument, runs stem() over a private, writable copy of it
 * and returns the copy truncated just after the index stem() reports.
 *
 * Raises MemoryError when the stemmer or the working copy cannot be
 * allocated.
 */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *str;
    char *result;
    PyObject *ret;
    struct stemmer *z;
    int end;

    if (!PyArg_ParseTuple(args, "s", &str)) {
        return NULL;
    }

    z = create_stemmer();
    if (!z) {
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() gets a mutable buffer, so work on a copy of the argument. */
    result = strdup(str);
    if (!result) {
        free_stemmer(z);
        PyErr_NoMemory();
        return NULL;
    }

    /* Pass the index of the last character. The subtraction is done in int
     * so that an empty string yields -1 directly instead of wrapping around
     * in size_t and depending on an implementation-defined narrowing. */
    end = stem(z, result, (int)strlen(result) - 1);
    /* Terminate the buffer right after the index returned by stem(). */
    result[end + 1] = '\0';

    ret = Py_BuildValue("s", result);
    free(result);
    free_stemmer(z);
    return ret;
}
/* Method table for the module: one METH_VARARGS entry per wrapper defined
 * above, each with its Python-visible name and docstring. The table ends
 * with an all-NULL sentinel entry. */
static PyMethodDef jellyfish_methods[] = {
    {"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS,
     "jaro_winkler(string1, string2)\n\n"
     "Do a Jaro-Winkler string comparison between string1 and string2."},

    {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
     "jaro_distance(string1, string2)\n\n"
     "Get a Jaro string distance metric for string1 and string2."},

    {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
     "hamming_distance(string1, string2)\n\n"
     "Compute the Hamming distance between string1 and string2."},

    {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
     "levenshtein_distance(string1, string2)\n\n"
     "Compute the Levenshtein distance between string1 and string2."},

    {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
     METH_VARARGS,
     "damerau_levenshtein_distance(string1, string2)\n\n"
     "Compute the Damerau-Levenshtein distance between string1 and string2."},

    {"soundex", jellyfish_soundex, METH_VARARGS,
     "soundex(string)\n\n"
     "Calculate the soundex code for a given name."},

    {"metaphone", jellyfish_metaphone, METH_VARARGS,
     "metaphone(string)\n\n"
     "Calculate the metaphone representation of a given string."},

    {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
     "match_rating_codex(string)\n\n"
     "Calculate the Match Rating Approach representation of a given string."},

    /* Adjacent string literals are concatenated verbatim, so the space
     * before "string2." has to be inside one of them. */
    {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
     "match_rating_comparison(string1, string2)\n\n"
     "Compute the Match Rating Approach similarity between string1 and "
     "string2."},

    {"nysiis", jellyfish_nysiis, METH_VARARGS,
     "nysiis(string)\n\n"
     "Compute the NYSIIS (New York State Identification and Intelligence\n"
     "System) code for a string."},

    {"porter_stem", jellyfish_porter_stem, METH_VARARGS,
     "porter_stem(string)\n\n"
     "Return the result of running the Porter stemming algorithm on "
     "a single-word string."},

    {NULL, NULL, 0, NULL}
};
/* Module definition and initialisation for "jellyfish.cjellyfish".
 *
 * The init function creates the module, imports unicodedata and stores its
 * "normalize" attribute in the module state for later use by normalize().
 *
 * On failure an exception is left set and the function returns NULL
 * (Python 3) or simply returns (Python 2). INITERROR expands to the
 * matching return statement.
 */
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "jellyfish.cjellyfish",          /* m_name */
    NULL,                            /* m_doc */
    sizeof(struct jellyfish_state),  /* m_size: per-module state */
    jellyfish_methods,               /* m_methods */
    NULL,
    NULL,
    NULL,
    NULL
};
/* PyMODINIT_FUNC supplies the return type together with the linkage and
 * export decoration the interpreter expects of an init function. */
PyMODINIT_FUNC PyInit_cjellyfish(void)
#else
#define INITERROR return
PyMODINIT_FUNC initcjellyfish(void)
#endif
{
    PyObject *unicodedata;
    PyObject *normalize_func;
#if PY_MAJOR_VERSION >= 3
    PyObject *module = PyModule_Create(&moduledef);
#else
    PyObject *module = Py_InitModule("jellyfish.cjellyfish", jellyfish_methods);
#endif
    if (module == NULL) {
        INITERROR;
    }

    unicodedata = PyImport_ImportModule("unicodedata");
    if (!unicodedata) {
        goto fail;
    }

    normalize_func = PyObject_GetAttrString(unicodedata, "normalize");
    Py_DECREF(unicodedata);
    if (!normalize_func) {
        /* Do not hand back a module whose state holds a NULL callable. */
        goto fail;
    }
    /* The new reference is kept in the module state. */
    GETSTATE(module)->unicodedata_normalize = normalize_func;

#if PY_MAJOR_VERSION >= 3
    return module;
#else
    return;
#endif

fail:
#if PY_MAJOR_VERSION >= 3
    /* PyModule_Create returned a new reference; release it on failure.
     * (Py_InitModule on Python 2 returns a borrowed reference.) */
    Py_DECREF(module);
#endif
    INITERROR;
}