444 lines
10 KiB
C
444 lines
10 KiB
C
#include <Python.h>
|
|
#include <math.h>
|
|
#include "jellyfish.h"
|
|
|
|
struct jellyfish_state {
|
|
PyObject *unicodedata_normalize;
|
|
};
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
|
|
#else
|
|
#define GETSTATE(m) (&_state)
|
|
static struct jellyfish_state _state;
|
|
#endif
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define UTF8_BYTES(s) (PyBytes_AS_STRING(s))
|
|
#else
|
|
#define UTF8_BYTES(s) (PyString_AS_STRING(s))
|
|
#endif
|
|
|
|
|
|
/* Returns a new reference to a PyString (python < 3) or
|
|
* PyBytes (python >= 3.0).
|
|
*
|
|
* If passed a PyUnicode, the returned object will be NFKD UTF-8.
|
|
* If passed a PyString or PyBytes no conversion is done.
|
|
*/
|
|
static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
|
|
PyObject *unicodedata_normalize;
|
|
PyObject *normalized;
|
|
PyObject *utf8;
|
|
|
|
#if PY_MAJOR_VERSION < 3
|
|
if (PyString_Check(pystr)) {
|
|
Py_INCREF(pystr);
|
|
return pystr;
|
|
}
|
|
#else
|
|
if (PyBytes_Check(pystr)) {
|
|
Py_INCREF(pystr);
|
|
return pystr;
|
|
}
|
|
#endif
|
|
|
|
if (PyUnicode_Check(pystr)) {
|
|
unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;
|
|
normalized = PyObject_CallFunction(unicodedata_normalize,
|
|
"sO", "NFKD", pystr);
|
|
if (!normalized) {
|
|
return NULL;
|
|
}
|
|
utf8 = PyUnicode_AsUTF8String(normalized);
|
|
Py_DECREF(normalized);
|
|
return utf8;
|
|
}
|
|
|
|
PyErr_SetString(PyExc_TypeError, "expected str or unicode");
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * jaro_winkler(string1, string2) -> float
 *
 * Parses two string arguments, calls jaro_winkler() with its third
 * argument set to false, and returns the score as a Python float.
 * A NaN score is reported to Python as MemoryError.
 */
static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args)
{
    const char *first;
    const char *second;
    double score;

    if (!PyArg_ParseTuple(args, "ss", &first, &second)) {
        return NULL;
    }

    score = jaro_winkler(first, second, false);
    if (!isnan(score)) {
        return Py_BuildValue("d", score);
    }

    /* NaN is the failure signal from the C routine. */
    PyErr_NoMemory();
    return NULL;
}
|
|
|
|
/*
 * jaro_distance(string1, string2) -> float
 *
 * Parses two string arguments and returns jaro_distance() of them as a
 * Python float.  A NaN score is reported to Python as MemoryError.
 */
static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args)
{
    const char *first;
    const char *second;
    double score;

    if (!PyArg_ParseTuple(args, "ss", &first, &second)) {
        return NULL;
    }

    score = jaro_distance(first, second);
    if (!isnan(score)) {
        return Py_BuildValue("d", score);
    }

    /* NaN is the failure signal from the C routine. */
    PyErr_NoMemory();
    return NULL;
}
|
|
|
|
/*
 * hamming_distance(string1, string2) -> int
 *
 * Parses two string arguments and returns hamming_distance() of them as
 * a Python integer built from an unsigned value.  This wrapper performs no
 * failure check on the C routine's result.
 */
static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args)
{
    const char *first;
    const char *second;
    unsigned distance;

    if (!PyArg_ParseTuple(args, "ss", &first, &second)) {
        return NULL;
    }

    distance = hamming_distance(first, second);
    return Py_BuildValue("I", distance);
}
|
|
|
|
/*
 * levenshtein_distance(string1, string2) -> int
 *
 * Parses two string arguments and returns levenshtein_distance() of them
 * as a Python integer.  A result of -1 is reported as MemoryError.
 */
static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
{
    const char *first;
    const char *second;
    int distance;

    if (!PyArg_ParseTuple(args, "ss", &first, &second)) {
        return NULL;
    }

    distance = levenshtein_distance(first, second);
    if (distance != -1) {
        return Py_BuildValue("i", distance);
    }

    // levenshtein_distance only returns failure code (-1) on
    // failed malloc
    PyErr_NoMemory();
    return NULL;
}
|
|
|
|
/*
|
|
* utility function for damerau_levenshtein to treat unicode and bytes similarly
|
|
*/
|
|
static void* _strdata(PyObject *obj) {
|
|
if (PyBytes_Check(obj)) {
|
|
return PyBytes_AsString(obj);
|
|
} else if (PyUnicode_Check(obj)) {
|
|
#if PY_MAJOR_VERSION >= 3
|
|
switch(PyUnicode_KIND(obj)) {
|
|
case PyUnicode_1BYTE_KIND:
|
|
return PyUnicode_1BYTE_DATA(obj);
|
|
case PyUnicode_2BYTE_KIND:
|
|
return PyUnicode_2BYTE_DATA(obj);
|
|
case PyUnicode_4BYTE_KIND:
|
|
return PyUnicode_4BYTE_DATA(obj);
|
|
}
|
|
#else
|
|
return PyUnicode_AsUTF8String(obj);
|
|
#endif
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * damerau_levenshtein_distance(string1, string2) -> int
 *
 * Accepts two arbitrary objects, obtains their character data through
 * _strdata(), and returns damerau_levenshtein_distance() of the two
 * buffers as a Python integer.
 *
 * Raises TypeError if either argument yields no character data (i.e. it is
 * not a bytes or unicode object), and MemoryError if the C routine returns
 * -1.
 */
static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
                                                        PyObject *args)
{
    PyObject *o1, *o2;
    const char *s1, *s2;
    int result;

    if (!PyArg_ParseTuple(args, "OO", &o1, &o2)) {
        return NULL;
    }

    s1 = _strdata(o1);
    s2 = _strdata(o2);

    /*
     * _strdata() returns NULL for unsupported argument types; passing that
     * on to the C routine would dereference a null pointer.
     */
    if (s1 == NULL || s2 == NULL) {
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_TypeError, "expected str or unicode");
        }
        return NULL;
    }

    result = damerau_levenshtein_distance(s1, s2);
    if (result == -1) {
        PyErr_NoMemory();
        return NULL;
    }

    return Py_BuildValue("i", result);
}
|
|
|
|
/*
 * soundex(string) -> str
 *
 * Converts the argument with normalize(), runs soundex() on the resulting
 * bytes and returns the code as a Python string.  The C buffer returned by
 * soundex() is freed here once the Python string has been built.
 * Propagates any error from normalize(); a NULL code is reported as
 * MemoryError.
 */
static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *bytes;
    PyObject *py_code;
    char *code;

    if (!PyArg_ParseTuple(args, "O", &input)) {
        return NULL;
    }

    bytes = normalize(self, input);
    if (bytes == NULL) {
        return NULL;
    }

    /* The byte buffer is only needed for the duration of the call. */
    code = soundex(UTF8_BYTES(bytes));
    Py_DECREF(bytes);

    if (code == NULL) {
        // soundex only fails on bad malloc
        PyErr_NoMemory();
        return NULL;
    }

    py_code = Py_BuildValue("s", code);
    free(code);
    return py_code;
}
|
|
|
|
/*
 * metaphone(string) -> str
 *
 * Converts the argument with normalize(), runs metaphone() on the
 * resulting bytes and returns the code as a Python string.  The C buffer
 * returned by metaphone() is freed here once the Python string has been
 * built.  Propagates any error from normalize(); a NULL code is reported
 * as MemoryError.
 */
static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
{
    PyObject *input;
    PyObject *bytes;
    PyObject *py_code;
    char *code;

    if (!PyArg_ParseTuple(args, "O", &input)) {
        return NULL;
    }

    bytes = normalize(self, input);
    if (bytes == NULL) {
        return NULL;
    }

    /* The byte buffer is only needed for the duration of the call. */
    code = metaphone((const char*)UTF8_BYTES(bytes));
    Py_DECREF(bytes);

    if (code == NULL) {
        // metaphone only fails on bad malloc
        PyErr_NoMemory();
        return NULL;
    }

    py_code = Py_BuildValue("s", code);
    free(code);
    return py_code;
}
|
|
|
|
/*
 * match_rating_codex(string) -> str
 *
 * Parses one string argument, runs match_rating_codex() on it and returns
 * the codex as a Python string.  The C buffer is freed here once the
 * Python string has been built.  A NULL codex is reported as MemoryError.
 */
static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
{
    const char *input;
    char *codex;
    PyObject *py_codex;

    if (!PyArg_ParseTuple(args, "s", &input)) {
        return NULL;
    }

    codex = match_rating_codex(input);
    if (codex == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    py_codex = Py_BuildValue("s", codex);
    free(codex);
    return py_codex;
}
|
|
|
|
/*
 * match_rating_comparison(string1, string2) -> bool
 *
 * Parses two string arguments and maps the result of
 * match_rating_comparison() onto a Python bool: -1 is reported as
 * MemoryError, 0 becomes False and any other value becomes True.
 */
static PyObject* jellyfish_match_rating_comparison(PyObject *self,
                                                   PyObject *args)
{
    const char *first;
    const char *second;
    int verdict;

    if (!PyArg_ParseTuple(args, "ss", &first, &second)) {
        return NULL;
    }

    verdict = match_rating_comparison(first, second);
    if (verdict == -1) {
        PyErr_NoMemory();
        return NULL;
    }

    if (!verdict) {
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
|
|
|
|
/*
 * nysiis(string) -> str
 *
 * Parses one string argument, runs nysiis() on it and returns the code as
 * a Python string.  The C buffer is freed here once the Python string has
 * been built.  A NULL code is reported as MemoryError.
 */
static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
{
    const char *input;
    char *code;
    PyObject *py_code;

    if (!PyArg_ParseTuple(args, "s", &input)) {
        return NULL;
    }

    code = nysiis(input);
    if (code == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    py_code = Py_BuildValue("s", code);
    free(code);
    return py_code;
}
|
|
|
|
/*
 * porter_stem(string) -> str
 *
 * Parses one string argument, copies it, runs stem() over the copy in
 * place and returns the copy, truncated just past the index stem()
 * reports, as a Python string.  The stemmer and the copy are both released
 * before returning.
 *
 * Raises MemoryError if the stemmer or the working copy cannot be
 * allocated.
 */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *str;
    char *result;
    PyObject *ret;
    struct stemmer *z;
    int last;
    int end;

    if (!PyArg_ParseTuple(args, "s", &str)) {
        return NULL;
    }

    z = create_stemmer();
    if (!z) {
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() is handed a writable buffer, so work on a private copy. */
    result = strdup(str);
    if (!result) {
        free_stemmer(z);
        PyErr_NoMemory();
        return NULL;
    }

    /*
     * Index of the last character.  Subtract in int so that an empty
     * string gives -1 directly, instead of wrapping size_t to SIZE_MAX and
     * relying on implementation-defined narrowing at the call.
     */
    last = (int)strlen(result) - 1;

    end = stem(z, result, last);
    result[end + 1] = '\0';

    ret = Py_BuildValue("s", result);

    free(result);
    free_stemmer(z);

    return ret;
}
|
|
|
|
static PyMethodDef jellyfish_methods[] = {
|
|
{"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS,
|
|
"jaro_winkler(string1, string2)\n\n"
|
|
"Do a Jaro-Winkler string comparison between string1 and string2."},
|
|
|
|
{"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
|
|
"jaro_distance(string1, string2)\n\n"
|
|
"Get a Jaro string distance metric for string1 and string2."},
|
|
|
|
{"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
|
|
"hamming_distance(string1, string2)\n\n"
|
|
"Compute the Hamming distance between string1 and string2."},
|
|
|
|
{"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
|
|
"levenshtein_distance(string1, string2)\n\n"
|
|
"Compute the Levenshtein distance between string1 and string2."},
|
|
|
|
{"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
|
|
METH_VARARGS,
|
|
"damerau_levenshtein_distance(string1, string2)\n\n"
|
|
"Compute the Damerau-Levenshtein distance between string1 and string2."},
|
|
|
|
{"soundex", jellyfish_soundex, METH_VARARGS,
|
|
"soundex(string)\n\n"
|
|
"Calculate the soundex code for a given name."},
|
|
|
|
{"metaphone", jellyfish_metaphone, METH_VARARGS,
|
|
"metaphone(string)\n\n"
|
|
"Calculate the metaphone representation of a given string."},
|
|
|
|
{"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
|
|
"match_rating_codex(string)\n\n"
|
|
"Calculate the Match Rating Approach representation of a given string."},
|
|
|
|
{"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
|
|
"match_rating_comparison(string, string)\n\n"
|
|
"Compute the Match Rating Approach similarity between string1 and"
|
|
"string2."},
|
|
|
|
{"nysiis", jellyfish_nysiis, METH_VARARGS,
|
|
"nysiis(string)\n\n"
|
|
"Compute the NYSIIS (New York State Identification and Intelligence\n"
|
|
"System) code for a string."},
|
|
|
|
{"porter_stem", jellyfish_porter_stem, METH_VARARGS,
|
|
"porter_stem(string)\n\n"
|
|
"Return the result of running the Porter stemming algorithm on "
|
|
"a single-word string."},
|
|
|
|
{NULL, NULL, 0, NULL}
|
|
};
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
#define INITERROR return NULL
|
|
|
|
static struct PyModuleDef moduledef = {
|
|
PyModuleDef_HEAD_INIT,
|
|
"jellyfish.cjellyfish",
|
|
NULL,
|
|
sizeof(struct jellyfish_state),
|
|
jellyfish_methods,
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
PyObject* PyInit_cjellyfish(void)
|
|
#else
|
|
|
|
#define INITERROR return
|
|
|
|
PyMODINIT_FUNC initcjellyfish(void)
|
|
#endif
|
|
{
|
|
PyObject *unicodedata;
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
PyObject *module = PyModule_Create(&moduledef);
|
|
#else
|
|
PyObject *module = Py_InitModule("jellyfish.cjellyfish", jellyfish_methods);
|
|
#endif
|
|
|
|
if (module == NULL) {
|
|
INITERROR;
|
|
}
|
|
|
|
unicodedata = PyImport_ImportModule("unicodedata");
|
|
if (!unicodedata) {
|
|
INITERROR;
|
|
}
|
|
|
|
GETSTATE(module)->unicodedata_normalize =
|
|
PyObject_GetAttrString(unicodedata, "normalize");
|
|
Py_DECREF(unicodedata);
|
|
|
|
#if PY_MAJOR_VERSION >= 3
|
|
return module;
|
|
#endif
|
|
}
|