#include #include #include "jellyfish.h" struct jellyfish_state { PyObject *unicodedata_normalize; }; #if PY_MAJOR_VERSION >= 3 #define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m)) #else #define GETSTATE(m) (&_state) static struct jellyfish_state _state; #endif #if PY_MAJOR_VERSION >= 3 #define UTF8_BYTES(s) (PyBytes_AS_STRING(s)) #else #define UTF8_BYTES(s) (PyString_AS_STRING(s)) #endif /* Returns a new reference to a PyString (python < 3) or * PyBytes (python >= 3.0). * * If passed a PyUnicode, the returned object will be NFKD UTF-8. * If passed a PyString or PyBytes no conversion is done. */ static inline PyObject* normalize(PyObject *mod, PyObject *pystr) { PyObject *unicodedata_normalize; PyObject *normalized; PyObject *utf8; #if PY_MAJOR_VERSION < 3 if (PyString_Check(pystr)) { Py_INCREF(pystr); return pystr; } #else if (PyBytes_Check(pystr)) { Py_INCREF(pystr); return pystr; } #endif if (PyUnicode_Check(pystr)) { unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize; normalized = PyObject_CallFunction(unicodedata_normalize, "sO", "NFKD", pystr); if (!normalized) { return NULL; } utf8 = PyUnicode_AsUTF8String(normalized); Py_DECREF(normalized); return utf8; } PyErr_SetString(PyExc_TypeError, "expected str or unicode"); return NULL; } static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args) { const char *s1, *s2; double result; if (!PyArg_ParseTuple(args, "ss", &s1, &s2)) { return NULL; } result = jaro_winkler(s1, s2, false); if (isnan(result)) { PyErr_NoMemory(); return NULL; } return Py_BuildValue("d", result); } static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args) { const char *s1, *s2; double result; if (!PyArg_ParseTuple(args, "ss", &s1, &s2)) { return NULL; } result = jaro_distance(s1, s2); if (isnan(result)) { PyErr_NoMemory(); return NULL; } return Py_BuildValue("d", result); } static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args) { const char *s1, *s2; unsigned result; if (!PyArg_ParseTuple(args, "ss", &s1, &s2)) { return NULL; } result = hamming_distance(s1, s2); return Py_BuildValue("I", result); } static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args) { const char *s1, *s2; int result; if (!PyArg_ParseTuple(args, "ss", &s1, &s2)) { return NULL; } result = levenshtein_distance(s1, s2); if (result == -1) { // levenshtein_distance only returns failure code (-1) on // failed malloc PyErr_NoMemory(); return NULL; } return Py_BuildValue("i", result); } /* * utility function for damerau_levenshtein to treat unicode and bytes similarly */ static void* _strdata(PyObject *obj) { if (PyBytes_Check(obj)) { return PyBytes_AsString(obj); } else if (PyUnicode_Check(obj)) { #if PY_MAJOR_VERSION >= 3 switch(PyUnicode_KIND(obj)) { case PyUnicode_1BYTE_KIND: return PyUnicode_1BYTE_DATA(obj); case PyUnicode_2BYTE_KIND: return PyUnicode_2BYTE_DATA(obj); case PyUnicode_4BYTE_KIND: return PyUnicode_4BYTE_DATA(obj); } #else return PyUnicode_AsUTF8String(obj); #endif } return NULL; } static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self, PyObject *args) { PyObject *o1, *o2; const char *s1, *s2; int result; if (!PyArg_ParseTuple(args, "OO", &o1, &o2)) { return NULL; } s1 = _strdata(o1); s2 = _strdata(o2); result = damerau_levenshtein_distance(s1, s2); if (result == -1) { PyErr_NoMemory(); return NULL; } return Py_BuildValue("i", result); } static PyObject* jellyfish_soundex(PyObject *self, PyObject *args) { PyObject *pystr; PyObject *normalized; PyObject* ret; char *result; if (!PyArg_ParseTuple(args, "O", &pystr)) { return NULL; } normalized = normalize(self, pystr); if (!normalized) { return NULL; } result = soundex(UTF8_BYTES(normalized)); Py_DECREF(normalized); if (!result) { // soundex only fails on bad malloc PyErr_NoMemory(); return NULL; } ret = Py_BuildValue("s", result); free(result); return ret; } static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args) { PyObject *pystr; PyObject *normalized; PyObject *ret; char *result; if (!PyArg_ParseTuple(args, "O", &pystr)) { return NULL; } normalized = normalize(self, pystr); if (!normalized) { return NULL; } result = metaphone((const char*)UTF8_BYTES(normalized)); Py_DECREF(normalized); if (!result) { // metaphone only fails on bad malloc PyErr_NoMemory(); return NULL; } ret = Py_BuildValue("s", result); free(result); return ret; } static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args) { const char *str; char *result; PyObject *ret; if (!PyArg_ParseTuple(args, "s", &str)) { return NULL; } result = match_rating_codex(str); if (!result) { PyErr_NoMemory(); return NULL; } ret = Py_BuildValue("s", result); free(result); return ret; } static PyObject* jellyfish_match_rating_comparison(PyObject *self, PyObject *args) { const char *str1, *str2; int result; if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) { return NULL; } result = match_rating_comparison(str1, str2); if (result == -1) { PyErr_NoMemory(); return NULL; } if (result) { Py_RETURN_TRUE; } else { Py_RETURN_FALSE; } } static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args) { const char *str; char *result; PyObject *ret; if (!PyArg_ParseTuple(args, "s", &str)) { return NULL; } result = nysiis(str); if (!result) { PyErr_NoMemory(); return NULL; } ret = Py_BuildValue("s", result); free(result); return ret; } static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args) { const char *str; char *result; PyObject *ret; struct stemmer *z; int end; if (!PyArg_ParseTuple(args, "s", &str)) { return NULL; } z = create_stemmer(); if (!z) { PyErr_NoMemory(); return NULL; } result = strdup(str); if (!result) { free_stemmer(z); PyErr_NoMemory(); return NULL; } end = stem(z, result, strlen(result) - 1); result[end + 1] = '\0'; ret = Py_BuildValue("s", result); free(result); free_stemmer(z); return ret; } static PyMethodDef jellyfish_methods[] = { {"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS, "jaro_winkler(string1, string2)\n\n" "Do a Jaro-Winkler string comparison between string1 and string2."}, {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS, "jaro_distance(string1, string2)\n\n" "Get a Jaro string distance metric for string1 and string2."}, {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS, "hamming_distance(string1, string2)\n\n" "Compute the Hamming distance between string1 and string2."}, {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS, "levenshtein_distance(string1, string2)\n\n" "Compute the Levenshtein distance between string1 and string2."}, {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance, METH_VARARGS, "damerau_levenshtein_distance(string1, string2)\n\n" "Compute the Damerau-Levenshtein distance between string1 and string2."}, {"soundex", jellyfish_soundex, METH_VARARGS, "soundex(string)\n\n" "Calculate the soundex code for a given name."}, {"metaphone", jellyfish_metaphone, METH_VARARGS, "metaphone(string)\n\n" "Calculate the metaphone representation of a given string."}, {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS, "match_rating_codex(string)\n\n" "Calculate the Match Rating Approach representation of a given string."}, {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS, "match_rating_comparison(string, string)\n\n" "Compute the Match Rating Approach similarity between string1 and" "string2."}, {"nysiis", jellyfish_nysiis, METH_VARARGS, "nysiis(string)\n\n" "Compute the NYSIIS (New York State Identification and Intelligence\n" "System) code for a string."}, {"porter_stem", jellyfish_porter_stem, METH_VARARGS, "porter_stem(string)\n\n" "Return the result of running the Porter stemming algorithm on " "a single-word string."}, {NULL, NULL, 0, NULL} }; #if PY_MAJOR_VERSION >= 3 #define INITERROR return NULL static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "jellyfish.cjellyfish", NULL, sizeof(struct jellyfish_state), jellyfish_methods, NULL, NULL, NULL, NULL }; PyObject* PyInit_cjellyfish(void) #else #define INITERROR return PyMODINIT_FUNC initcjellyfish(void) #endif { PyObject *unicodedata; #if PY_MAJOR_VERSION >= 3 PyObject *module = PyModule_Create(&moduledef); #else PyObject *module = Py_InitModule("jellyfish.cjellyfish", jellyfish_methods); #endif if (module == NULL) { INITERROR; } unicodedata = PyImport_ImportModule("unicodedata"); if (!unicodedata) { INITERROR; } GETSTATE(module)->unicodedata_normalize = PyObject_GetAttrString(unicodedata, "normalize"); Py_DECREF(unicodedata); #if PY_MAJOR_VERSION >= 3 return module; #endif }