From d15631cebbd601f509d108fb9f2ac3a56fcb93d3 Mon Sep 17 00:00:00 2001 From: Michael Stephens Date: Tue, 13 Jul 2010 16:15:08 -0400 Subject: [PATCH] jellyfish --- README.rst | 52 ++++++++++++++++++++++++ damerau_levenshtein.c | 2 +- hamming.c | 2 +- jaro.c | 2 +- strfry.h => jellyfish.h | 4 +- strfrymodule.c => jellyfishmodule.c | 61 +++++++++++++++-------------- levenshtein.c | 2 +- metaphone.c | 2 +- mra.c | 2 +- nysiis.c | 2 +- setup.py | 4 +- soundex.c | 2 +- test.py | 24 ++++++------ 13 files changed, 107 insertions(+), 54 deletions(-) create mode 100644 README.rst rename strfry.h => jellyfish.h (93%) rename strfrymodule.c => jellyfishmodule.c (79%) diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..23cee9f --- /dev/null +++ b/README.rst @@ -0,0 +1,52 @@ +========= +jellyfish +========= + +Jellyfish is a python library for doing approximate and phonetic matching of strings. + +jellyfish is a project of Sunlight Labs (c) 2010. +All code is released under a BSD-style license, see LICENSE for details. + +Written by Michael Stephens and James Turk +. + +Source is available at http://github.com/sunlightlabs/jellyfish. + +Included Algorithms +=================== + +String comparison: + + * Levenshtein Distance + * Damerau-Levenshtein Distance + * Jaro Distance + * Jaro-Winkler Distance + * Match Rating Approach Comparison + * Hamming Distance + +Phonetic encoding: + + * American Soundex + * Metaphone + * NYSIIS (New York State Identification and Intelligence System) + * Match Rating Codex + +Example Usage +============= + +>>> import jellyfish +>>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish') +2 +>>> jellyfish.jaro_distance('jellyfish', 'smellyfish') +0.89629629629629637 +>>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs') +1 + +>>> jellyfish.metaphone('Jellyfish') +'JLFX' +>>> jellyfish.soundex('Jellyfish') +'J412' +>>> jellyfish.nysiis('Jellyfish') +'JALYF' +>>> jellyfish.match_rating_codex('Jellyfish') +'JLLFSH' \ No newline at end of file diff --git a/damerau_levenshtein.c b/damerau_levenshtein.c index 783bb1a..5674737 100644 --- a/damerau_levenshtein.c +++ b/damerau_levenshtein.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include int damerau_levenshtein_distance(const char *s1, const char *s2) diff --git a/hamming.c b/hamming.c index 8ab8b9c..355b40c 100644 --- a/hamming.c +++ b/hamming.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include size_t hamming_distance(const char *s1, const char *s2) { diff --git a/jaro.c b/jaro.c index 3587dcf..275688c 100644 --- a/jaro.c +++ b/jaro.c @@ -2,7 +2,7 @@ #include #include #include -#include "strfry.h" +#include "jellyfish.h" #define NOTNUM(c) ((c>57) || (c<48)) #define INRANGE(c) ((c>0) && (c<91)) diff --git a/strfry.h b/jellyfish.h similarity index 93% rename from strfry.h rename to jellyfish.h index eee69a0..3bd1c18 100644 --- a/strfry.h +++ b/jellyfish.h @@ -1,5 +1,5 @@ -#ifndef _STRFRY_H_ -#define _STRFRY_H_ +#ifndef _JELLYFISH_H_ +#define _JELLYFISH_H_ #include #include diff --git a/strfrymodule.c b/jellyfishmodule.c similarity index 79% rename from strfrymodule.c rename to jellyfishmodule.c index 577d740..0cba834 100644 --- a/strfrymodule.c +++ b/jellyfishmodule.c @@ -1,16 +1,16 @@ #include #include -#include "strfry.h" +#include "jellyfish.h" -struct strfry_state { +struct jellyfish_state { PyObject *unicodedata_normalize; }; #if PY_MAJOR_VERSION >= 3 -#define GETSTATE(m) ((struct strfry_state*)PyModule_GetState(m)) +#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m)) #else #define GETSTATE(m) (&_state) -static struct strfry_state _state; +static struct jellyfish_state _state; #endif #if PY_MAJOR_VERSION >= 3 @@ -58,7 +58,7 @@ static inline PyObject* normalize(PyObject *mod, PyObject *pystr) { return NULL; } -static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args, +static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args, PyObject *keywds) { const char *s1, *s2; @@ -77,7 +77,7 @@ static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args, return Py_BuildValue("d", result); } -static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args, +static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args, PyObject *keywds) { const char *s1, *s2; @@ -96,7 +96,7 @@ static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args, return Py_BuildValue("d", result); } -static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args, +static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args, PyObject *keywds) { const char *s1, *s2; @@ -111,7 +111,7 @@ static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args, return Py_BuildValue("I", result); } -static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args) +static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args) { const char *s1, *s2; int result; @@ -131,7 +131,7 @@ static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args) return Py_BuildValue("i", result); } -static PyObject* strfry_damerau_levenshtein_distance(PyObject *self, +static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self, PyObject *args) { const char *s1, *s2; @@ -150,7 +150,7 @@ static PyObject* strfry_damerau_levenshtein_distance(PyObject *self, return Py_BuildValue("i", result); } -static PyObject* strfry_soundex(PyObject *self, PyObject *args) +static PyObject* jellyfish_soundex(PyObject *self, PyObject *args) { PyObject *pystr; PyObject *normalized; @@ -181,7 +181,7 @@ static PyObject* strfry_soundex(PyObject *self, PyObject *args) return ret; } -static PyObject* strfry_metaphone(PyObject *self, PyObject *args) +static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args) { PyObject *pystr; PyObject *normalized; @@ -212,7 +212,7 @@ static PyObject* strfry_metaphone(PyObject *self, PyObject *args) return ret; } -static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args) +static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args) { const char *str; char *result; @@ -234,7 +234,8 @@ static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args) return ret; } -static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args) +static PyObject* jellyfish_match_rating_comparison(PyObject *self, + PyObject *args) { const char *str1, *str2; int result; @@ -256,7 +257,7 @@ static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args) } } -static PyObject* strfry_nysiis(PyObject *self, PyObject *args) +static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args) { const char *str; char *result; @@ -278,46 +279,46 @@ static PyObject* strfry_nysiis(PyObject *self, PyObject *args) return ret; } -static PyMethodDef strfry_methods[] = { - {"jaro_winkler", strfry_jaro_winkler, METH_VARARGS, +static PyMethodDef jellyfish_methods[] = { + {"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS, "jaro_winkler(string1, string2, ignore_case=True)\n\n" "Do a Jaro-Winkler string comparison between string1 and string2."}, - {"jaro_distance", strfry_jaro_distance, METH_VARARGS, + {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS, "jaro_distance(string1, string2, ignore_case=True)\n\n" "Get a Jaro string distance metric for string1 and string2."}, - {"hamming_distance", strfry_hamming_distance, METH_VARARGS, + {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS, "hamming_distance(string1, string2, ignore_case=True)\n\n" "Compute the Hamming distance between string1 and string2."}, - {"levenshtein_distance", strfry_levenshtein_distance, METH_VARARGS, + {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS, "levenshtein_distance(string1, string2)\n\n" "Compute the Levenshtein distance between string1 and string2."}, - {"damerau_levenshtein_distance", strfry_damerau_levenshtein_distance, + {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance, METH_VARARGS, "damerau_levenshtein_distance(string1, string2)\n\n" "Compute the Damerau-Levenshtein distance between string1 and string2."}, - {"soundex", strfry_soundex, METH_VARARGS, + {"soundex", jellyfish_soundex, METH_VARARGS, "soundex(string)\n\n" "Calculate the soundex code for a given name."}, - {"metaphone", strfry_metaphone, METH_VARARGS, + {"metaphone", jellyfish_metaphone, METH_VARARGS, "metaphone(string)\n\n" "Calculate the metaphone representation of a given string."}, - {"match_rating_codex", strfry_match_rating_codex, METH_VARARGS, + {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS, "match_rating_codex(string)\n\n" "Calculate the Match Rating Approach representation of a given string."}, - {"match_rating_comparison", strfry_match_rating_comparison, METH_VARARGS, + {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS, "match_rating_comparison(string)\n\n" "Compute the Match Rating Approach similarity between string1 and" "string2."}, - {"nysiis", strfry_nysiis, METH_VARARGS, + {"nysiis", jellyfish_nysiis, METH_VARARGS, "nysiis(string)\n\n" "Compute the NYSIIS (New York State Identification and Intelligence\n" "System) code for a string."}, @@ -332,20 +333,20 @@ static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "strfry", NULL, - sizeof(struct strfry_state), - strfry_methods, + sizeof(struct jellyfish_state), + jellyfish_methods, NULL, NULL, NULL, NULL }; -PyObject* PyInit_strfry(void) +PyObject* PyInit_jellyfish(void) #else #define INITERROR return -PyMODINIT_FUNC initstrfry(void) +PyMODINIT_FUNC initjellyfish(void) #endif { PyObject *unicodedata; @@ -353,7 +354,7 @@ PyMODINIT_FUNC initstrfry(void) #if PY_MAJOR_VERSION >= 3 PyObject *module = PyModule_Create(&moduledef); #else - PyObject *module = Py_InitModule("strfry", strfry_methods); + PyObject *module = Py_InitModule("jellyfish", jellyfish_methods); #endif if (module == NULL) { diff --git a/levenshtein.c b/levenshtein.c index df38a3e..1f6e0c4 100644 --- a/levenshtein.c +++ b/levenshtein.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include #include #include diff --git a/metaphone.c b/metaphone.c index 3db768b..74d68e1 100644 --- a/metaphone.c +++ b/metaphone.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include #include #include diff --git a/mra.c b/mra.c index 64dd4ac..a7cf51d 100644 --- a/mra.c +++ b/mra.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include #include diff --git a/nysiis.c b/nysiis.c index 9b721fe..88fb4cd 100644 --- a/nysiis.c +++ b/nysiis.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include #include #include diff --git a/setup.py b/setup.py index d839d88..e994456 100755 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python from setuptools import setup, Extension -setup(name="strfry", +setup(name="jellyfish", platforms=["any"], classifiers=["Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -10,7 +10,7 @@ setup(name="strfry", "Operating System :: OS Independent", "Programming Language :: Python", "Topic :: Text Processing :: Linguistic"], - ext_modules=[Extension("strfry", ['strfrymodule.c', 'jaro.c', + ext_modules=[Extension("jellyfish", ['jellyfishmodule.c', 'jaro.c', 'hamming.c', 'levenshtein.c', 'damerau_levenshtein.c', 'mra.c', 'soundex.c', 'metaphone.c', diff --git a/soundex.c b/soundex.c index f9c7a17..537980c 100644 --- a/soundex.c +++ b/soundex.c @@ -1,4 +1,4 @@ -#include "strfry.h" +#include "jellyfish.h" #include #include diff --git a/test.py b/test.py index 82ae94b..3945cd5 100644 --- a/test.py +++ b/test.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- import unittest -import strfry +import jellyfish -class StrfryTestCase(unittest.TestCase): +class JellyfishTestCase(unittest.TestCase): def test_jaro_winkler(self): cases = [("dixon", "dicksonx", 0.8133), @@ -13,7 +13,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertAlmostEqual(strfry.jaro_winkler(s1, s2), value, + self.assertAlmostEqual(jellyfish.jaro_winkler(s1, s2), value, places=4) @@ -25,7 +25,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertAlmostEqual(strfry.jaro_distance(s1, s2), value, + self.assertAlmostEqual(jellyfish.jaro_distance(s1, s2), value, places=3) def test_hamming_distance(self): @@ -39,7 +39,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertEqual(strfry.hamming_distance(s1, s2), value) + self.assertEqual(jellyfish.hamming_distance(s1, s2), value) def test_levenshtein_distance(self): cases = [("", "", 0), @@ -50,7 +50,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertEqual(strfry.levenshtein_distance(s1, s2), value) + self.assertEqual(jellyfish.levenshtein_distance(s1, s2), value) def test_damerau_levenshtein_distance(self): cases = [("", "", 0), @@ -60,7 +60,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertEqual(strfry.damerau_levenshtein_distance(s1, s2), + self.assertEqual(jellyfish.damerau_levenshtein_distance(s1, s2), value) def test_soundex(self): @@ -76,7 +76,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, code) in cases: - self.assertEqual(strfry.soundex(s1), code) + self.assertEqual(jellyfish.soundex(s1), code) def test_metaphone(self): cases = [("metaphone", 'MTFN'), @@ -91,7 +91,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, code) in cases: - self.assertEqual(strfry.metaphone(s1), code) + self.assertEqual(jellyfish.metaphone(s1), code) def test_nysiis(self): cases = [("Worthy", "WARTY"), @@ -102,7 +102,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2) in cases: - self.assertEqual(strfry.nysiis(s1), s2) + self.assertEqual(jellyfish.nysiis(s1), s2) def test_match_rating_codex(self): cases = [("Byrne", "BYRN"), @@ -114,7 +114,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2) in cases: - self.assertEqual(strfry.match_rating_codex(s1), s2) + self.assertEqual(jellyfish.match_rating_codex(s1), s2) def test_match_rating_comparison(self): cases = [("Bryne", "Boern", True), @@ -124,7 +124,7 @@ class StrfryTestCase(unittest.TestCase): ] for (s1, s2, value) in cases: - self.assertEqual(strfry.match_rating_comparison(s1, s2), value) + self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value) if __name__ == '__main__': unittest.main()