jellyfish

This commit is contained in:
Michael Stephens 2010-07-13 16:15:08 -04:00
parent af4c76e821
commit d15631cebb
13 changed files with 107 additions and 54 deletions

52
README.rst Normal file
View File

@ -0,0 +1,52 @@
=========
jellyfish
=========
Jellyfish is a Python library for approximate and phonetic matching of strings.
Jellyfish is a project of Sunlight Labs (c) 2010.
All code is released under a BSD-style license, see LICENSE for details.
Written by Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
<jturk@sunlightfoundation.com>.
Source is available at http://github.com/sunlightlabs/jellyfish.
Included Algorithms
===================
String comparison:
* Levenshtein Distance
* Damerau-Levenshtein Distance
* Jaro Distance
* Jaro-Winkler Distance
* Match Rating Approach Comparison
* Hamming Distance
Phonetic encoding:
* American Soundex
* Metaphone
* NYSIIS (New York State Identification and Intelligence System)
* Match Rating Codex
Example Usage
=============
>>> import jellyfish
>>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
2
>>> jellyfish.jaro_distance('jellyfish', 'smellyfish')
0.89629629629629637
>>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
1
>>> jellyfish.metaphone('Jellyfish')
'JLFX'
>>> jellyfish.soundex('Jellyfish')
'J412'
>>> jellyfish.nysiis('Jellyfish')
'JALYF'
>>> jellyfish.match_rating_codex('Jellyfish')
'JLLFSH'

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <string.h>
int damerau_levenshtein_distance(const char *s1, const char *s2)

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <ctype.h>
size_t hamming_distance(const char *s1, const char *s2) {

2
jaro.c
View File

@ -2,7 +2,7 @@
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "strfry.h"
#include "jellyfish.h"
#define NOTNUM(c) ((c>57) || (c<48))
#define INRANGE(c) ((c>0) && (c<91))

View File

@ -1,5 +1,5 @@
#ifndef _STRFRY_H_
#define _STRFRY_H_
#ifndef _JELLYFISH_H_
#define _JELLYFISH_H_
#include <stdbool.h>
#include <stdlib.h>

View File

@ -1,16 +1,16 @@
#include <Python.h>
#include <math.h>
#include "strfry.h"
#include "jellyfish.h"
struct strfry_state {
struct jellyfish_state {
PyObject *unicodedata_normalize;
};
#if PY_MAJOR_VERSION >= 3
#define GETSTATE(m) ((struct strfry_state*)PyModule_GetState(m))
#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct strfry_state _state;
static struct jellyfish_state _state;
#endif
#if PY_MAJOR_VERSION >= 3
@ -58,7 +58,7 @@ static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
return NULL;
}
static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
@ -77,7 +77,7 @@ static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
return Py_BuildValue("d", result);
}
static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
@ -96,7 +96,7 @@ static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
return Py_BuildValue("d", result);
}
static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args,
PyObject *keywds)
{
const char *s1, *s2;
@ -111,7 +111,7 @@ static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
return Py_BuildValue("I", result);
}
static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
{
const char *s1, *s2;
int result;
@ -131,7 +131,7 @@ static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
return Py_BuildValue("i", result);
}
static PyObject* strfry_damerau_levenshtein_distance(PyObject *self,
static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
PyObject *args)
{
const char *s1, *s2;
@ -150,7 +150,7 @@ static PyObject* strfry_damerau_levenshtein_distance(PyObject *self,
return Py_BuildValue("i", result);
}
static PyObject* strfry_soundex(PyObject *self, PyObject *args)
static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
{
PyObject *pystr;
PyObject *normalized;
@ -181,7 +181,7 @@ static PyObject* strfry_soundex(PyObject *self, PyObject *args)
return ret;
}
static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
{
PyObject *pystr;
PyObject *normalized;
@ -212,7 +212,7 @@ static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
return ret;
}
static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args)
static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
{
const char *str;
char *result;
@ -234,7 +234,8 @@ static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args)
return ret;
}
static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args)
static PyObject* jellyfish_match_rating_comparison(PyObject *self,
PyObject *args)
{
const char *str1, *str2;
int result;
@ -256,7 +257,7 @@ static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args)
}
}
static PyObject* strfry_nysiis(PyObject *self, PyObject *args)
static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
{
const char *str;
char *result;
@ -278,46 +279,46 @@ static PyObject* strfry_nysiis(PyObject *self, PyObject *args)
return ret;
}
static PyMethodDef strfry_methods[] = {
{"jaro_winkler", strfry_jaro_winkler, METH_VARARGS,
static PyMethodDef jellyfish_methods[] = {
{"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS,
"jaro_winkler(string1, string2, ignore_case=True)\n\n"
"Do a Jaro-Winkler string comparison between string1 and string2."},
{"jaro_distance", strfry_jaro_distance, METH_VARARGS,
{"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
"jaro_distance(string1, string2, ignore_case=True)\n\n"
"Get a Jaro string distance metric for string1 and string2."},
{"hamming_distance", strfry_hamming_distance, METH_VARARGS,
{"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
"hamming_distance(string1, string2, ignore_case=True)\n\n"
"Compute the Hamming distance between string1 and string2."},
{"levenshtein_distance", strfry_levenshtein_distance, METH_VARARGS,
{"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
"levenshtein_distance(string1, string2)\n\n"
"Compute the Levenshtein distance between string1 and string2."},
{"damerau_levenshtein_distance", strfry_damerau_levenshtein_distance,
{"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
METH_VARARGS,
"damerau_levenshtein_distance(string1, string2)\n\n"
"Compute the Damerau-Levenshtein distance between string1 and string2."},
{"soundex", strfry_soundex, METH_VARARGS,
{"soundex", jellyfish_soundex, METH_VARARGS,
"soundex(string)\n\n"
"Calculate the soundex code for a given name."},
{"metaphone", strfry_metaphone, METH_VARARGS,
{"metaphone", jellyfish_metaphone, METH_VARARGS,
"metaphone(string)\n\n"
"Calculate the metaphone representation of a given string."},
{"match_rating_codex", strfry_match_rating_codex, METH_VARARGS,
{"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
"match_rating_codex(string)\n\n"
"Calculate the Match Rating Approach representation of a given string."},
{"match_rating_comparison", strfry_match_rating_comparison, METH_VARARGS,
{"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
"match_rating_comparison(string)\n\n"
"Compute the Match Rating Approach similarity between string1 and"
"string2."},
{"nysiis", strfry_nysiis, METH_VARARGS,
{"nysiis", jellyfish_nysiis, METH_VARARGS,
"nysiis(string)\n\n"
"Compute the NYSIIS (New York State Identification and Intelligence\n"
"System) code for a string."},
@ -332,20 +333,20 @@ static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"strfry",
NULL,
sizeof(struct strfry_state),
strfry_methods,
sizeof(struct jellyfish_state),
jellyfish_methods,
NULL,
NULL,
NULL,
NULL
};
PyObject* PyInit_strfry(void)
PyObject* PyInit_jellyfish(void)
#else
#define INITERROR return
PyMODINIT_FUNC initstrfry(void)
PyMODINIT_FUNC initjellyfish(void)
#endif
{
PyObject *unicodedata;
@ -353,7 +354,7 @@ PyMODINIT_FUNC initstrfry(void)
#if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef);
#else
PyObject *module = Py_InitModule("strfry", strfry_methods);
PyObject *module = Py_InitModule("jellyfish", jellyfish_methods);
#endif
if (module == NULL) {

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

2
mra.c
View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <string.h>
#include <ctype.h>

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
from setuptools import setup, Extension
setup(name="strfry",
setup(name="jellyfish",
platforms=["any"],
classifiers=["Development Status :: 4 - Beta",
"Intended Audience :: Developers",
@ -10,7 +10,7 @@ setup(name="strfry",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Topic :: Text Processing :: Linguistic"],
ext_modules=[Extension("strfry", ['strfrymodule.c', 'jaro.c',
ext_modules=[Extension("jellyfish", ['jellyfishmodule.c', 'jaro.c',
'hamming.c', 'levenshtein.c',
'damerau_levenshtein.c', 'mra.c',
'soundex.c', 'metaphone.c',

View File

@ -1,4 +1,4 @@
#include "strfry.h"
#include "jellyfish.h"
#include <ctype.h>
#include <stdlib.h>

24
test.py
View File

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
import unittest
import strfry
import jellyfish
class StrfryTestCase(unittest.TestCase):
class JellyfishTestCase(unittest.TestCase):
def test_jaro_winkler(self):
cases = [("dixon", "dicksonx", 0.8133),
@ -13,7 +13,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertAlmostEqual(strfry.jaro_winkler(s1, s2), value,
self.assertAlmostEqual(jellyfish.jaro_winkler(s1, s2), value,
places=4)
@ -25,7 +25,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertAlmostEqual(strfry.jaro_distance(s1, s2), value,
self.assertAlmostEqual(jellyfish.jaro_distance(s1, s2), value,
places=3)
def test_hamming_distance(self):
@ -39,7 +39,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertEqual(strfry.hamming_distance(s1, s2), value)
self.assertEqual(jellyfish.hamming_distance(s1, s2), value)
def test_levenshtein_distance(self):
cases = [("", "", 0),
@ -50,7 +50,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertEqual(strfry.levenshtein_distance(s1, s2), value)
self.assertEqual(jellyfish.levenshtein_distance(s1, s2), value)
def test_damerau_levenshtein_distance(self):
cases = [("", "", 0),
@ -60,7 +60,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertEqual(strfry.damerau_levenshtein_distance(s1, s2),
self.assertEqual(jellyfish.damerau_levenshtein_distance(s1, s2),
value)
def test_soundex(self):
@ -76,7 +76,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, code) in cases:
self.assertEqual(strfry.soundex(s1), code)
self.assertEqual(jellyfish.soundex(s1), code)
def test_metaphone(self):
cases = [("metaphone", 'MTFN'),
@ -91,7 +91,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, code) in cases:
self.assertEqual(strfry.metaphone(s1), code)
self.assertEqual(jellyfish.metaphone(s1), code)
def test_nysiis(self):
cases = [("Worthy", "WARTY"),
@ -102,7 +102,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2) in cases:
self.assertEqual(strfry.nysiis(s1), s2)
self.assertEqual(jellyfish.nysiis(s1), s2)
def test_match_rating_codex(self):
cases = [("Byrne", "BYRN"),
@ -114,7 +114,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2) in cases:
self.assertEqual(strfry.match_rating_codex(s1), s2)
self.assertEqual(jellyfish.match_rating_codex(s1), s2)
def test_match_rating_comparison(self):
cases = [("Bryne", "Boern", True),
@ -124,7 +124,7 @@ class StrfryTestCase(unittest.TestCase):
]
for (s1, s2, value) in cases:
self.assertEqual(strfry.match_rating_comparison(s1, s2), value)
self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value)
if __name__ == '__main__':
unittest.main()