jellyfish
This commit is contained in:
parent
af4c76e821
commit
d15631cebb
|
@ -0,0 +1,52 @@
|
|||
=========
|
||||
jellyfish
|
||||
=========
|
||||
|
||||
Jellyfish is a Python library for doing approximate and phonetic matching of strings.
|
||||
|
||||
jellyfish is a project of Sunlight Labs (c) 2010.
|
||||
All code is released under a BSD-style license; see LICENSE for details.
|
||||
|
||||
Written by Michael Stephens <mstephens@sunlightfoundation.com> and James Turk
|
||||
<jturk@sunlightfoundation.com>.
|
||||
|
||||
Source is available at http://github.com/sunlightlabs/jellyfish.
|
||||
|
||||
Included Algorithms
|
||||
===================
|
||||
|
||||
String comparison:
|
||||
|
||||
* Levenshtein Distance
|
||||
* Damerau-Levenshtein Distance
|
||||
* Jaro Distance
|
||||
* Jaro-Winkler Distance
|
||||
* Match Rating Approach Comparison
|
||||
* Hamming Distance
|
||||
|
||||
Phonetic encoding:
|
||||
|
||||
* American Soundex
|
||||
* Metaphone
|
||||
* NYSIIS (New York State Identification and Intelligence System)
|
||||
* Match Rating Codex
|
||||
|
||||
Example Usage
|
||||
=============
|
||||
|
||||
>>> import jellyfish
|
||||
>>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
|
||||
2
|
||||
>>> jellyfish.jaro_distance('jellyfish', 'smellyfish')
|
||||
0.89629629629629637
|
||||
>>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
|
||||
1
|
||||
|
||||
>>> jellyfish.metaphone('Jellyfish')
|
||||
'JLFX'
|
||||
>>> jellyfish.soundex('Jellyfish')
|
||||
'J412'
|
||||
>>> jellyfish.nysiis('Jellyfish')
|
||||
'JALYF'
|
||||
>>> jellyfish.match_rating_codex('Jellyfish')
|
||||
'JLLFSH'
|
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <string.h>
|
||||
|
||||
int damerau_levenshtein_distance(const char *s1, const char *s2)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <ctype.h>
|
||||
|
||||
size_t hamming_distance(const char *s1, const char *s2) {
|
||||
|
|
2
jaro.c
2
jaro.c
|
@ -2,7 +2,7 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
|
||||
#define NOTNUM(c) ((c>57) || (c<48))
|
||||
#define INRANGE(c) ((c>0) && (c<91))
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#ifndef _STRFRY_H_
|
||||
#define _STRFRY_H_
|
||||
#ifndef _JELLYFISH_H_
|
||||
#define _JELLYFISH_H_
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
|
@ -1,16 +1,16 @@
|
|||
#include <Python.h>
|
||||
#include <math.h>
|
||||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
|
||||
struct strfry_state {
|
||||
struct jellyfish_state {
|
||||
PyObject *unicodedata_normalize;
|
||||
};
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define GETSTATE(m) ((struct strfry_state*)PyModule_GetState(m))
|
||||
#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
|
||||
#else
|
||||
#define GETSTATE(m) (&_state)
|
||||
static struct strfry_state _state;
|
||||
static struct jellyfish_state _state;
|
||||
#endif
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
|
@ -58,7 +58,7 @@ static inline PyObject* normalize(PyObject *mod, PyObject *pystr) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
|
||||
static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args,
|
||||
PyObject *keywds)
|
||||
{
|
||||
const char *s1, *s2;
|
||||
|
@ -77,7 +77,7 @@ static PyObject * strfry_jaro_winkler(PyObject *self, PyObject *args,
|
|||
return Py_BuildValue("d", result);
|
||||
}
|
||||
|
||||
static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
|
||||
static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args,
|
||||
PyObject *keywds)
|
||||
{
|
||||
const char *s1, *s2;
|
||||
|
@ -96,7 +96,7 @@ static PyObject * strfry_jaro_distance(PyObject *self, PyObject *args,
|
|||
return Py_BuildValue("d", result);
|
||||
}
|
||||
|
||||
static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
|
||||
static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args,
|
||||
PyObject *keywds)
|
||||
{
|
||||
const char *s1, *s2;
|
||||
|
@ -111,7 +111,7 @@ static PyObject * strfry_hamming_distance(PyObject *self, PyObject *args,
|
|||
return Py_BuildValue("I", result);
|
||||
}
|
||||
|
||||
static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
|
||||
{
|
||||
const char *s1, *s2;
|
||||
int result;
|
||||
|
@ -131,7 +131,7 @@ static PyObject* strfry_levenshtein_distance(PyObject *self, PyObject *args)
|
|||
return Py_BuildValue("i", result);
|
||||
}
|
||||
|
||||
static PyObject* strfry_damerau_levenshtein_distance(PyObject *self,
|
||||
static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *s1, *s2;
|
||||
|
@ -150,7 +150,7 @@ static PyObject* strfry_damerau_levenshtein_distance(PyObject *self,
|
|||
return Py_BuildValue("i", result);
|
||||
}
|
||||
|
||||
static PyObject* strfry_soundex(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *pystr;
|
||||
PyObject *normalized;
|
||||
|
@ -181,7 +181,7 @@ static PyObject* strfry_soundex(PyObject *self, PyObject *args)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *pystr;
|
||||
PyObject *normalized;
|
||||
|
@ -212,7 +212,7 @@ static PyObject* strfry_metaphone(PyObject *self, PyObject *args)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
|
||||
{
|
||||
const char *str;
|
||||
char *result;
|
||||
|
@ -234,7 +234,8 @@ static PyObject* strfry_match_rating_codex(PyObject *self, PyObject *args)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_match_rating_comparison(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *str1, *str2;
|
||||
int result;
|
||||
|
@ -256,7 +257,7 @@ static PyObject* strfry_match_rating_comparison(PyObject *self, PyObject *args)
|
|||
}
|
||||
}
|
||||
|
||||
static PyObject* strfry_nysiis(PyObject *self, PyObject *args)
|
||||
static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
|
||||
{
|
||||
const char *str;
|
||||
char *result;
|
||||
|
@ -278,46 +279,46 @@ static PyObject* strfry_nysiis(PyObject *self, PyObject *args)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static PyMethodDef strfry_methods[] = {
|
||||
{"jaro_winkler", strfry_jaro_winkler, METH_VARARGS,
|
||||
static PyMethodDef jellyfish_methods[] = {
|
||||
{"jaro_winkler", jellyfish_jaro_winkler, METH_VARARGS,
|
||||
"jaro_winkler(string1, string2, ignore_case=True)\n\n"
|
||||
"Do a Jaro-Winkler string comparison between string1 and string2."},
|
||||
|
||||
{"jaro_distance", strfry_jaro_distance, METH_VARARGS,
|
||||
{"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
|
||||
"jaro_distance(string1, string2, ignore_case=True)\n\n"
|
||||
"Get a Jaro string distance metric for string1 and string2."},
|
||||
|
||||
{"hamming_distance", strfry_hamming_distance, METH_VARARGS,
|
||||
{"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
|
||||
"hamming_distance(string1, string2, ignore_case=True)\n\n"
|
||||
"Compute the Hamming distance between string1 and string2."},
|
||||
|
||||
{"levenshtein_distance", strfry_levenshtein_distance, METH_VARARGS,
|
||||
{"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
|
||||
"levenshtein_distance(string1, string2)\n\n"
|
||||
"Compute the Levenshtein distance between string1 and string2."},
|
||||
|
||||
{"damerau_levenshtein_distance", strfry_damerau_levenshtein_distance,
|
||||
{"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
|
||||
METH_VARARGS,
|
||||
"damerau_levenshtein_distance(string1, string2)\n\n"
|
||||
"Compute the Damerau-Levenshtein distance between string1 and string2."},
|
||||
|
||||
{"soundex", strfry_soundex, METH_VARARGS,
|
||||
{"soundex", jellyfish_soundex, METH_VARARGS,
|
||||
"soundex(string)\n\n"
|
||||
"Calculate the soundex code for a given name."},
|
||||
|
||||
{"metaphone", strfry_metaphone, METH_VARARGS,
|
||||
{"metaphone", jellyfish_metaphone, METH_VARARGS,
|
||||
"metaphone(string)\n\n"
|
||||
"Calculate the metaphone representation of a given string."},
|
||||
|
||||
{"match_rating_codex", strfry_match_rating_codex, METH_VARARGS,
|
||||
{"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
|
||||
"match_rating_codex(string)\n\n"
|
||||
"Calculate the Match Rating Approach representation of a given string."},
|
||||
|
||||
{"match_rating_comparison", strfry_match_rating_comparison, METH_VARARGS,
|
||||
{"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
|
||||
"match_rating_comparison(string)\n\n"
|
||||
"Compute the Match Rating Approach similarity between string1 and"
|
||||
"string2."},
|
||||
|
||||
{"nysiis", strfry_nysiis, METH_VARARGS,
|
||||
{"nysiis", jellyfish_nysiis, METH_VARARGS,
|
||||
"nysiis(string)\n\n"
|
||||
"Compute the NYSIIS (New York State Identification and Intelligence\n"
|
||||
"System) code for a string."},
|
||||
|
@ -332,20 +333,20 @@ static struct PyModuleDef moduledef = {
|
|||
PyModuleDef_HEAD_INIT,
|
||||
"strfry",
|
||||
NULL,
|
||||
sizeof(struct strfry_state),
|
||||
strfry_methods,
|
||||
sizeof(struct jellyfish_state),
|
||||
jellyfish_methods,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
PyObject* PyInit_strfry(void)
|
||||
PyObject* PyInit_jellyfish(void)
|
||||
#else
|
||||
|
||||
#define INITERROR return
|
||||
|
||||
PyMODINIT_FUNC initstrfry(void)
|
||||
PyMODINIT_FUNC initjellyfish(void)
|
||||
#endif
|
||||
{
|
||||
PyObject *unicodedata;
|
||||
|
@ -353,7 +354,7 @@ PyMODINIT_FUNC initstrfry(void)
|
|||
#if PY_MAJOR_VERSION >= 3
|
||||
PyObject *module = PyModule_Create(&moduledef);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("strfry", strfry_methods);
|
||||
PyObject *module = Py_InitModule("jellyfish", jellyfish_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL) {
|
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
|
2
mra.c
2
mra.c
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
|
|
2
nysiis.c
2
nysiis.c
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
|
4
setup.py
4
setup.py
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
from setuptools import setup, Extension
|
||||
|
||||
setup(name="strfry",
|
||||
setup(name="jellyfish",
|
||||
platforms=["any"],
|
||||
classifiers=["Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
|
@ -10,7 +10,7 @@ setup(name="strfry",
|
|||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
"Topic :: Text Processing :: Linguistic"],
|
||||
ext_modules=[Extension("strfry", ['strfrymodule.c', 'jaro.c',
|
||||
ext_modules=[Extension("jellyfish", ['jellyfishmodule.c', 'jaro.c',
|
||||
'hamming.c', 'levenshtein.c',
|
||||
'damerau_levenshtein.c', 'mra.c',
|
||||
'soundex.c', 'metaphone.c',
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include "strfry.h"
|
||||
#include "jellyfish.h"
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
|
24
test.py
24
test.py
|
@ -1,9 +1,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import unittest
|
||||
import strfry
|
||||
import jellyfish
|
||||
|
||||
|
||||
class StrfryTestCase(unittest.TestCase):
|
||||
class JellyfishTestCase(unittest.TestCase):
|
||||
|
||||
def test_jaro_winkler(self):
|
||||
cases = [("dixon", "dicksonx", 0.8133),
|
||||
|
@ -13,7 +13,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertAlmostEqual(strfry.jaro_winkler(s1, s2), value,
|
||||
self.assertAlmostEqual(jellyfish.jaro_winkler(s1, s2), value,
|
||||
places=4)
|
||||
|
||||
|
||||
|
@ -25,7 +25,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertAlmostEqual(strfry.jaro_distance(s1, s2), value,
|
||||
self.assertAlmostEqual(jellyfish.jaro_distance(s1, s2), value,
|
||||
places=3)
|
||||
|
||||
def test_hamming_distance(self):
|
||||
|
@ -39,7 +39,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertEqual(strfry.hamming_distance(s1, s2), value)
|
||||
self.assertEqual(jellyfish.hamming_distance(s1, s2), value)
|
||||
|
||||
def test_levenshtein_distance(self):
|
||||
cases = [("", "", 0),
|
||||
|
@ -50,7 +50,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertEqual(strfry.levenshtein_distance(s1, s2), value)
|
||||
self.assertEqual(jellyfish.levenshtein_distance(s1, s2), value)
|
||||
|
||||
def test_damerau_levenshtein_distance(self):
|
||||
cases = [("", "", 0),
|
||||
|
@ -60,7 +60,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertEqual(strfry.damerau_levenshtein_distance(s1, s2),
|
||||
self.assertEqual(jellyfish.damerau_levenshtein_distance(s1, s2),
|
||||
value)
|
||||
|
||||
def test_soundex(self):
|
||||
|
@ -76,7 +76,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, code) in cases:
|
||||
self.assertEqual(strfry.soundex(s1), code)
|
||||
self.assertEqual(jellyfish.soundex(s1), code)
|
||||
|
||||
def test_metaphone(self):
|
||||
cases = [("metaphone", 'MTFN'),
|
||||
|
@ -91,7 +91,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, code) in cases:
|
||||
self.assertEqual(strfry.metaphone(s1), code)
|
||||
self.assertEqual(jellyfish.metaphone(s1), code)
|
||||
|
||||
def test_nysiis(self):
|
||||
cases = [("Worthy", "WARTY"),
|
||||
|
@ -102,7 +102,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2) in cases:
|
||||
self.assertEqual(strfry.nysiis(s1), s2)
|
||||
self.assertEqual(jellyfish.nysiis(s1), s2)
|
||||
|
||||
def test_match_rating_codex(self):
|
||||
cases = [("Byrne", "BYRN"),
|
||||
|
@ -114,7 +114,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2) in cases:
|
||||
self.assertEqual(strfry.match_rating_codex(s1), s2)
|
||||
self.assertEqual(jellyfish.match_rating_codex(s1), s2)
|
||||
|
||||
def test_match_rating_comparison(self):
|
||||
cases = [("Bryne", "Boern", True),
|
||||
|
@ -124,7 +124,7 @@ class StrfryTestCase(unittest.TestCase):
|
|||
]
|
||||
|
||||
for (s1, s2, value) in cases:
|
||||
self.assertEqual(strfry.match_rating_comparison(s1, s2), value)
|
||||
self.assertEqual(jellyfish.match_rating_comparison(s1, s2), value)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
Loading…
Reference in New Issue