mirror of https://github.com/Yomguithereal/fog.git
Adding fog.key.omission
This commit is contained in:
parent
65881f081d
commit
ed4c1f0834
|
@ -246,7 +246,7 @@ def pairwise_fuzzy_clusters(data, similarity=None, distance=None, radius=None,
|
|||
|
||||
# Pool
|
||||
with Pool(processes=processes) as pool:
|
||||
for matches in pool.imap_unordered(pairwise_fuzzy_clusters_worker, pool_iter):
|
||||
for matches in pool.imap(pairwise_fuzzy_clusters_worker, pool_iter):
|
||||
for i, j in matches:
|
||||
graph[i].append(j)
|
||||
graph[j].append(i)
|
||||
|
|
|
@ -4,4 +4,5 @@ from fog.key.fingerprint import (
|
|||
fingerprint,
|
||||
ngrams_fingerprint
|
||||
)
|
||||
from fog.key.omission import omission_key
|
||||
from fog.key.rusalka import rusalka
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
# =============================================================================
|
||||
# Fog Omission Key
|
||||
# =============================================================================
|
||||
#
|
||||
# The omission key by Pollock and Zamora.
|
||||
#
|
||||
# [Urls]:
|
||||
# http://dl.acm.org/citation.cfm?id=358048
|
||||
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.385&rep=rep1&type=pdf
|
||||
#
|
||||
# [Reference]:
|
||||
# Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
|
||||
# in Scientific and Scholarly Text." Communications of the ACM, 27(4).
|
||||
# 358--368.
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
|
||||
# Matches any single character that is not an uppercase ASCII letter (A-Z).
# omission_key uses it to strip such characters after upper-casing its input.
UNDESIRABLES_RE = re.compile(r'[^A-Z]')
|
||||
# Consonants in the exact order omission_key emits them in the key. The
# function's docstring describes this order as placing the most frequently
# misspelled consonants last.
CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'
|
||||
# Letters treated as vowels; omission_key appends them after the consonants,
# in order of first appearance in the input.
VOWELS = set('AEIOU')
|
||||
|
||||
# TODO: omission/skeleton key clustering for distance = 1
|
||||
|
||||
|
||||
def omission_key(string):
    """
    Compute the omission key of the given string.

    The key is the concatenation of two parts:
        1) The distinct consonants of the string, laid out in the fixed
           order given by CONSONANTS (an order in which the most frequently
           misspelled consonants come last).
        2) The distinct vowels of the string, in order of first appearance.

    Sorting strings by this key tends to place misspelled variants of the
    same word next to each other, which makes it handy for fuzzy lookups.

    Args:
        string (str): The string to encode.

    Returns:
        string: The string's omission key ('' when no letter A-Z remains
            after normalization).

    """

    # Deburr, normalize case, then drop anything that is not a letter A-Z
    cleaned = UNDESIRABLES_RE.sub('', unidecode(string).upper())

    if not cleaned:
        return ''

    seen_vowels = []
    candidates = set()

    # First pass: vowels are kept in order of first appearance, every other
    # occurrence (consonants and repeated vowels alike) lands in the
    # candidate set that is filtered against CONSONANTS afterwards.
    for letter in cleaned:
        if letter not in VOWELS or letter in seen_vowels:
            candidates.add(letter)
            continue

        seen_vowels.append(letter)

    # Second pass: emit the consonants present, following CONSONANTS' order
    ordered_consonants = [
        consonant for consonant in CONSONANTS if consonant in candidates
    ]

    return ''.join(ordered_consonants + seen_vowels)
|
|
@ -0,0 +1,34 @@
|
|||
# =============================================================================
|
||||
# Fog Omission Key Unit Tests
|
||||
# =============================================================================
|
||||
from fog.key import omission_key
|
||||
|
||||
# Fixtures for test_basics: each entry is (input string, expected omission key).
TESTS = [
|
||||
# Empty input is expected to yield an empty key.
('', ''),
|
||||
('hello', 'HLEO'),
|
||||
('The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'),
|
||||
('Christopher', 'PHCTSRIOE'),
|
||||
('Niall', 'LNIA'),
|
||||
('caramel', 'MCLRAE'),
|
||||
('Carlson', 'CLNSRAO'),
|
||||
('Karlsson', 'KLNSRAO'),
|
||||
('microeletronics', 'MCLNTSRIOE'),
|
||||
('Circumstantial', 'MCLNTSRIUA'),
|
||||
('LUMINESCENT', 'MCLNTSUIE'),
|
||||
('multinucleate', 'MCLNTUIEA'),
|
||||
('multinucleon', 'MCLNTUIEO'),
|
||||
('cumulene', 'MCLNUE'),
|
||||
('luminance', 'MCLNUIAE'),
|
||||
# Non-ASCII input: the expected key shows 'œ' contributing both O and E.
('cœlomic', 'MCLOEI'),
|
||||
('Molecule', 'MCLOEU'),
|
||||
# The next two inputs are anagrams and are expected to share one key.
('Cameral', 'MCLRAE'),
|
||||
('Maceral', 'MCLRAE'),
|
||||
('Lacrimal', 'MCLRAI')
|
||||
]
|
||||
|
||||
|
||||
class TestOmissionKey(object):
    """Unit tests for fog.key.omission_key."""

    def test_basics(self):
        # Every fixture pairs an input with the exact key it must produce.
        for raw, expected in TESTS:
            produced = omission_key(raw)

            assert produced == expected, '%s => %s' % (raw, expected)
|
|
@ -1,7 +1,6 @@
|
|||
# =============================================================================
|
||||
# Fog Rusalka Unit Tests
|
||||
# =============================================================================
|
||||
from pytest import approx
|
||||
from fog.key import rusalka
|
||||
|
||||
TESTS = [
|
||||
|
|
Loading…
Reference in New Issue