Adding fog.key.omission

This commit is contained in:
Yomguithereal 2018-07-06 17:30:26 +02:00
parent 65881f081d
commit ed4c1f0834
5 changed files with 109 additions and 2 deletions

View File

@ -246,7 +246,7 @@ def pairwise_fuzzy_clusters(data, similarity=None, distance=None, radius=None,
# Pool
with Pool(processes=processes) as pool:
for matches in pool.imap_unordered(pairwise_fuzzy_clusters_worker, pool_iter):
for matches in pool.imap(pairwise_fuzzy_clusters_worker, pool_iter):
for i, j in matches:
graph[i].append(j)
graph[j].append(i)

View File

@ -4,4 +4,5 @@ from fog.key.fingerprint import (
fingerprint,
ngrams_fingerprint
)
from fog.key.omission import omission_key
from fog.key.rusalka import rusalka

73
fog/key/omission.py Normal file
View File

@ -0,0 +1,73 @@
# =============================================================================
# Fog Omission Key
# =============================================================================
#
# The omission key by Pollock and Zamora.
#
# [Urls]:
# http://dl.acm.org/citation.cfm?id=358048
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.385&rep=rep1&type=pdf
#
# [Reference]:
# Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
# in Scientific and Scholarly Text." Communications of the ACM, 27(4).
# 358--368.
import re
from unidecode import unidecode
# Matches every character that is not an uppercase ASCII letter (A-Z).
UNDESIRABLES_RE = re.compile(r'[^A-Z]')

# Consonants, in the exact order they are emitted at the start of the key.
CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'

# Letters treated as vowels; note that Y is listed with the consonants above.
VOWELS = {'A', 'E', 'I', 'O', 'U'}
# TODO: omission/skeleton key clustering for distance = 1
def omission_key(string):
    """
    Compute the omission key of the given string.

    The key is the concatenation of two parts:
        1) The distinct consonants found in the string, laid out in the
           fixed order of CONSONANTS, i.e. an order where the most
           frequently misspelled consonants come last.
        2) The distinct vowels found in the string, in order of first
           appearance.

    This key is useful when searching for misspelled strings: when sorted
    by this key, similar strings end up next to each other.

    Args:
        string (str): The string to encode.

    Returns:
        str: The string's omission key. An empty string is returned when
            no A-Z letter remains after normalization.

    """

    # Deburr, uppercase, then keep only the A-Z letters
    normalized = UNDESIRABLES_RE.sub('', unidecode(string).upper())

    if not normalized:
        return ''

    # Vowels keep the order in which they first show up
    vowel_part = []

    for letter in normalized:
        if letter in VOWELS and letter not in vowel_part:
            vowel_part.append(letter)

    # Consonants follow the fixed CONSONANTS ordering, each one at most once
    present = set(normalized)
    consonant_part = [letter for letter in CONSONANTS if letter in present]

    return ''.join(consonant_part + vowel_part)

34
test/key/omission_test.py Normal file
View File

@ -0,0 +1,34 @@
# =============================================================================
# Fog Omission Key Unit Tests
# =============================================================================
from fog.key import omission_key
# Fixtures for `omission_key`: each item is an (input string, expected key)
# pair. The expected keys are uppercase: consonants first, then vowels.
TESTS = [
# Empty input yields an empty key
('', ''),
('hello', 'HLEO'),
# Whole sentence: spaces and punctuation do not appear in the key
('The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'),
('Christopher', 'PHCTSRIOE'),
('Niall', 'LNIA'),
('caramel', 'MCLRAE'),
('Carlson', 'CLNSRAO'),
('Karlsson', 'KLNSRAO'),
('microeletronics', 'MCLNTSRIOE'),
('Circumstantial', 'MCLNTSRIUA'),
('LUMINESCENT', 'MCLNTSUIE'),
('multinucleate', 'MCLNTUIEA'),
('multinucleon', 'MCLNTUIEO'),
('cumulene', 'MCLNUE'),
('luminance', 'MCLNUIAE'),
# Non-ASCII input: the expected key implies 'œ' is transliterated to 'OE'
('cœlomic', 'MCLOEI'),
('Molecule', 'MCLOEU'),
# Different spellings sharing the same letters share the same key
('Cameral', 'MCLRAE'),
('Maceral', 'MCLRAE'),
('Lacrimal', 'MCLRAI')
]
class TestOmissionKey(object):
    """Unit tests for the `omission_key` function."""

    def test_basics(self):
        """Check `omission_key` against every (input, expected) pair of TESTS."""
        for source, expected in TESTS:
            computed = omission_key(source)

            # On failure, the message reports the input and its expected key
            assert computed == expected, '%s => %s' % (source, expected)

View File

@ -1,7 +1,6 @@
# =============================================================================
# Fog Rusalka Unit Tests
# =============================================================================
from pytest import approx
from fog.key import rusalka
TESTS = [