From ed4c1f0834c9df070239e3644dbd52f441e7e953 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Fri, 6 Jul 2018 17:30:26 +0200 Subject: [PATCH] Adding fog.key.omission --- fog/clustering/pairwise.py | 2 +- fog/key/__init__.py | 1 + fog/key/omission.py | 73 ++++++++++++++++++++++++++++++++++++++ test/key/omission_test.py | 34 ++++++++++++++++++ test/key/rusalka_test.py | 1 - 5 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 fog/key/omission.py create mode 100644 test/key/omission_test.py diff --git a/fog/clustering/pairwise.py b/fog/clustering/pairwise.py index 291bca0..c04413d 100644 --- a/fog/clustering/pairwise.py +++ b/fog/clustering/pairwise.py @@ -246,7 +246,7 @@ def pairwise_fuzzy_clusters(data, similarity=None, distance=None, radius=None, # Pool with Pool(processes=processes) as pool: - for matches in pool.imap_unordered(pairwise_fuzzy_clusters_worker, pool_iter): + for matches in pool.imap(pairwise_fuzzy_clusters_worker, pool_iter): for i, j in matches: graph[i].append(j) graph[j].append(i) diff --git a/fog/key/__init__.py b/fog/key/__init__.py index e8debe4..6fe140f 100644 --- a/fog/key/__init__.py +++ b/fog/key/__init__.py @@ -4,4 +4,5 @@ from fog.key.fingerprint import ( fingerprint, ngrams_fingerprint ) +from fog.key.omission import omission_key from fog.key.rusalka import rusalka diff --git a/fog/key/omission.py b/fog/key/omission.py new file mode 100644 index 0000000..952f370 --- /dev/null +++ b/fog/key/omission.py @@ -0,0 +1,73 @@ +# ============================================================================= +# Fog Omission Key +# ============================================================================= +# +# The omission key by Pollock and Zamora. +# +# [Urls]: +# http://dl.acm.org/citation.cfm?id=358048 +# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.385&rep=rep1&type=pdf +# +# [Reference]: +# Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction +# in Scientific and Scholarly Text." 
Communications of the ACM, 27(4).
+# 358--368.
+import re
+from unidecode import unidecode
+
+UNDESIRABLES_RE = re.compile(r'[^A-Z]')
+CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'
+VOWELS = set('AEIOU')
+
+# TODO: omission/skeleton key clustering for distance = 1
+
+
+def omission_key(string):
+    """
+    Function returning a string's omission key, built in two steps:
+      1) The string's distinct consonants come first, following the fixed
+         CONSONANTS order where most frequently misspelled consonants
+         are last.
+      2) The string's distinct vowels follow, in order of first appearance.
+
+    Sorting by this key puts similar strings next to each other, which is
+    very useful when searching for misspelled strings.
+
+    Args:
+        string (str): The string to encode.
+
+    Returns:
+        str: The string's omission key.
+
+    """
+
+    # Deburring, then normalizing case, then dropping every character
+    # that is not an A-Z letter
+    cleaned = UNDESIRABLES_RE.sub('', unidecode(string).upper())
+
+    if not cleaned:
+        return ''
+
+    # Composing the key
+    first_seen_vowels = []
+    leftovers = set()
+
+    # A character either opens a new vowel entry or joins the leftovers.
+    # Repeated vowels join them too, which is harmless since the set is
+    # only ever probed with consonants below.
+    for char in cleaned:
+        is_new_vowel = char in VOWELS and char not in first_seen_vowels
+
+        if is_new_vowel:
+            first_seen_vowels.append(char)
+        else:
+            leftovers.add(char)
+
+    # Keeping the consonants that were seen, in CONSONANTS order
+    ordered_consonants = [
+        consonant
+        for consonant in CONSONANTS
+        if consonant in leftovers
+    ]
+
+    return ''.join(ordered_consonants + first_seen_vowels)
diff --git a/test/key/omission_test.py b/test/key/omission_test.py
new file mode 100644
index 0000000..2fa5d1f
--- /dev/null
+++ b/test/key/omission_test.py
@@ -0,0 +1,34 @@
+# =============================================================================
+# Fog Omission Key Unit Tests
+# =============================================================================
+from fog.key import omission_key
+
+TESTS = [
+    ('', ''),
+    ('hello', 'HLEO'),
+    ('The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'),
+    ('Christopher', 'PHCTSRIOE'),
+    ('Niall', 'LNIA'),
+
('caramel', 'MCLRAE'),
+    ('Carlson', 'CLNSRAO'),
+    ('Karlsson', 'KLNSRAO'),
+    ('microeletronics', 'MCLNTSRIOE'),
+    ('Circumstantial', 'MCLNTSRIUA'),
+    ('LUMINESCENT', 'MCLNTSUIE'),
+    ('multinucleate', 'MCLNTUIEA'),
+    ('multinucleon', 'MCLNTUIEO'),
+    ('cumulene', 'MCLNUE'),
+    ('luminance', 'MCLNUIAE'),
+    ('cœlomic', 'MCLOEI'),
+    ('Molecule', 'MCLOEU'),
+    ('Cameral', 'MCLRAE'),
+    ('Maceral', 'MCLRAE'),
+    ('Lacrimal', 'MCLRAI')
+]
+
+
+class TestOmissionKey(object):
+    def test_basics(self):
+        # Each fixture pairs a raw string with its expected omission key
+        for raw, expected in TESTS:
+            assert omission_key(raw) == expected, '%s => %s' % (raw, expected)
diff --git a/test/key/rusalka_test.py b/test/key/rusalka_test.py
index 2b95215..d6cd047 100644
--- a/test/key/rusalka_test.py
+++ b/test/key/rusalka_test.py
@@ -1,7 +1,6 @@
 # =============================================================================
 # Fog Rusalka Unit Tests
 # =============================================================================
-from pytest import approx
 from fog.key import rusalka
 
 TESTS = [