Adding fog.key.omission

2018-07-06 17:30:26 +02:00 · 2018-07-06 17:30:26 +02:00 · ed4c1f0834
parent 65881f081d
commit ed4c1f0834
5 changed files with 109 additions and 2 deletions
--- a/fog/clustering/pairwise.py
+++ b/fog/clustering/pairwise.py
@ -246,7 +246,7 @@ def pairwise_fuzzy_clusters(data, similarity=None, distance=None, radius=None,

        # Pool
        with Pool(processes=processes) as pool:
-            for matches in pool.imap_unordered(pairwise_fuzzy_clusters_worker, pool_iter):
+            for matches in pool.imap(pairwise_fuzzy_clusters_worker, pool_iter):
                for i, j in matches:
                    graph[i].append(j)
                    graph[j].append(i)
--- a/fog/key/init.py
+++ b/fog/key/init.py
@ -4,4 +4,5 @@ from fog.key.fingerprint import (
    fingerprint,
    ngrams_fingerprint
 )
+from fog.key.omission import omission_key
 from fog.key.rusalka import rusalka
--- a/fog/key/omission.py
+++ b/fog/key/omission.py
@ -0,0 +1,73 @@
+# =============================================================================
+# Fog Omission Key
+# =============================================================================
+#
+# The omission key by Pollock and Zamora.
+#
+# [Urls]:
+# http://dl.acm.org/citation.cfm?id=358048
+# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.385&rep=rep1&type=pdf
+#
+# [Reference]:
+# Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
+# in Scientific and Scholarly Text." Communications of the ACM, 27(4).
+# 358--368.
+import re
+from unidecode import unidecode
+
+UNDESIRABLES_RE = re.compile(r'[^A-Z]')
+CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'
+VOWELS = set('AEIOU')
+
+# TODO: omission/skeleton key clustering for distance = 1
+
+
+def omission_key(string):
+    """
+    Function returning a string's omission key which is constructed thusly:
+        1) First we record the string's set of consonants in an order
+        where most frequently misspelled consonants will be last.
+        2) Then we record the string's set of vowels in the order of
+        first appearance.
+
+    This key is very useful when searching for misspelled strings because
+    if sorted using this key, similar strings will be next to each other.
+
+    Args:
+        string (str): The string to encode.
+
+    Returns:
+        str: The string's omission key.
+
+    """
+
+    # Deburring
+    string = unidecode(string)
+
+    # Normalizing case
+    string = string.upper()
+
+    # Dropping useless characters
+    string = re.sub(UNDESIRABLES_RE, '', string)
+
+    if not string:
+        return ''
+
+    # Composing the key
+    letters = set()
+    consonants = []
+    vowels = []
+
+    # Adding vowels in the order they appeared
+    for c in string:
+        if c in VOWELS and c not in vowels:
+            vowels.append(c)
+        else:
+            letters.add(c)
+
+    # Adding consonants in order
+    for consonant in CONSONANTS:
+        if consonant in letters:
+            consonants.append(consonant)
+
+    return ''.join(consonants + vowels)
--- a/test/key/omission_test.py
+++ b/test/key/omission_test.py
@ -0,0 +1,34 @@
+# =============================================================================
+# Fog Omission Key Unit Tests
+# =============================================================================
+from fog.key import omission_key
+
+TESTS = [
+    ('', ''),
+    ('hello', 'HLEO'),
+    ('The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'),
+    ('Christopher', 'PHCTSRIOE'),
+    ('Niall', 'LNIA'),
+    ('caramel', 'MCLRAE'),
+    ('Carlson', 'CLNSRAO'),
+    ('Karlsson', 'KLNSRAO'),
+    ('microeletronics', 'MCLNTSRIOE'),
+    ('Circumstantial', 'MCLNTSRIUA'),
+    ('LUMINESCENT', 'MCLNTSUIE'),
+    ('multinucleate', 'MCLNTUIEA'),
+    ('multinucleon', 'MCLNTUIEO'),
+    ('cumulene', 'MCLNUE'),
+    ('luminance', 'MCLNUIAE'),
+    ('cœlomic', 'MCLOEI'),
+    ('Molecule', 'MCLOEU'),
+    ('Cameral', 'MCLRAE'),
+    ('Maceral', 'MCLRAE'),
+    ('Lacrimal', 'MCLRAI')
+]
+
+
+class TestOmissionKey(object):
+    def test_basics(self):
+
+        for string, key in TESTS:
+            assert omission_key(string) == key, '%s => %s' % (string, key)
--- a/test/key/rusalka_test.py
+++ b/test/key/rusalka_test.py
@ -1,7 +1,6 @@
 # =============================================================================
 # Fog Rusalka Unit Tests
 # =============================================================================
-from pytest import approx
 from fog.key import rusalka

 TESTS = [