From ed4c1f0834c9df070239e3644dbd52f441e7e953 Mon Sep 17 00:00:00 2001
From: Yomguithereal <guillaumeplique@gmail.com>
Date: Fri, 6 Jul 2018 17:30:26 +0200
Subject: [PATCH] Adding fog.key.omission

---
 fog/clustering/pairwise.py |  2 +-
 fog/key/__init__.py        |  1 +
 fog/key/omission.py        | 73 ++++++++++++++++++++++++++++++++++++++
 test/key/omission_test.py  | 34 ++++++++++++++++++
 test/key/rusalka_test.py   |  1 -
 5 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 fog/key/omission.py
 create mode 100644 test/key/omission_test.py

diff --git a/fog/clustering/pairwise.py b/fog/clustering/pairwise.py
index 291bca0..c04413d 100644
--- a/fog/clustering/pairwise.py
+++ b/fog/clustering/pairwise.py
@@ -246,7 +246,7 @@ def pairwise_fuzzy_clusters(data, similarity=None, distance=None, radius=None,
 
         # Pool
         with Pool(processes=processes) as pool:
-            for matches in pool.imap_unordered(pairwise_fuzzy_clusters_worker, pool_iter):
+            for matches in pool.imap(pairwise_fuzzy_clusters_worker, pool_iter):
                 for i, j in matches:
                     graph[i].append(j)
                     graph[j].append(i)
diff --git a/fog/key/__init__.py b/fog/key/__init__.py
index e8debe4..6fe140f 100644
--- a/fog/key/__init__.py
+++ b/fog/key/__init__.py
@@ -4,4 +4,5 @@ from fog.key.fingerprint import (
     fingerprint,
     ngrams_fingerprint
 )
+from fog.key.omission import omission_key
 from fog.key.rusalka import rusalka
diff --git a/fog/key/omission.py b/fog/key/omission.py
new file mode 100644
index 0000000..952f370
--- /dev/null
+++ b/fog/key/omission.py
@@ -0,0 +1,73 @@
+# =============================================================================
+# Fog Omission Key
+# =============================================================================
+#
+# The omission key by Pollock and Zamora.
+#
+# [Urls]:
+# http://dl.acm.org/citation.cfm?id=358048
+# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.385&rep=rep1&type=pdf
+#
+# [Reference]:
+# Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
+# in Scientific and Scholarly Text." Communications of the ACM, 27(4).
+# 358--368.
+import re
+from unidecode import unidecode
+
+UNDESIRABLES_RE = re.compile(r'[^A-Z]')  # any char that is not an uppercase ASCII letter
+CONSONANTS = 'JKQXZVWYBFMGPDHCLNTSR'  # order in which consonants are emitted in the key
+VOWELS = set('AEIOU')  # Y is listed in CONSONANTS, so it is not treated as a vowel
+
+# TODO: omission/skeleton key clustering for distance = 1
+
+
+def omission_key(string):
+    """
+    Function returning a string's omission key which is constructed thusly:
+        1) First we record the string's distinct consonants, following the
+        CONSONANTS order where most frequently misspelled ones come last.
+        2) Then we record the string's distinct vowels in the order of
+        first appearance.
+
+    This key is very useful when searching for misspelled strings because,
+    if sorted using this key, similar strings will be next to each other.
+
+    Args:
+        string (str): The string to encode.
+
+    Returns:
+        str: The omission key, or '' when no A-Z letter is left to encode.
+
+    """
+
+    # Deburring (the result is expected to be plain ASCII)
+    string = unidecode(string)
+
+    # Normalizing case, since the constants only hold uppercase letters
+    string = string.upper()
+
+    # Dropping every character that is not an uppercase A-Z letter
+    string = re.sub(UNDESIRABLES_RE, '', string)
+
+    if not string:
+        return ''
+
+    # Composing the key
+    letters = set()
+    consonants = []
+    vowels = []
+
+    # Vowels in first-seen order; the rest (even repeated vowels) feeds `letters`
+    for c in string:
+        if c in VOWELS and c not in vowels:
+            vowels.append(c)
+        else:
+            letters.add(c)
+
+    # Keeping only the consonants found in the string, in CONSONANTS order
+    for consonant in CONSONANTS:
+        if consonant in letters:
+            consonants.append(consonant)
+
+    return ''.join(consonants + vowels)
diff --git a/test/key/omission_test.py b/test/key/omission_test.py
new file mode 100644
index 0000000..2fa5d1f
--- /dev/null
+++ b/test/key/omission_test.py
@@ -0,0 +1,34 @@
+# =============================================================================
+# Fog Omission Key Unit Tests
+# =============================================================================
+from fog.key import omission_key
+
+TESTS = [
+    ('', ''),  # empty input yields an empty key
+    ('hello', 'HLEO'),  # repeated letters are only recorded once
+    ('The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'),
+    ('Christopher', 'PHCTSRIOE'),
+    ('Niall', 'LNIA'),
+    ('caramel', 'MCLRAE'),
+    ('Carlson', 'CLNSRAO'),
+    ('Karlsson', 'KLNSRAO'),
+    ('microeletronics', 'MCLNTSRIOE'),
+    ('Circumstantial', 'MCLNTSRIUA'),
+    ('LUMINESCENT', 'MCLNTSUIE'),
+    ('multinucleate', 'MCLNTUIEA'),
+    ('multinucleon', 'MCLNTUIEO'),
+    ('cumulene', 'MCLNUE'),
+    ('luminance', 'MCLNUIAE'),
+    ('cœlomic', 'MCLOEI'),  # the ligature is expected to contribute both O and E
+    ('Molecule', 'MCLOEU'),
+    ('Cameral', 'MCLRAE'),  # same key as 'caramel' above
+    ('Maceral', 'MCLRAE'),  # same letters as 'Cameral', hence the same key
+    ('Lacrimal', 'MCLRAI')
+]
+
+
+class TestOmissionKey(object):
+    def test_basics(self):
+        # Each TESTS entry is an (input string, expected omission key) pair.
+        for string, key in TESTS:
+            assert omission_key(string) == key, '%s => %s' % (string, key)
diff --git a/test/key/rusalka_test.py b/test/key/rusalka_test.py
index 2b95215..d6cd047 100644
--- a/test/key/rusalka_test.py
+++ b/test/key/rusalka_test.py
@@ -1,7 +1,6 @@
 # =============================================================================
 # Fog Rusalka Unit Tests
 # =============================================================================
-from pytest import approx
 from fog.key import rusalka
 
 TESTS = [