Adding fog.phonetics.cologne

2018-07-04 17:06:43 +02:00 · 2018-07-04 17:06:43 +02:00 · b9d4e11e25
parent ed1379f4fc
commit b9d4e11e25
7 changed files with 167 additions and 1 deletions
--- a/2
+++ b/2
@ -11,7 +11,7 @@ clean:

 lint:
 	@echo Linting source code using pep8...
-	pycodestyle --ignore E501,E722,E741 $(SOURCE) test
+	pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
 	@echo

 unit:
--- a/fog/init.py
+++ b/fog/init.py
@ -2,4 +2,5 @@ import fog.clustering as clustering
 import fog.key as key
 import fog.lsh as lsh
 import fog.metrics as metrics
+import fog.phonetics as phonetics
 import fog.tokenizers as tokenizers
--- a/fog/phonetics/init.py
+++ b/fog/phonetics/init.py
@ -0,0 +1 @@
+from fog.phonetics.cologne import cologne
--- a/fog/phonetics/cologne.py
+++ b/fog/phonetics/cologne.py
@ -0,0 +1,122 @@
+# =============================================================================
+# Fog Cologne Phonetic Algorithm
+# =============================================================================
+#
+# Function computing the Cologne phonetic code for German names.
+#
+# [Url]:
+# https://en.wikipedia.org/wiki/Cologne_phonetics
+#
+# [Article]:
+# Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
+# von Personennamen auf der Grundlage der Gestaltanalyse.
+# in: IBM-Nachrichten, 19. Jahrgang, 1969, S. 925-931.
+#
+import re
+from unidecode import unidecode
+from fog.phonetics.utils import squeeze
+
+# Anything matching this pattern (i.e. not an uppercase ASCII letter) is
+# stripped from the name before encoding.
+ALPHA_RE = re.compile(r'[^A-Z]')
+
+# Context-free letter codes, grouped by the digit they map to. `H` is listed
+# with no code: the encoder emits nothing for it.
+CODES = {'H': None}
+
+CODES.update(dict.fromkeys('AEIOUJY', '0'))
+CODES.update(dict.fromkeys('BP', '1'))
+CODES.update(dict.fromkeys('FVW', '3'))
+CODES.update(dict.fromkeys('GKQ', '4'))
+CODES.update(dict.fromkeys('L', '5'))
+CODES.update(dict.fromkeys('MN', '6'))
+CODES.update(dict.fromkeys('R', '7'))
+CODES.update(dict.fromkeys('SZ', '8'))
+
+# Letters that, when following a D or a T, turn its code into 8
+DT = set('CSZ')
+
+# Letters that, when following a C, make it encode as 4 (anywhere in the name)
+CFOLLOWING2 = set('AHKOQUX')
+
+# Same thing for a C opening the name, where L and R are accepted as well
+CFOLLOWING1 = CFOLLOWING2 | set('LR')
+
+# Letters that, when preceding a C, prevent it from encoding as 4
+CPREVIOUS = set('SZ')
+
+# Letters that, when preceding a X, make it encode as 8 rather than 48
+X = set('CQK')
+
+# (pattern, replacement) pairs applied in order on the uppercased name
+GERMANIC_SUBSTITUTIONS = [
+    ('Ä', 'A'),
+    ('Ö', 'O'),
+    ('Ü', 'U'),
+    ('ß', 'SS'),
+    ('PH', 'F'),
+]
+
+
+def cologne(name):
+    """
+    Function returning the Cologne phonetic code for the given name.
+
+    The name is uppercased, German-specific letters are substituted, the rest
+    is transliterated to ASCII and everything that is not a letter is dropped.
+    Each remaining letter is then mapped to its digit(s), runs of identical
+    digits are squeezed and every `0` but a leading one is removed.
+
+    Args:
+        name (string): Name to encode.
+
+    Returns:
+        string: The name encoded, or an empty string if the name does not
+            contain any encodable letter.
+
+    """
+
+    # Preparing the name
+    name = name.upper()
+
+    for pattern, replacement in GERMANIC_SUBSTITUTIONS:
+        name = name.replace(pattern, replacement)
+
+    # Transliteration may emit lowercase letters, hence the second uppercasing,
+    # else they would be stripped along with the non-letters
+    name = unidecode(name).upper()
+    name = re.sub(ALPHA_RE, '', name)
+
+    code = []
+
+    last_i = len(name) - 1
+
+    for i, letter in enumerate(name):
+
+        # Neighbors are the empty string at the edges of the name so that
+        # membership tests fail there instead of raising or wrapping around
+        previous = name[i - 1] if i > 0 else ''
+        following = name[i + 1] if i < last_i else ''
+
+        possible_code = CODES.get(letter)
+
+        if possible_code is not None:
+            code.append(possible_code)
+
+        # Handling D/T
+        elif letter == 'D' or letter == 'T':
+            code.append('8' if following in DT else '2')
+
+        # Handling C
+        elif letter == 'C':
+
+            if i == 0:
+                hard = following in CFOLLOWING1
+            else:
+                hard = following in CFOLLOWING2 and previous not in CPREVIOUS
+
+            code.append('4' if hard else '8')
+
+        # Handling X
+        elif letter == 'X':
+            code.append('8' if previous in X else '48')
+
+        # NOTE: any other letter (i.e. H) is silent and yields nothing
+
+    # Squeezing the whole code (leading digit included), then dropping every
+    # 0 that is not leading. Slicing keeps this safe on an empty code.
+    squeezed = squeeze(''.join(code))
+
+    return squeezed[:1] + squeezed[1:].replace('0', '')
--- a/fog/phonetics/utils.py
+++ b/fog/phonetics/utils.py
@ -0,0 +1,13 @@
+# =============================================================================
+# Fog Phonetics Utility Functions
+# =============================================================================
+#
+# Miscellaneous functions used throughout the phonetics module.
+#
+import re
+
+# Matches a run of two or more identical characters, capturing the character.
+# Runs of newlines are left alone since `.` does not match them.
+SQUEEZE_RE = re.compile(r'(?:(.)\1+)')
+
+
+def squeeze(string):
+    """
+    Function collapsing every run of a repeated character into a single
+    occurrence of this character.
+
+    Args:
+        string (string): String to squeeze.
+
+    Returns:
+        string: The squeezed string.
+
+    """
+
+    def keep_one(match):
+        return match.group(1)
+
+    return re.sub(SQUEEZE_RE, keep_one, string)
--- a/test/phonetics/init.py
+++ b/test/phonetics/init.py
--- a/test/phonetics/cologne_test.py
+++ b/test/phonetics/cologne_test.py
@ -0,0 +1,29 @@
+# =============================================================================
+# Fog Cologne Unit Tests
+# =============================================================================
+from pytest import approx
+from fog.phonetics import cologne
+
+# Fixtures as (expected code, name) pairs, unpacked in that order by the test.
+TESTS = [
+    ('65752682', 'Müller-Lüdenscheidt'),
+    ('17863', 'Breschnew'),
+    ('3412', 'Wikipedia'),
+    ('4837', 'Xavier'),
+    ('478237', 'Christopher'),
+    ('3556', 'Wilhelm'),
+    ('351', 'Philip'),
+    ('1274', 'Patrick'),
+    ('051742', 'Albrecht'),
+    ('68', 'Mac'),
+    ('64', 'Mack')
+]
+
+
+class TestCologne(object):
+    """Unit tests of the `cologne` phonetic encoder."""
+
+    def test_basics(self):
+
+        # Every fixture name must encode to its expected code
+        for expected, word in TESTS:
+            message = '%s => %s' % (word, expected)
+            assert cologne(word) == expected, message
+
+        # Different sounding names must differ, similar sounding ones collide
+        meyer = cologne('Meyer')
+
+        assert meyer != cologne('Müller')
+        assert meyer == cologne('Mayr')