From b9d4e11e251d6fb8157ab2b5a763c9bb230517db Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 4 Jul 2018 17:06:43 +0200 Subject: [PATCH] Adding fog.phonetics.cologne --- Makefile | 2 +- fog/__init__.py | 1 + fog/phonetics/__init__.py | 1 + fog/phonetics/cologne.py | 122 +++++++++++++++++++++++++++++++++ fog/phonetics/utils.py | 13 ++++ test/phonetics/__init__.py | 0 test/phonetics/cologne_test.py | 29 ++++++++ 7 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 fog/phonetics/__init__.py create mode 100644 fog/phonetics/cologne.py create mode 100644 fog/phonetics/utils.py create mode 100644 test/phonetics/__init__.py create mode 100644 test/phonetics/cologne_test.py diff --git a/Makefile b/Makefile index 5cff791..efd48ed 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ clean: lint: @echo Linting source code using pep8... - pycodestyle --ignore E501,E722,E741 $(SOURCE) test + pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test @echo unit: diff --git a/fog/__init__.py b/fog/__init__.py index e7facd3..7b1aa23 100644 --- a/fog/__init__.py +++ b/fog/__init__.py @@ -2,4 +2,5 @@ import fog.clustering as clustering import fog.key as key import fog.lsh as lsh import fog.metrics as metrics +import fog.phonetics as phonetics import fog.tokenizers as tokenizers diff --git a/fog/phonetics/__init__.py b/fog/phonetics/__init__.py new file mode 100644 index 0000000..04e8dd4 --- /dev/null +++ b/fog/phonetics/__init__.py @@ -0,0 +1 @@ +from fog.phonetics.cologne import cologne diff --git a/fog/phonetics/cologne.py b/fog/phonetics/cologne.py new file mode 100644 index 0000000..ccc27fb --- /dev/null +++ b/fog/phonetics/cologne.py @@ -0,0 +1,122 @@ +# ============================================================================= +# Fog Cologne Phonetic Algorithm +# ============================================================================= +# +# Function computing the Cologne phonetic code for German names. 
+#
+# [Url]:
+# https://en.wikipedia.org/wiki/Cologne_phonetics
+#
+# [Article]:
+# Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
+# von Personennamen auf der Grundlage der Gestaltanalyse.
+# in: IBM-Nachrichten, 19. Jahrgang, 1969, S. 925-931.
+#
+import re
+from unidecode import unidecode
+from fog.phonetics.utils import squeeze
+
+ALPHA_RE = re.compile(r'[^A-Z]')
+
+# Context-free codes. H maps to None because it is never encoded. C, D, T and
+# X are absent on purpose: their code depends on the neighboring letters and
+# is computed in the function below.
+CODES = {
+    'H': None,
+
+    'A': '0',
+    'E': '0',
+    'I': '0',
+    'O': '0',
+    'U': '0',
+    'J': '0',
+    'Y': '0',
+
+    'B': '1',
+    'P': '1',
+
+    'F': '3',
+    'V': '3',
+    'W': '3',
+
+    'G': '4',
+    'K': '4',
+    'Q': '4',
+
+    'L': '5',
+
+    'M': '6',
+    'N': '6',
+
+    'R': '7',
+
+    'S': '8',
+    'Z': '8'
+}
+
+# Letters turning a preceding D/T into 8
+DT = {'C', 'S', 'Z'}
+
+# Letters turning a preceding C into 4 (1: C is the initial, 2: elsewhere)
+CFOLLOWING1 = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
+CFOLLOWING2 = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
+
+# Letters forcing a following, non-initial C to 8
+CPREVIOUS = {'S', 'Z'}
+
+# Letters reducing a following X to 8
+X = {'C', 'Q', 'K'}
+
+GERMANIC_SUBSTITUTIONS = [
+    ('Ä', 'A'),
+    ('Ö', 'O'),
+    ('Ü', 'U'),
+    ('ß', 'SS'),
+    ('PH', 'F')
+]
+
+
+def cologne(name):
+    """
+    Function returning the Cologne phonetic code for the given name.
+
+    Args:
+        name (string): Name to encode.
+
+    Returns:
+        string: The name encoded, or an empty string if the name does not
+            contain any encodable letter.
+
+    """
+
+    # Preparing the name
+    name = name.upper()
+
+    for p, r in GERMANIC_SUBSTITUTIONS:
+        name = name.replace(p, r)
+
+    # NOTE: unidecode may emit lowercase letters (e.g. 'Þ' => 'Th'), hence the
+    # second uppercasing, else ALPHA_RE would silently drop them
+    name = unidecode(name).upper()
+    name = ALPHA_RE.sub('', name)
+
+    code = []
+
+    last_i = len(name) - 1
+
+    for i, letter in enumerate(name):
+        possible_code = CODES.get(letter)
+
+        if possible_code is not None:
+            code.append(possible_code)
+
+        # Handling D/T
+        elif letter == 'D' or letter == 'T':
+            code.append('8' if i < last_i and name[i + 1] in DT else '2')
+
+        # Handling C
+        elif letter == 'C':
+
+            # None when C is the last letter, so that no lookup can overflow
+            following = name[i + 1] if i < last_i else None
+
+            if i == 0:
+                hard = following in CFOLLOWING1
+            else:
+                hard = (
+                    following in CFOLLOWING2 and
+                    name[i - 1] not in CPREVIOUS
+                )
+
+            code.append('4' if hard else '8')
+
+        # Handling X
+        elif letter == 'X':
+
+            # An initial X has no predecessor: name[-1] would be the last letter
+            code.append('8' if i > 0 and name[i - 1] in X else '48')
+
+    # Nothing to encode (empty name, only H or only non-letters)
+    if not code:
+        return ''
+
+    # Squeezing (leading code included) and dropping not leading 0
+    squeezed = squeeze(''.join(code))
+
+    return squeezed[0] + squeezed[1:].replace('0', '')
diff --git a/fog/phonetics/utils.py b/fog/phonetics/utils.py
new file mode 100644
index 0000000..35eb8f1
--- /dev/null
+++ b/fog/phonetics/utils.py
@@ -0,0 +1,13 @@
+# =============================================================================
+# Fog Phonetics Utility Functions
+# =============================================================================
+#
+# Miscellaneous functions used throughout the phonetics module.
+#
+import re
+
+SQUEEZE_RE = re.compile(r'(?:(.)\1+)')
+
+
+def squeeze(string):
+    """
+    Function collapsing every run of a consecutively repeated character (any
+    character but a newline) into a single occurrence of this character.
+
+    Args:
+        string (string): String to squeeze.
+
+    Returns:
+        string: The squeezed string.
+
+    """
+    return re.sub(SQUEEZE_RE, r'\1', string)
diff --git a/test/phonetics/__init__.py b/test/phonetics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/phonetics/cologne_test.py b/test/phonetics/cologne_test.py
new file mode 100644
index 0000000..9bdd6cb
--- /dev/null
+++ b/test/phonetics/cologne_test.py
@@ -0,0 +1,29 @@
+# =============================================================================
+# Fog Cologne Unit Tests
+# =============================================================================
+from pytest import approx
+from fog.phonetics import cologne
+
+TESTS = [
+    ('65752682', 'Müller-Lüdenscheidt'),
+    ('17863', 'Breschnew'),
+    ('3412', 'Wikipedia'),
+    ('4837', 'Xavier'),
+    ('478237', 'Christopher'),
+    ('3556', 'Wilhelm'),
+    ('351', 'Philip'),
+    ('1274', 'Patrick'),
+    ('051742', 'Albrecht'),
+    ('68', 'Mac'),
+    ('64', 'Mack')
+]
+
+
+class TestCologne(object):
+    def test_basics(self):
+        # Each entry of TESTS is an (expected code, name) pair
+        for expected, name in TESTS:
+            assert cologne(name) == expected, '%s => %s' % (name, expected)
+
+        assert cologne('Meyer') != cologne('Müller')
+        assert cologne('Meyer') == cologne('Mayr')