Adding fog.phonetics.cologne

This commit is contained in:
Yomguithereal 2018-07-04 17:06:43 +02:00
parent ed1379f4fc
commit b9d4e11e25
7 changed files with 167 additions and 1 deletions

View File

@ -11,7 +11,7 @@ clean:
lint:
@echo Linting source code using pep8...
pycodestyle --ignore E501,E722,E741 $(SOURCE) test
pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
@echo
unit:

View File

@ -2,4 +2,5 @@ import fog.clustering as clustering
import fog.key as key
import fog.lsh as lsh
import fog.metrics as metrics
import fog.phonetics as phonetics
import fog.tokenizers as tokenizers

View File

@ -0,0 +1 @@
from fog.phonetics.cologne import cologne

122
fog/phonetics/cologne.py Normal file
View File

@ -0,0 +1,122 @@
# =============================================================================
# Fog Cologne Phonetic Algorithm
# =============================================================================
#
# Function computing the Cologne phonetic code for German names.
#
# [Url]:
# https://en.wikipedia.org/wiki/Cologne_phonetics
#
# [Article]:
# Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
# von Personennamen auf der Grundlage der Gestaltanalyse.
# in: IBM-Nachrichten, 19. Jahrgang, 1969, S. 925-931.
#
import re
from unidecode import unidecode
from fog.phonetics.utils import squeeze
# Matches every character that is not an uppercase ASCII letter.
ALPHA_RE = re.compile(r'[^A-Z]')

# Context-free letter -> digit mapping. 'H' is mapped to None because it
# produces no code. 'C', 'D', 'T' and 'X' are absent on purpose: their code
# depends on the neighbouring letters and is resolved by the encoder.
CODES = {
    letter: digit
    for digit, letters in (
        (None, 'H'),
        ('0', 'AEIOUJY'),
        ('1', 'BP'),
        ('3', 'FVW'),
        ('4', 'GKQ'),
        ('5', 'L'),
        ('6', 'MN'),
        ('7', 'R'),
        ('8', 'SZ'),
    )
    for letter in letters
}

# Letters that turn a preceding D/T into '8'.
DT = set('CSZ')

# Letters that make an initial C code as '4'.
CFOLLOWING1 = set('AHKLOQRUX')

# Letters that make a non-initial C code as '4'...
CFOLLOWING2 = set('AHKOQUX')

# ...unless that C comes right after one of these letters.
CPREVIOUS = set('SZ')

# Letters after which X codes as '8' rather than '48'.
X = set('CQK')

# Ordered (pattern, replacement) pairs applied to the uppercased name.
GERMANIC_SUBSTITUTIONS = [
    ('Ä', 'A'),
    ('Ö', 'O'),
    ('Ü', 'U'),
    ('ß', 'SS'),
    ('PH', 'F'),
]
def cologne(name):
    """
    Function returning the Cologne phonetic code for the given name.

    Args:
        name (string): Name to encode.

    Returns:
        string: The name encoded, or an empty string if the name does not
            produce any code (e.g. it is empty or contains no letters).

    """

    # Preparing the name: uppercasing, applying the German-specific
    # substitutions, transliterating, then keeping only A-Z letters
    name = name.upper()

    for pattern, replacement in GERMANIC_SUBSTITUTIONS:
        name = name.replace(pattern, replacement)

    name = unidecode(name)
    name = ALPHA_RE.sub('', name)

    code = []
    last_i = len(name) - 1

    for i, letter in enumerate(name):
        possible_code = CODES.get(letter)

        if possible_code is not None:
            code.append(possible_code)

        # Handling D/T
        elif letter == 'D' or letter == 'T':
            code.append('8' if i < last_i and name[i + 1] in DT else '2')

        # Handling C
        elif letter == 'C':

            # A trailing C has no following letter; None belongs to no set
            following = name[i + 1] if i < last_i else None

            if i == 0:
                hard = following in CFOLLOWING1
            else:
                hard = (
                    following in CFOLLOWING2 and
                    name[i - 1] not in CPREVIOUS
                )

            code.append('4' if hard else '8')

        # Handling X
        elif letter == 'X':

            # A leading X has no previous letter: guard against name[-1]
            # silently wrapping around to the last letter of the name
            code.append('8' if i > 0 and name[i - 1] in X else '48')

        # Any other letter (namely H) does not produce a code

    if not code:
        return ''

    # Squeezing runs of identical digits over the whole code, so that a
    # duplicate of the leading digit is collapsed too ("Schmidt" => "862"),
    # then dropping every 0 which is not leading
    digits = squeeze(''.join(code))

    return digits[0] + digits[1:].replace('0', '')

13
fog/phonetics/utils.py Normal file
View File

@ -0,0 +1,13 @@
# =============================================================================
# Fog Phonetics Utility Functions
# =============================================================================
#
# Miscellaneous functions used throughout the phonetics module.
#
import re
# Matches a run of two or more identical characters, capturing the character.
SQUEEZE_RE = re.compile(r'(?:(.)\1+)')


def squeeze(string):
    """
    Function collapsing every run of a repeated character into a single
    occurrence of that character.

    Args:
        string (string): String to squeeze.

    Returns:
        string: The squeezed string.

    """
    squeezed = re.sub(SQUEEZE_RE, r'\1', string)

    return squeezed

View File

View File

@ -0,0 +1,29 @@
# =============================================================================
# Fog Cologne Unit Tests
# =============================================================================
from pytest import approx
from fog.phonetics import cologne
# (expected code, name) pairs.
TESTS = [
    ('65752682', 'Müller-Lüdenscheidt'),
    ('17863', 'Breschnew'),
    ('3412', 'Wikipedia'),
    ('4837', 'Xavier'),
    ('478237', 'Christopher'),
    ('3556', 'Wilhelm'),
    ('351', 'Philip'),
    ('1274', 'Patrick'),
    ('051742', 'Albrecht'),
    ('68', 'Mac'),
    ('64', 'Mack'),
]


class TestCologne(object):
    def test_basics(self):
        """Check the reference codes, then two pairwise comparisons."""
        for expected, name in TESTS:
            assert cologne(name) == expected, '%s => %s' % (name, expected)

        # Names that sound different must not collide...
        assert cologne('Meyer') != cologne('Müller')

        # ...while spelling variants of the same name must
        assert cologne('Meyer') == cologne('Mayr')