mirror of https://github.com/Yomguithereal/fog.git
Adding fog.phonetics.cologne
This commit is contained in:
parent
ed1379f4fc
commit
b9d4e11e25
2
Makefile
2
Makefile
|
@ -11,7 +11,7 @@ clean:
|
|||
|
||||
lint:
|
||||
@echo Linting source code using pep8...
|
||||
pycodestyle --ignore E501,E722,E741 $(SOURCE) test
|
||||
pycodestyle --ignore E501,E722,E741,W503,W504 $(SOURCE) test
|
||||
@echo
|
||||
|
||||
unit:
|
||||
|
|
|
@ -2,4 +2,5 @@ import fog.clustering as clustering
|
|||
import fog.key as key
|
||||
import fog.lsh as lsh
|
||||
import fog.metrics as metrics
|
||||
import fog.phonetics as phonetics
|
||||
import fog.tokenizers as tokenizers
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
from fog.phonetics.cologne import cologne
|
|
@ -0,0 +1,122 @@
|
|||
# =============================================================================
|
||||
# Fog Cologne Phonetic Algorithm
|
||||
# =============================================================================
|
||||
#
|
||||
# Function computing the Cologne phonetic code for German names.
|
||||
#
|
||||
# [Url]:
|
||||
# https://en.wikipedia.org/wiki/Cologne_phonetics
|
||||
#
|
||||
# [Article]:
|
||||
# Hans Joachim Postel: Die Kölner Phonetik. Ein Verfahren zur Identifizierung
|
||||
# von Personennamen auf der Grundlage der Gestaltanalyse.
|
||||
# in: IBM-Nachrichten, 19. Jahrgang, 1969, S. 925-931.
|
||||
#
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from fog.phonetics.utils import squeeze
|
||||
|
||||
# Matches anything that is not an uppercase ASCII letter
ALPHA_RE = re.compile(r'[^A-Z]')

# Codes of the letters that do not depend on their neighbors.
# 'H' is explicitly mapped to None: it does not produce any code.
CODES = {
    'H': None,

    'A': '0', 'E': '0', 'I': '0', 'O': '0', 'U': '0', 'J': '0', 'Y': '0',

    'B': '1', 'P': '1',

    'F': '3', 'V': '3', 'W': '3',

    'G': '4', 'K': '4', 'Q': '4',

    'L': '5',

    'M': '6', 'N': '6',

    'R': '7',

    'S': '8', 'Z': '8'
}

# Letters turning a preceding D or T into '8' (instead of '2')
DT = {'C', 'S', 'Z'}

# Letters giving the code '4' to a C found at the very start of the name
CFOLLOWING1 = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}

# Letters giving the code '4' to a C found anywhere else in the name...
CFOLLOWING2 = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

# ...unless this C is itself preceded by one of those letters
CPREVIOUS = {'S', 'Z'}

# Letters reducing the code of a following X from '48' to '8'
X = {'C', 'Q', 'K'}

# Ordered (pattern, replacement) pairs applied to the uppercased name
GERMANIC_SUBSTITUTIONS = [
    ('Ä', 'A'),
    ('Ö', 'O'),
    ('Ü', 'U'),
    ('ß', 'SS'),
    ('PH', 'F')
]
|
||||
|
||||
|
||||
def cologne(name):
    """
    Function returning the Cologne phonetic code for the given name.

    Args:
        name (string): Name to encode.

    Returns:
        string: The name encoded. An empty string is returned when the name
            does not yield any code (e.g. when it contains no letter).

    """

    # Preparing the name
    name = name.upper()

    for p, r in GERMANIC_SUBSTITUTIONS:
        name = name.replace(p, r)

    name = unidecode(name)
    name = ALPHA_RE.sub('', name)

    code = []

    last_i = len(name) - 1

    for i, letter in enumerate(name):

        # Neighbors are '' at both ends of the name, so that the membership
        # tests below are simply False there instead of raising an IndexError
        # or wrapping around to the other end of the name through index -1.
        previous = name[i - 1] if i > 0 else ''
        following = name[i + 1] if i < last_i else ''

        possible_code = CODES.get(letter)

        if possible_code is not None:
            code.append(possible_code)

        # Handling D/T
        elif letter == 'D' or letter == 'T':
            code.append('8' if following in DT else '2')

        # Handling C
        elif letter == 'C':

            # The rule for an initial C differs from the rule used elsewhere
            if i == 0:
                is_four = following in CFOLLOWING1
            else:
                is_four = (
                    following in CFOLLOWING2
                    and previous not in CPREVIOUS
                )

            code.append('4' if is_four else '8')

        # Handling X
        elif letter == 'X':
            code.append('8' if previous in X else '48')

    # Squeezing the whole code (first digit included, else a duplicate of the
    # leading code would survive), then dropping every 0 but a leading one.
    # Slicing keeps this safe when no code was produced at all.
    squeezed = squeeze(''.join(code))

    return squeezed[:1] + squeezed[1:].replace('0', '')
|
|
@ -0,0 +1,13 @@
|
|||
# =============================================================================
|
||||
# Fog Phonetics Utility Functions
|
||||
# =============================================================================
|
||||
#
|
||||
# Miscellaneous functions used throughout the phonetics module.
|
||||
#
|
||||
import re
|
||||
|
||||
# DOTALL makes "." match line breaks too, so that runs of "\n" get squeezed
# like runs of any other character.
SQUEEZE_RE = re.compile(r'(?:(.)\1+)', re.DOTALL)


def squeeze(string):
    """
    Function collapsing every run of the same consecutive character into a
    single occurrence of this character.

    Args:
        string (string): String to squeeze.

    Returns:
        string: The squeezed string.

    """
    return SQUEEZE_RE.sub(r'\1', string)
|
|
@ -0,0 +1,29 @@
|
|||
# =============================================================================
|
||||
# Fog Cologne Unit Tests
|
||||
# =============================================================================
|
||||
from pytest import approx
|
||||
from fog.phonetics import cologne
|
||||
|
||||
# Fixtures laid out as (expected code, name to encode)
TESTS = [
    ('65752682', 'Müller-Lüdenscheidt'),
    ('17863', 'Breschnew'),
    ('3412', 'Wikipedia'),
    ('4837', 'Xavier'),
    ('478237', 'Christopher'),
    ('3556', 'Wilhelm'),
    ('351', 'Philip'),
    ('1274', 'Patrick'),
    ('051742', 'Albrecht'),
    ('68', 'Mac'),
    ('64', 'Mack'),
]
|
||||
|
||||
|
||||
class TestCologne(object):
    """Unit tests of the Cologne phonetic encoder."""

    def test_basics(self):
        # Every fixture must be encoded to its expected code
        for expected, word in TESTS:
            assert cologne(word) == expected, '%s => %s' % (word, expected)

        # Names that sound different must not share a code, while spelling
        # variants of a same-sounding name must.
        meyer = cologne('Meyer')

        assert meyer != cologne('Müller')
        assert meyer == cologne('Mayr')
|
Loading…
Reference in New Issue