268 lines
11 KiB
Python
268 lines
11 KiB
Python
from itertools import product
|
|
from functools import partial
|
|
from string import ascii_letters, digits, punctuation
|
|
|
|
from hypothesis import given, assume, settings
|
|
import hypothesis.strategies as st
|
|
import pytest
|
|
|
|
from rapidfuzz import fuzz, process, utils, string_metric
|
|
import random
|
|
from math import isclose
|
|
|
|
def levenshtein(s1, s2, weights=(1, 1, 1)):
|
|
"""
|
|
python implementation of a generic Levenshtein distance
|
|
this is much less error prone, than the bitparallel C implementations
|
|
and is therefor used to test the C implementation
|
|
However this makes this very slow even for testing purposes
|
|
"""
|
|
|
|
rows = len(s1)+1
|
|
cols = len(s2)+1
|
|
insert, delete, substitute = weights
|
|
|
|
dist = [[0 for x in range(cols)] for x in range(rows)]
|
|
|
|
for row in range(1, rows):
|
|
dist[row][0] = row * delete
|
|
|
|
for col in range(1, cols):
|
|
dist[0][col] = col * insert
|
|
|
|
for col in range(1, cols):
|
|
for row in range(1, rows):
|
|
if s1[row-1] == s2[col-1]:
|
|
cost = 0
|
|
else:
|
|
cost = substitute
|
|
|
|
dist[row][col] = min(
|
|
dist[row-1][col] + delete, # deletion
|
|
dist[row][col-1] + insert, # insertion
|
|
dist[row-1][col-1] + cost # substitution
|
|
)
|
|
|
|
return dist[-1][-1]
|
|
|
|
def normalize_distance(dist, s1, s2, weights=(1, 1, 1)):
|
|
insert, delete, substitute = weights
|
|
if len(s1) > len(s2):
|
|
max_dist = min([
|
|
# delete all characters from s1 and insert all characters from s2
|
|
len(s1) * delete + len(s2) * insert,
|
|
# replace all characters and delete the remaining characters from s1
|
|
len(s2) * substitute + (len(s1) - len(s2)) * delete
|
|
])
|
|
else:
|
|
max_dist = min([
|
|
# delete all characters from s1 and insert all characters from s2
|
|
len(s1) * delete + len(s2) * insert,
|
|
# replace all characters and insert the remaining characters into s1
|
|
len(s1) * substitute + (len(s2) - len(s1)) * insert
|
|
])
|
|
|
|
return 100 - 100 * dist / max_dist if max_dist else 100
|
|
|
|
def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs):
|
|
return process.extractOne(s1, [s2], processor=processor, scorer=scorer, **kwargs)[1]
|
|
|
|
def extract_scorer(s1, s2, scorer, processor=None, **kwargs):
|
|
return process.extract(s1, [s2], processor=processor, scorer=scorer, **kwargs)[0][1]
|
|
|
|
def extract_iter_scorer(s1, s2, scorer, processor=None, **kwargs):
|
|
return list(process.extract_iter(s1, [s2], processor=processor, scorer=scorer, **kwargs))[0][1]
|
|
|
|
|
|
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
|
|
|
|
SCORERS = [
|
|
fuzz.ratio,
|
|
fuzz.partial_ratio,
|
|
fuzz.token_set_ratio,
|
|
fuzz.token_sort_ratio,
|
|
fuzz.token_ratio,
|
|
fuzz.partial_token_set_ratio,
|
|
fuzz.partial_token_sort_ratio,
|
|
fuzz.partial_token_ratio,
|
|
fuzz.WRatio,
|
|
fuzz.QRatio
|
|
]
|
|
|
|
FULL_SCORERS = [
|
|
fuzz.ratio,
|
|
fuzz.WRatio,
|
|
fuzz.QRatio
|
|
]
|
|
|
|
PROCESSORS = [
|
|
lambda x: x,
|
|
utils.default_process
|
|
]
|
|
|
|
@given(s1=st.text(), s2=st.text())
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_partial_ratio(s1, s2):
|
|
"""
|
|
test partial_ratio. Currently this only tests, so there are no exceptions
|
|
In the future this should validate the implementation. However the current implementation
|
|
is not completely optimal in some edge cases
|
|
"""
|
|
fuzz.partial_ratio(s1, s2)
|
|
|
|
|
|
@given(s1=st.text(), s2=st.text())
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_token_ratio(s1, s2):
|
|
"""
|
|
token_ratio should be max(token_sort_ratio, token_set_ratio)
|
|
"""
|
|
assert fuzz.token_ratio(s1, s2) == max(fuzz.token_sort_ratio(s1, s2), fuzz.token_set_ratio(s1, s2))
|
|
|
|
@given(s1=st.text(), s2=st.text())
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_partial_token_ratio(s1, s2):
|
|
"""
|
|
partial_token_ratio should be max(partial_token_sort_ratio, partial_token_set_ratio)
|
|
"""
|
|
assert fuzz.partial_token_ratio(s1, s2) == max(fuzz.partial_token_sort_ratio(s1, s2), fuzz.partial_token_set_ratio(s1, s2))
|
|
|
|
|
|
@given(s1=st.text(min_size=0, max_size=64), s2=st.text(min_size=0, max_size=64))
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_levenshtein_word(s1, s2):
|
|
"""
|
|
Test short Levenshtein implementation against simple implementation
|
|
"""
|
|
# uniform Levenshtein
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2)
|
|
assert string_metric.levenshtein(s1, s2) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2)
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
|
|
# InDel-Distance
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2, (1,1,2))
|
|
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
|
|
|
|
@given(s1=st.text(min_size=65), s2=st.text(min_size=65))
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_levenshtein_block(s1, s2):
|
|
"""
|
|
Test blockwise Levenshtein implementation against simple implementation
|
|
"""
|
|
# uniform Levenshtein
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2)
|
|
assert string_metric.levenshtein(s1, s2) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2)
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
|
|
# InDel-Distance
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2, (1,1,2))
|
|
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
|
|
@given(s1=st.text(), s2=st.text())
|
|
@settings(max_examples=500, deadline=None)
|
|
def test_levenshtein_random(s1, s2):
|
|
"""
|
|
Test mixed strings to test through all implementations of Levenshtein
|
|
"""
|
|
# uniform Levenshtein
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2)
|
|
assert string_metric.levenshtein(s1, s2) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2)
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
|
|
|
|
# InDel-Distance
|
|
# distance
|
|
reference_dist = levenshtein(s1, s2, (1,1,2))
|
|
assert string_metric.levenshtein(s1, s2, (1,1,2)) == reference_dist
|
|
assert extractOne_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_scorer( s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1,1,2)) == reference_dist
|
|
# normalized distance
|
|
reference_sim = normalize_distance(reference_dist, s1, s2, (1,1,2))
|
|
assert isclose(string_metric.normalized_levenshtein(s1, s2, (1,1,2)), reference_sim)
|
|
assert isclose(extractOne_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_scorer( s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1,1,2)), reference_sim)
|
|
|
|
@given(sentence=st.text())
|
|
@settings(max_examples=200)
|
|
def test_multiple_processor_runs(sentence):
|
|
"""
|
|
Test that running a preprocessor on a sentence
|
|
a second time does not change the result
|
|
"""
|
|
assert utils.default_process(sentence) \
|
|
== utils.default_process(utils.default_process(sentence))
|
|
|
|
|
|
@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS)))
|
|
@given(choices=st.lists(st.text(), min_size=1))
|
|
@settings(max_examples=20, deadline=5000)
|
|
def test_only_identical_strings_extracted(scorer, processor, choices):
|
|
"""
|
|
Test that only identical (post processing) strings score 100 on the test.
|
|
If two strings are not identical then using full comparison methods they should
|
|
not be a perfect (100) match.
|
|
:param scorer:
|
|
:param processor:
|
|
:param data:
|
|
:return:
|
|
"""
|
|
query = random.choice(choices)
|
|
assume(processor(query) != '')
|
|
|
|
matches = process.extract(query, choices,
|
|
scorer=scorer, processor=processor,
|
|
score_cutoff=100, limit=None)
|
|
|
|
assert matches != []
|
|
|
|
for match in matches:
|
|
assert processor(query) == processor(match[0]) |