from itertools import product
from functools import partial
from string import ascii_letters, digits, punctuation

from hypothesis import given, assume, settings
import hypothesis.strategies as st
import pytest

from rapidfuzz import fuzz, process, utils, string_metric
import random
import numpy as np


def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)


def levenshtein(s1, s2, weights=(1, 1, 1)):
    """
    Pure Python implementation of a generic Levenshtein distance.

    This is much less error prone than the bit-parallel C implementations and is
    therefore used to test the C implementation. However, it is very slow, even
    for testing purposes.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1
    insert, delete, substitute = weights

    # matrix of edit costs between all prefixes of s1 and s2
    dist = [[0 for _ in range(cols)] for _ in range(rows)]

    # first column: delete every character of s1
    for row in range(1, rows):
        dist[row][0] = row * delete

    # first row: insert every character of s2
    for col in range(1, cols):
        dist[0][col] = col * insert

    for col in range(1, cols):
        for row in range(1, rows):
            if s1[row - 1] == s2[col - 1]:
                cost = 0
            else:
                cost = substitute

            dist[row][col] = min(
                dist[row - 1][col] + delete,      # deletion
                dist[row][col - 1] + insert,      # insertion
                dist[row - 1][col - 1] + cost     # substitution
            )

    return dist[-1][-1]


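# Sanity check of the reference implementation above (illustrative addition, not
# part of the original test suite): "kitten" -> "sitting" needs 2 substitutions
# and 1 insertion, and with InDel weights (1, 1, 2) each substitution costs as
# much as one deletion plus one insertion.
assert levenshtein("kitten", "sitting") == 3
assert levenshtein("kitten", "sitting", weights=(1, 1, 2)) == 5

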
def normalize_distance(dist, s1, s2, weights=(1, 1, 1)):
    """
    Convert a weighted Levenshtein distance into a normalized similarity in the
    range 0-100, as reference for string_metric.normalized_levenshtein.
    """
    insert, delete, substitute = weights
    if len(s1) > len(s2):
        max_dist = min([
            # delete all characters from s1 and insert all characters from s2
            len(s1) * delete + len(s2) * insert,
            # replace all characters and delete the remaining characters from s1
            len(s2) * substitute + (len(s1) - len(s2)) * delete
        ])
    else:
        max_dist = min([
            # delete all characters from s1 and insert all characters from s2
            len(s1) * delete + len(s2) * insert,
            # replace all characters and insert the remaining characters into s1
            len(s1) * substitute + (len(s2) - len(s1)) * insert
        ])

    return 100 - 100 * float(dist) / float(max_dist) if max_dist else 100


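# Worked example for normalize_distance above (illustrative addition): for
# s1="abcd", s2="ab" with uniform weights, max_dist = min(4*1 + 2*1, 2*1 + 2*1) = 4
# and levenshtein("abcd", "ab") = 2, so the normalized similarity is
# 100 - 100 * 2 / 4 = 50.

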
def partial_ratio_short_needle(s1, s2):
    """
    Reference implementation of partial_ratio for short needles: compare s1
    against every window of s2 (including partial overlaps at the edges) and
    return the best fuzz.ratio.
    """
    if not s1 and not s2:
        return 100

    if not s1 or not s2:
        return 0

    if len(s1) > len(s2):
        return partial_ratio_short_needle(s2, s1)

    parts = [s2[max(0, i):min(len(s2), i + len(s1))] for i in range(-len(s1), len(s2))]

    res = 0
    for part in parts:
        res = max(res, fuzz.ratio(s1, part))
    return res


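# Example for partial_ratio_short_needle above (illustrative addition): a needle
# that occurs verbatim in the haystack always scores 100, e.g.
# partial_ratio_short_needle("ab", "xxabxx") == 100.

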
def cdist_scorer(queries, choices, scorer):
    # reference implementation of process.cdist for similarity scorers (0-100 fits uint8)
    matrix = np.zeros((len(queries), len(choices)), dtype=np.uint8)

    for i, query in enumerate(queries):
        for j, choice in enumerate(choices):
            matrix[i, j] = scorer(query, choice)

    return matrix


def cdist_distance(queries, choices, scorer):
    # reference implementation of process.cdist for distance based scorers
    matrix = np.zeros((len(queries), len(choices)), dtype=np.int32)

    for i, query in enumerate(queries):
        for j, choice in enumerate(choices):
            matrix[i, j] = scorer(query, choice)

    return matrix


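# Each helper below wraps a single (s1, s2) pair in one of the process APIs
# (extractOne, extract, extract_iter) and returns only the score, so the process
# layer can be tested against the same reference values as the scorers themselves.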
def extractOne_scorer(s1, s2, scorer, processor=None, **kwargs):
    return process.extractOne(s1, [s2], processor=processor, scorer=scorer, **kwargs)[1]


def extract_scorer(s1, s2, scorer, processor=None, **kwargs):
    return process.extract(s1, [s2], processor=processor, scorer=scorer, **kwargs)[0][1]


def extract_iter_scorer(s1, s2, scorer, processor=None, **kwargs):
    return list(process.extract_iter(s1, [s2], processor=processor, scorer=scorer, **kwargs))[0][1]


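# Illustration for the process helpers above (added, not part of the original
# suite): for identical inputs they agree with the scorer itself, e.g.
# extractOne_scorer("abc", "abc", fuzz.ratio) == fuzz.ratio("abc", "abc") == 100.

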
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation

SCORERS = [
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_set_ratio,
    fuzz.token_sort_ratio,
    fuzz.token_ratio,
    fuzz.partial_token_set_ratio,
    fuzz.partial_token_sort_ratio,
    fuzz.partial_token_ratio,
    fuzz.WRatio,
    fuzz.QRatio
]

FULL_SCORERS = [
    fuzz.ratio,
    fuzz.WRatio,
    fuzz.QRatio
]

PROCESSORS = [
    lambda x: x,
    utils.default_process
]


@given(s1=st.text(), s2=st.text())
@settings(max_examples=50, deadline=1000)
def test_levenshtein_editops(s1, s2):
    """
    Test levenshtein_editops. Currently this only checks that no exceptions are raised.
    """
    string_metric.levenshtein_editops(s1, s2)


@given(s1=st.text(max_size=64), s2=st.text())
@settings(max_examples=50, deadline=1000)
def test_partial_ratio_short_needle(s1, s2):
    """
    Test partial_ratio for short needles (len(s1) <= 64)
    """
    assert isclose(fuzz.partial_ratio(s1, s2), partial_ratio_short_needle(s1, s2))


@given(s1=st.text(), s2=st.text())
@settings(max_examples=50, deadline=1000)
def test_token_ratio(s1, s2):
    """
    token_ratio should be max(token_sort_ratio, token_set_ratio)
    """
    assert fuzz.token_ratio(s1, s2) == max(fuzz.token_sort_ratio(s1, s2), fuzz.token_set_ratio(s1, s2))


@given(s1=st.text(), s2=st.text())
@settings(max_examples=50, deadline=1000)
def test_partial_token_ratio(s1, s2):
    """
    partial_token_ratio should be max(partial_token_sort_ratio, partial_token_set_ratio)
    """
    assert fuzz.partial_token_ratio(s1, s2) == max(fuzz.partial_token_sort_ratio(s1, s2), fuzz.partial_token_set_ratio(s1, s2))


@given(s1=st.text(max_size=64), s2=st.text(max_size=64))
@settings(max_examples=50, deadline=None)
def test_levenshtein_word(s1, s2):
    """
    Test the short (<= 64 character) Levenshtein implementation against the simple implementation
    """
    # uniform Levenshtein
    # distance
    reference_dist = levenshtein(s1, s2)
    assert string_metric.levenshtein(s1, s2) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2)
    assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)

    # InDel-Distance (a substitution costs as much as one deletion + one insertion)
    # distance
    reference_dist = levenshtein(s1, s2, weights=(1, 1, 2))
    assert string_metric.levenshtein(s1, s2, weights=(1, 1, 2)) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2, weights=(1, 1, 2))
    assert isclose(string_metric.normalized_levenshtein(s1, s2, weights=(1, 1, 2)), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)


@given(s1=st.text(min_size=65), s2=st.text(min_size=65))
@settings(max_examples=50, deadline=None)
def test_levenshtein_block(s1, s2):
    """
    Test the blockwise (> 64 character) Levenshtein implementation against the simple implementation
    """
    # uniform Levenshtein
    # distance
    reference_dist = levenshtein(s1, s2)
    assert string_metric.levenshtein(s1, s2) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2)
    assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)

    # InDel-Distance
    # distance
    reference_dist = levenshtein(s1, s2, weights=(1, 1, 2))
    assert string_metric.levenshtein(s1, s2, weights=(1, 1, 2)) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2, weights=(1, 1, 2))
    assert isclose(string_metric.normalized_levenshtein(s1, s2, weights=(1, 1, 2)), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)


@given(s1=st.text(), s2=st.text())
@settings(max_examples=50, deadline=None)
def test_levenshtein_random(s1, s2):
    """
    Test arbitrary strings to exercise all implementations of the Levenshtein distance
    """
    # uniform Levenshtein
    # distance
    reference_dist = levenshtein(s1, s2)
    assert string_metric.levenshtein(s1, s2) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2)
    assert isclose(string_metric.normalized_levenshtein(s1, s2), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein), reference_sim)

    # InDel-Distance
    # distance
    reference_dist = levenshtein(s1, s2, weights=(1, 1, 2))
    assert string_metric.levenshtein(s1, s2, weights=(1, 1, 2)) == reference_dist
    assert extractOne_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist
    assert extract_iter_scorer(s1, s2, string_metric.levenshtein, weights=(1, 1, 2)) == reference_dist

    # normalized distance
    reference_sim = normalize_distance(reference_dist, s1, s2, weights=(1, 1, 2))
    assert isclose(string_metric.normalized_levenshtein(s1, s2, weights=(1, 1, 2)), reference_sim)
    assert isclose(extractOne_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)
    assert isclose(extract_iter_scorer(s1, s2, string_metric.normalized_levenshtein, weights=(1, 1, 2)), reference_sim)


@given(sentence=st.text())
@settings(max_examples=50, deadline=1000)
def test_multiple_processor_runs(sentence):
    """
    Test that running a preprocessor on a sentence
    a second time does not change the result
    """
    assert utils.default_process(sentence) \
        == utils.default_process(utils.default_process(sentence))


@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS)))
@given(choices=st.lists(st.text(), min_size=1))
@settings(max_examples=50, deadline=1000)
def test_only_identical_strings_extracted(scorer, processor, choices):
    """
    Test that only strings which are identical after preprocessing score 100.

    If two strings are not identical, the full comparison methods should not
    return a perfect (100) match.

    :param scorer:
    :param processor:
    :param choices:
    :return:
    """
    query = random.choice(choices)
    assume(processor(query) != '')

    matches = process.extract(query, choices,
                              scorer=scorer, processor=processor,
                              score_cutoff=100, limit=None)

    assert matches != []

    for match in matches:
        assert processor(query) == processor(match[0])


@given(queries=st.lists(st.text(), min_size=1), choices=st.lists(st.text(), min_size=1))
@settings(max_examples=50, deadline=1000)
def test_cdist(queries, choices):
    """
    Test that cdist returns correct results
    """
    reference_matrix = cdist_distance(queries, choices, scorer=string_metric.levenshtein)
    matrix = process.cdist(queries, choices, scorer=string_metric.levenshtein)
    assert (matrix == reference_matrix).all()

    reference_matrix = cdist_distance(queries, queries, scorer=string_metric.levenshtein)
    matrix = process.cdist(queries, queries, scorer=string_metric.levenshtein)
    assert (matrix == reference_matrix).all()