# RapidFuzz/bench/benchmark_cdist.py
#
# 159 lines
# 4.3 KiB
# Python

import importlib
import random
import string
from timeit import timeit
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Fixed seed: the random words generated in benchmark() are identical on every run.
random.seed(18)

# Global matplotlib font sizes used by the result chart.
plt.rc("font", size=13)  # default text size
plt.rc("axes", titlesize=18, labelsize=15)  # axes title / x and y axis labels
plt.rc(("xtick", "ytick"), labelsize=15)  # tick labels on both axes
plt.rc("legend", fontsize=15)  # legend entries
# Per-scorer value that benchmark() forwards as the ``processor`` argument when
# it builds the timed callables. Insertion order is the order in which the
# scorers are benchmarked and plotted.
# NOTE(review): the flags presumably mirror whether the FuzzyWuzzy scorer of the
# same name preprocesses its inputs by default -- confirm against FuzzyWuzzy.
PROCESSOR = {
    "ratio": False,
    "partial_ratio": False,
    "token_sort_ratio": True,
    "token_set_ratio": True,
    "partial_token_sort_ratio": True,
    "partial_token_set_ratio": True,
    "QRatio": True,
    "WRatio": True,
}

# Names of the scorers to benchmark; each is looked up in both
# ``fuzzywuzzy.fuzz`` and ``rapidfuzz.fuzz``. Derived from PROCESSOR instead of
# being spelled out a second time: benchmark() indexes PROCESSOR with every
# entry of LIBRARIES, so the two collections must never drift apart.
LIBRARIES = tuple(PROCESSOR)
def load_func(target):
    """Resolve a dotted path such as ``"package.module.attr"`` to the object it names.

    Everything before the last dot is imported as a module; the part after it
    is fetched from that module with ``getattr``.

    Raises:
        ValueError: if *target* contains no dot (the split yields one item).
        ImportError: if the module part cannot be imported.
        AttributeError: if the module has no attribute with that name.
    """
    module_path, attr_name = target.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), attr_name)
def get_platform():
    """Describe the running interpreter and host in one line.

    Returns:
        A string of the form ``"Python <version> on <system> (<machine>)"``.
    """
    import platform

    version = platform.python_version()
    # platform.system() / platform.machine() return the matching uname() fields.
    system, machine = platform.system(), platform.machine()
    return f"Python {version} on {system} ({machine})"
def _plot_throughput(labels, fuzz, rfuzz):
    """Show a grouped bar chart of FuzzyWuzzy vs. RapidFuzz throughput per scorer.

    Args:
        labels: scorer names, one per bar group (used as x tick labels).
        fuzz: FuzzyWuzzy throughput (word pairs per second), aligned with *labels*.
        rfuzz: RapidFuzz throughput (word pairs per second), aligned with *labels*.

    Opens the figure with ``plt.show()``.
    """
    # Imported explicitly instead of relying on ``matplotlib.pyplot`` having
    # loaded the ``matplotlib.ticker`` submodule as a side effect.
    from matplotlib.ticker import FuncFormatter

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=(17, 10))
    rects1 = ax.bar(x - width / 2, fuzz, width, label="FuzzyWuzzy", color="xkcd:coral")
    rects2 = ax.bar(x + width / 2, rfuzz, width, label="RapidFuzz", color="#6495ED")

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel("evaluated word pairs [inputs/s]")
    ax.set_xlabel("Scorer")
    ax.set_title(
        "The number of word pairs evaluated per second\n(the larger the better)"
    )
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=30)
    # Render y tick values as integers with thousands separators.
    ax.get_yaxis().set_major_formatter(
        FuncFormatter(lambda value, _pos: format(int(value), ","))
    )
    ax.legend()

    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate(
                f"{int(height):,}",
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha="center",
                va="bottom",
            )

    autolabel(rects1)
    autolabel(rects2)

    fig.tight_layout()
    plt.show()


def benchmark():
    """Compare FuzzyWuzzy scorers called in a Python loop with RapidFuzz ``cdist``.

    Generates 10000 random 8-character alphanumeric words, takes every
    ``len(words) // 100``-th word as the query sample, and for each scorer name
    in ``LIBRARIES`` times one full sample-vs-words comparison with

    * the ``fuzzywuzzy.fuzz`` scorer, invoked pair by pair in nested loops, and
    * the ``rapidfuzz.fuzz`` scorer, passed to ``rapidfuzz.process.cdist``.

    A table of throughput figures is printed and a bar chart is shown.
    Each measurement is a single run (``number=1``).
    """
    alphabet = string.ascii_letters + string.digits  # built once, not per character
    words = [
        "".join(random.choice(alphabet) for _ in range(8)) for _ in range(10000)
    ]
    sample_rate = len(words) // 100
    sample = words[::sample_rate]

    # Number of scorer evaluations performed by one timed run.
    total = len(words) * len(sample)

    print("System:", get_platform())
    print("Words :", len(words))
    print("Sample:", len(sample))
    print(f"Total : {total} calls\n")

    def wrap_cdist(scorer, processor):
        """Return a zero-argument callable running one cdist() over sample x words."""
        from rapidfuzz.process import cdist

        def func():
            cdist(sample, words, scorer=scorer, processor=processor)

        return func

    def wrap_iterate(scorer, processor):
        """Return a zero-argument callable scoring every (query, choice) pair.

        *processor* is accepted only to mirror wrap_cdist(); it is not used,
        the scorer is called with just the two strings.
        """

        def func():
            for query in sample:
                for choice in words:
                    scorer(query, choice)

        return func

    fuzz = []  # FuzzyWuzzy throughput per scorer, in LIBRARIES order
    rfuzz = []  # RapidFuzz throughput per scorer, in LIBRARIES order

    header_list = ["Function", "RapidFuzz", "FuzzyWuzzy", "SpeedImprovement"]
    row_format = "{:>25}" * len(header_list)
    print(row_format.format(*header_list))

    for target in LIBRARIES:
        processor = PROCESSOR[target]

        scorer = load_func("fuzzywuzzy.fuzz." + target)
        sec = timeit(wrap_iterate(scorer, processor), number=1)
        calls = total / sec
        fuzz.append(calls)

        rscorer = load_func("rapidfuzz.fuzz." + target)
        rsec = timeit(wrap_cdist(rscorer, processor), number=1)
        rcalls = total / rsec
        rfuzz.append(rcalls)

        # ``calls`` / ``rcalls`` are floats; convert after the floor division so
        # the table shows e.g. "1234k" instead of "1234.0k".
        print(
            row_format.format(
                target,
                f"{int(rcalls // 1000)}k",
                f"{int(calls // 1000)}k",
                # FuzzyWuzzy time relative to RapidFuzz time, in percent.
                f"{int(100 * sec / rsec)}%",
            )
        )

    _plot_throughput(LIBRARIES, fuzz, rfuzz)
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    benchmark()