# RapidFuzz/bench/benchmark_cdist.py

import importlib
import random
import string
from timeit import timeit

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Fixed seed so the randomly generated benchmark words are reproducible.
random.seed(18)

# Chart typography, applied globally through matplotlib's rc settings.
plt.rc("font", size=13)  # default text size
plt.rc("axes", titlesize=18, labelsize=15)  # axes title / x and y labels
plt.rc(("xtick", "ytick"), labelsize=15)  # tick labels on both axes
plt.rc("legend", fontsize=15)  # legend entries

# Scorer name -> value handed to ``cdist`` as its ``processor`` argument.
PROCESSOR = dict(
    ratio=False,
    partial_ratio=False,
    token_sort_ratio=True,
    token_set_ratio=True,
    partial_token_sort_ratio=True,
    partial_token_set_ratio=True,
    QRatio=True,
    WRatio=True,
)

# Names of the scorers to benchmark, in the order they are run and plotted.
# Derived from PROCESSOR's insertion order so the two can never drift apart.
LIBRARIES = tuple(PROCESSOR)


def load_func(target):
    """Resolve a dotted path such as ``"package.module.name"`` to an object.

    Everything before the last dot is imported as a module; the part after
    it is looked up as an attribute of that module and returned.
    """
    module_path, attribute = target.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), attribute)


def get_platform():
    """Return a one-line summary of the interpreter version, OS and CPU type."""
    from platform import python_version, uname

    info = uname()
    return "Python {} on {} ({})".format(
        python_version(), info.system, info.machine
    )


def _random_words(count, length=8):
    """Return *count* random strings of *length* ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return [
        "".join(random.choice(alphabet) for _ in range(length))
        for _ in range(count)
    ]


def _time_single_run(func):
    """Return the seconds taken by exactly one call of *func*."""
    return timeit("func()", globals={"func": func}, number=1)


def _plot_throughput(labels, fuzz, rfuzz):
    """Show a grouped bar chart of word pairs per second for both libraries.

    *labels* are the scorer names; *fuzz* and *rfuzz* hold the FuzzyWuzzy and
    RapidFuzz throughput for each label, in the same order.
    """
    # Imported explicitly: ``import matplotlib`` alone does not promise that
    # the ``matplotlib.ticker`` submodule is available as an attribute.
    from matplotlib.ticker import FuncFormatter

    positions = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots(figsize=(17, 10))
    bar_groups = (
        ax.bar(
            positions - width / 2,
            fuzz,
            width,
            label="FuzzyWuzzy",
            color="xkcd:coral",
        ),
        ax.bar(
            positions + width / 2,
            rfuzz,
            width,
            label="RapidFuzz",
            color="#6495ED",
        ),
    )

    ax.set_ylabel("evaluated word pairs [inputs/s]")
    ax.set_xlabel("Scorer")
    ax.set_title(
        "The number of word pairs evaluated per second\n(the larger the better)"
    )
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=30)
    # Thousands separators on the y axis, e.g. 1,250,000.
    ax.get_yaxis().set_major_formatter(
        FuncFormatter(lambda value, _pos: format(int(value), ","))
    )
    ax.legend()

    # Write each bar's height just above it.
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            ax.annotate(
                f"{int(height):,}",
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha="center",
                va="bottom",
            )

    fig.tight_layout()
    plt.show()


def benchmark():
    """Compare RapidFuzz ``cdist`` with a plain FuzzyWuzzy scorer loop.

    Generates 10,000 random 8-character words, uses every 100th word as a
    query, and for each scorer named in ``LIBRARIES`` times one full pass of
    all query/word pairs with both libraries. A results table is printed to
    stdout and the throughput is shown as a bar chart.
    """
    words = _random_words(10000)
    sample_rate = len(words) // 100
    sample = words[::sample_rate]
    total = len(words) * len(sample)

    print("System:", get_platform())
    print("Words :", len(words))
    print("Sample:", len(sample))
    print(f"Total : {total} calls\n")

    def wrap_cdist(scorer, processor):
        from rapidfuzz.process import cdist

        def func():
            cdist(sample, words, scorer=scorer, processor=processor)

        return func

    def wrap_iterate(scorer):
        # NOTE(review): the FuzzyWuzzy scorer runs with its default arguments;
        # PROCESSOR presumably mirrors those defaults for the RapidFuzz side
        # so both libraries do comparable work -- confirm.
        def func():
            for query in sample:
                for choice in words:
                    scorer(query, choice)

        return func

    fuzz = []
    rfuzz = []

    header_list = ["Function", "RapidFuzz", "FuzzyWuzzy", "SpeedImprovement"]
    row_format = "{:>25}" * len(header_list)
    print(row_format.format(*header_list))
    for target in LIBRARIES:
        scorer = load_func("fuzzywuzzy.fuzz." + target)
        sec = _time_single_run(wrap_iterate(scorer))
        calls = total / sec
        fuzz.append(calls)

        # NOTE(review): PROCESSOR holds booleans; check that the installed
        # RapidFuzz version still accepts a boolean ``processor``.
        rscorer = load_func("rapidfuzz.fuzz." + target)
        rsec = _time_single_run(wrap_cdist(rscorer, PROCESSOR[target]))
        rcalls = total / rsec
        rfuzz.append(rcalls)

        # int() keeps the columns free of a trailing ".0" ("1234k", not
        # "1234.0k"), since floor-dividing a float still yields a float.
        print(
            row_format.format(
                target,
                f"{int(rcalls // 1000)}k",
                f"{int(calls // 1000)}k",
                f"{int(100 * sec / rsec)}%",
            )
        )

    _plot_throughput(LIBRARIES, fuzz, rfuzz)


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    benchmark()