diff --git a/bench/benchmark_damerau_levenshtein.py b/bench/benchmark_damerau_levenshtein.py
deleted file mode 100644
index c27eaf2..0000000
--- a/bench/benchmark_damerau_levenshtein.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# todo combine benchmarks of scorers into common code base
-import timeit
-
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    from tqdm import tqdm
-
-    for length in tqdm(lengths):
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz.distance.DamerauLevenshtein import distance
-from jellyfish import damerau_levenshtein_distance
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 256, 2))
-count = 1000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
-)
-
-time_jellyfish = benchmark(
-    "jellyfish",
-    "[damerau_levenshtein_distance(a, b) for b in b_list]",
-    setup,
-    lengths,
-    count,
-)
-
-df = pandas.DataFrame(
-    data={"length": lengths, "rapidfuzz": time_rapidfuzz, "jellyfish": time_jellyfish}
-)
-
-df.to_csv("results/levenshtein_damerau.csv", sep=",", index=False)
diff --git a/bench/benchmark_indel_levenshtein.py b/bench/benchmark_indel_levenshtein.py
deleted file mode 100644
index afe2f82..0000000
--- a/bench/benchmark_indel_levenshtein.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# todo combine benchmarks of scorers into common code base
-import timeit
-
-import numpy as np
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    for length in lengths:
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz import string_metric, process, fuzz
-import Levenshtein
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 512, 2))
-count = 3000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz",
-    "[string_metric.levenshtein(a, b, weights=(1,1,2)) for b in b_list]",
-    setup,
-    lengths,
-    count,
-)
-
-# this gets very slow, so only benchmark it for smaller values
-time_python_levenshtein = (
-    benchmark(
-        "python-Levenshtein",
-        "[Levenshtein.ratio(a, b) for b in b_list]",
-        setup,
-        list(range(1, 256, 2)),
-        count,
-    )
-    + [np.NaN] * 128
-)
-
-df = pandas.DataFrame(
-    data={
-        "length": lengths,
-        "rapidfuzz": time_rapidfuzz,
-        "python-Levenshtein": time_python_levenshtein,
-    }
-)
-
-df.to_csv("results/levenshtein_indel.csv", sep=",", index=False)
diff --git a/bench/benchmark_jaro.py b/bench/benchmark_jaro.py
deleted file mode 100644
index fbdf83e..0000000
--- a/bench/benchmark_jaro.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# todo combine benchmarks of scorers into common code base
-import timeit
-
-import numpy as np
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    from tqdm import tqdm
-
-    for length in tqdm(lengths):
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz.distance import Jaro
-import jellyfish
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 256, 4))
-count = 4000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz", "[Jaro.similarity(a, b) for b in b_list]", setup, lengths, count
-)
-
-# this gets very slow, so only benchmark it for smaller values
-time_jellyfish = (
-    benchmark(
-        "jellyfish",
-        "[jellyfish.jaro_similarity(a, b) for b in b_list]",
-        setup,
-        list(range(1, 128, 4)),
-        count,
-    )
-    + [np.NaN] * 32
-)
-
-df = pandas.DataFrame(
-    data={
-        "length": lengths,
-        "rapidfuzz": time_rapidfuzz,
-        "jellyfish": time_jellyfish,
-    }
-)
-
-df.to_csv("results/jaro.csv", sep=",", index=False)
diff --git a/bench/benchmark_jaro_winkler.py b/bench/benchmark_jaro_winkler.py
deleted file mode 100644
index 7561afd..0000000
--- a/bench/benchmark_jaro_winkler.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# todo combine benchmarks of scorers into common code base
-import timeit
-
-import numpy as np
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    from tqdm import tqdm
-
-    for length in tqdm(lengths):
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz.distance import JaroWinkler
-import jellyfish
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 256, 4))
-count = 4000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz", "[JaroWinkler.similarity(a, b) for b in b_list]", setup, lengths, count
-)
-
-# this gets very slow, so only benchmark it for smaller values
-time_jellyfish = (
-    benchmark(
-        "jellyfish",
-        "[jellyfish.jaro_winkler_similarity(a, b) for b in b_list]",
-        setup,
-        list(range(1, 128, 4)),
-        count,
-    )
-    + [np.NaN] * 32
-)
-
-df = pandas.DataFrame(
-    data={
-        "length": lengths,
-        "rapidfuzz": time_rapidfuzz,
-        "jellyfish": time_jellyfish,
-    }
-)
-
-df.to_csv("results/jaro_winkler.csv", sep=",", index=False)
diff --git a/bench/benchmark_osa.py b/bench/benchmark_osa.py
deleted file mode 100644
index c2ed8f6..0000000
--- a/bench/benchmark_osa.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# todo combine benchmarks of scorers into common code base
-import timeit
-
-import numpy as np
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    from tqdm import tqdm
-
-    for length in tqdm(lengths):
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz.distance.OSA import distance
-from pyxdameraulevenshtein import damerau_levenshtein_distance
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 256, 2))
-count = 1000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
-)
-
-time_pyxdameraulevenshtein = benchmark(
-    "pyxdameraulevenshtein",
-    "[damerau_levenshtein_distance(a, b) for b in b_list]",
-    setup,
-    list(range(1, 16, 2)),
-    count,
-) + [np.NaN] * int((256 - 16) / 2)
-
-df = pandas.DataFrame(
-    data={
-        "length": lengths,
-        "rapidfuzz": time_rapidfuzz,
-        "pyxdameraulevenshtein": time_pyxdameraulevenshtein,
-    }
-)
-
-df.to_csv("results/osa.csv", sep=",", index=False)
diff --git a/bench/benchmark_uniform_levenshtein.py b/bench/benchmark_uniform_levenshtein.py
deleted file mode 100644
index 0ac5cd3..0000000
--- a/bench/benchmark_uniform_levenshtein.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import timeit
-
-import numpy as np
-import pandas
-
-
-def benchmark(name, func, setup, lengths, count):
-    print(f"starting {name}")
-    start = timeit.default_timer()
-    results = []
-    for length in lengths:
-        test = timeit.Timer(func, setup=setup.format(length, count))
-        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
-    stop = timeit.default_timer()
-    print(f"finished {name}, Runtime: ", stop - start)
-    return results
-
-
-setup = """
-from rapidfuzz import string_metric
-import Levenshtein
-import polyleven
-import edlib
-import editdistance
-import string
-import random
-random.seed(18)
-characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
-a = ''.join(random.choice(characters) for _ in range({0}))
-b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
-"""
-
-lengths = list(range(1, 512, 2))
-count = 2000
-
-time_rapidfuzz = benchmark(
-    "rapidfuzz",
-    "[string_metric.levenshtein(a, b) for b in b_list]",
-    setup,
-    lengths,
-    count,
-)
-
-time_polyleven = benchmark(
-    "polyleven", "[polyleven.levenshtein(a, b) for b in b_list]", setup, lengths, count
-)
-
-# this gets very slow, so only benchmark it for smaller values
-time_python_levenshtein = (
-    benchmark(
-        "python-Levenshtein",
-        "[Levenshtein.distance(a, b) for b in b_list]",
-        setup,
-        list(range(1, 256, 2)),
-        count,
-    )
-    + [np.NaN] * 128
-)
-
-time_edlib = benchmark(
-    "edlib", "[edlib.align(a, b) for b in b_list]", setup, lengths, count
-)
-
-time_editdistance = benchmark(
-    "editdistance", "[editdistance.eval(a, b) for b in b_list]", setup, lengths, count
-)
-
-df = pandas.DataFrame(
-    data={
-        "length": lengths,
-        "rapidfuzz": time_rapidfuzz,
-        "polyleven": time_polyleven,
-        "python-Levenshtein": time_python_levenshtein,
-        "edlib": time_edlib,
-        "editdistance": time_editdistance,
-    }
-)
-
-df.to_csv("results/levenshtein_uniform.csv", sep=",", index=False)
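
The six deleted scripts are near-duplicates: most carry the same `# todo combine benchmarks of scorers into common code base` comment, and they differ only in the imports inside their `setup` string, the timed statement, the length range, and the output CSV. The sketch below shows what such a consolidated harness could look like, keeping the measurement strategy of the deleted files (best of 7 runs per length, divided by the number of comparisons). The `Case` dataclass, the `make_setup()` and `run()` helpers, and their names are illustrative assumptions, not the actual code that replaced these scripts.

```python
# Hypothetical combined benchmark harness -- a sketch, not the real replacement.
import timeit
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas


@dataclass
class Case:
    name: str        # column name in the output CSV
    imports: str     # import statement(s) needed by the timed snippet
    func: str        # statement passed to timeit, e.g. "[distance(a, b) for b in b_list]"
    max_length: Optional[int] = None  # cut off libraries that get very slow


def make_setup(imports: str) -> str:
    # Returns a setup template with the same positional placeholders the
    # deleted scripts used: {0} = string length, {1} = number of strings.
    return f"""
{imports}
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({{0}}))
b_list = [''.join(random.choice(characters) for _ in range({{0}})) for _ in range({{1}})]
"""


def benchmark(name, func, setup, lengths, count):
    # Identical to the helper repeated in every deleted script.
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


def run(cases, lengths, count, csv_path):
    data = {"length": lengths}
    for case in cases:
        case_lengths = [n for n in lengths if case.max_length is None or n < case.max_length]
        times = benchmark(case.name, case.func, make_setup(case.imports), case_lengths, count)
        # Pad the columns of libraries that were only measured for shorter
        # strings, replacing the hand-written "+ [np.NaN] * ..." lists.
        times += [np.nan] * (len(lengths) - len(case_lengths))
        data[case.name] = times
    pandas.DataFrame(data=data).to_csv(csv_path, sep=",", index=False)


if __name__ == "__main__":
    # Reproduces bench/benchmark_jaro.py with the shared harness.
    run(
        cases=[
            Case("rapidfuzz", "from rapidfuzz.distance import Jaro",
                 "[Jaro.similarity(a, b) for b in b_list]"),
            Case("jellyfish", "import jellyfish",
                 "[jellyfish.jaro_similarity(a, b) for b in b_list]",
                 max_length=128),  # gets very slow on longer strings
        ],
        lengths=list(range(1, 256, 4)),
        count=4000,
        csv_path="results/jaro.csv",
    )
```

Under this layout, each deleted script collapses to a single `run(...)` call, and the per-library `max_length` cutoff subsumes the NaN padding that each file previously computed by hand.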