remove benchmarks already covered by new benchmark suite
This commit is contained in:
parent fa90076b65
commit 171c7d5ff4

Six standalone benchmark scripts are deleted, now superseded by the new suite: Damerau-Levenshtein, indel Levenshtein, Jaro, Jaro-Winkler, OSA, and uniform Levenshtein.
@@ -1,51 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        # best of 7 one-shot runs, normalised to seconds per call
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 2))
count = 1000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
)

time_jellyfish = benchmark(
    "jellyfish",
    "[damerau_levenshtein_distance(a, b) for b in b_list]",
    setup,
    lengths,
    count,
)

df = pandas.DataFrame(
    data={"length": lengths, "rapidfuzz": time_rapidfuzz, "jellyfish": time_jellyfish}
)

df.to_csv("results/levenshtein_damerau.csv", sep=",", index=False)
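Every one of the deleted scripts carries the same "combine benchmarks of scorers into common code base" todo, which the new benchmark suite presumably resolves. As a hedged illustration only — the make_setup/run_suite names and structure are invented here, not taken from the actual suite — a combined harness could look like this:

import timeit

import pandas


def make_setup(imports, length, count):
    # same random-string setup as the deleted scripts, parameterised
    # by the scorer-specific import line (illustrative sketch only)
    return (
        f"{imports}\n"
        "import string\n"
        "import random\n"
        "random.seed(18)\n"
        "characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation\n"
        f"a = ''.join(random.choice(characters) for _ in range({length}))\n"
        f"b_list = [''.join(random.choice(characters) for _ in range({length})) for _ in range({count})]\n"
    )


def benchmark(name, func, imports, lengths, count):
    print(f"starting {name}")
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=make_setup(imports, length, count))
        # best of 7 one-shot runs, normalised to seconds per call
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    return results


def run_suite(scorers, lengths, count, csv_path):
    # scorers maps a column name to (import line, expression to time)
    data = {"length": lengths}
    for name, (imports, func) in scorers.items():
        data[name] = benchmark(name, func, imports, lengths, count)
    pandas.DataFrame(data=data).to_csv(csv_path, sep=",", index=False)


run_suite(
    {
        "rapidfuzz": (
            "from rapidfuzz.distance.DamerauLevenshtein import distance",
            "[distance(a, b) for b in b_list]",
        ),
        "jellyfish": (
            "from jellyfish import damerau_levenshtein_distance",
            "[damerau_levenshtein_distance(a, b) for b in b_list]",
        ),
    },
    lengths=list(range(1, 256, 2)),
    count=1000,
    csv_path="results/levenshtein_damerau.csv",
)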
@@ -1,62 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz import string_metric, process, fuzz
import Levenshtein
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 512, 2))
count = 3000

# weights=(1, 1, 2) prices a substitution like an insert plus a delete,
# i.e. the indel distance underlying Levenshtein.ratio
time_rapidfuzz = benchmark(
    "rapidfuzz",
    "[string_metric.levenshtein(a, b, weights=(1,1,2)) for b in b_list]",
    setup,
    lengths,
    count,
)

# this gets very slow, so only benchmark it for smaller values;
# pad with NaN so the column still has one entry per length
time_python_levenshtein = (
    benchmark(
        "python-Levenshtein",
        "[Levenshtein.ratio(a, b) for b in b_list]",
        setup,
        list(range(1, 256, 2)),
        count,
    )
    + [np.NaN] * 128
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "python-Levenshtein": time_python_levenshtein,
    }
)

df.to_csv("results/levenshtein_indel.csv", sep=",", index=False)
@@ -1,60 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance import Jaro
import jellyfish
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 4))
count = 4000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[Jaro.similarity(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_jellyfish = (
    benchmark(
        "jellyfish",
        "[jellyfish.jaro_similarity(a, b) for b in b_list]",
        setup,
        list(range(1, 128, 4)),
        count,
    )
    + [np.NaN] * 32
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "jellyfish": time_jellyfish,
    }
)

df.to_csv("results/jaro.csv", sep=",", index=False)
@@ -1,60 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance import JaroWinkler
import jellyfish
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 4))
count = 4000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[JaroWinkler.similarity(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_jellyfish = (
    benchmark(
        "jellyfish",
        "[jellyfish.jaro_winkler_similarity(a, b) for b in b_list]",
        setup,
        list(range(1, 128, 4)),
        count,
    )
    + [np.NaN] * 32
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "jellyfish": time_jellyfish,
    }
)

df.to_csv("results/jaro_winkler.csv", sep=",", index=False)
@@ -1,56 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance.OSA import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 2))
count = 1000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
)

# pyxdameraulevenshtein (which, despite its name, computes the same
# optimal string alignment metric) gets very slow, so only benchmark it
# for small lengths and pad the rest of the column with NaN
time_pyxdameraulevenshtein = benchmark(
    "pyxdameraulevenshtein",
    "[damerau_levenshtein_distance(a, b) for b in b_list]",
    setup,
    list(range(1, 16, 2)),
    count,
) + [np.NaN] * int((256 - 16) / 2)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "pyxdameraulevenshtein": time_pyxdameraulevenshtein,
    }
)

df.to_csv("results/osa.csv", sep=",", index=False)
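A recurring wart in these scripts is the hand-computed NaN padding ([np.NaN] * 128, [np.NaN] * int((256 - 16) / 2)): when a slow library is only measured up to a shorter length, its results column must still contain one entry per value in lengths for the DataFrame to line up. A small helper, sketched here under the hypothetical name pad_results, would derive the pad length instead of hard-coding it:

import numpy as np


def pad_results(results, lengths):
    # hypothetical helper: extend a partially measured column with NaN
    # so it matches len(lengths) row for row
    return results + [np.nan] * (len(lengths) - len(results))

The call above would then read pad_results(benchmark(...), lengths) rather than appending [np.NaN] * int((256 - 16) / 2) by hand (8 measured points padded out to the 128 rows of lengths).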
@@ -1,79 +0,0 @@
import timeit

import numpy as np
import pandas


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz import string_metric
import Levenshtein
import polyleven
import edlib
import editdistance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 512, 2))
count = 2000

time_rapidfuzz = benchmark(
    "rapidfuzz",
    "[string_metric.levenshtein(a, b) for b in b_list]",
    setup,
    lengths,
    count,
)

time_polyleven = benchmark(
    "polyleven", "[polyleven.levenshtein(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_python_levenshtein = (
    benchmark(
        "python-Levenshtein",
        "[Levenshtein.distance(a, b) for b in b_list]",
        setup,
        list(range(1, 256, 2)),
        count,
    )
    + [np.NaN] * 128
)

time_edlib = benchmark(
    "edlib", "[edlib.align(a, b) for b in b_list]", setup, lengths, count
)

time_editdistance = benchmark(
    "editdistance", "[editdistance.eval(a, b) for b in b_list]", setup, lengths, count
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "polyleven": time_polyleven,
        "python-Levenshtein": time_python_levenshtein,
        "edlib": time_edlib,
        "editdistance": time_editdistance,
    }
)

df.to_csv("results/levenshtein_uniform.csv", sep=",", index=False)