remove benchmarks already covered by new benchmark suite
This commit is contained in:
parent fa90076b65
commit 171c7d5ff4

Six standalone benchmark scripts are deleted, now superseded by the new suite: Damerau-Levenshtein, indel Levenshtein, Jaro, Jaro-Winkler, OSA, and uniform Levenshtein.
@@ -1,51 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        # best of 7 one-shot runs, normalised to seconds per call
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 2))
count = 1000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
)

time_jellyfish = benchmark(
    "jellyfish",
    "[damerau_levenshtein_distance(a, b) for b in b_list]",
    setup,
    lengths,
    count,
)

df = pandas.DataFrame(
    data={"length": lengths, "rapidfuzz": time_rapidfuzz, "jellyfish": time_jellyfish}
)

df.to_csv("results/levenshtein_damerau.csv", sep=",", index=False)
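Every one of the deleted scripts carries the same "combine benchmarks of scorers into common code base" todo, which the new benchmark suite presumably resolves. As a hedged illustration only — the make_setup/run_suite names and structure are invented here, not taken from the actual suite — a combined harness could look like this:

import timeit

import pandas


def make_setup(imports, length, count):
    # same random-string setup as the deleted scripts, parameterised
    # by the scorer-specific import line (illustrative sketch only)
    return (
        f"{imports}\n"
        "import string\n"
        "import random\n"
        "random.seed(18)\n"
        "characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation\n"
        f"a = ''.join(random.choice(characters) for _ in range({length}))\n"
        f"b_list = [''.join(random.choice(characters) for _ in range({length})) for _ in range({count})]\n"
    )


def benchmark(name, func, imports, lengths, count):
    print(f"starting {name}")
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=make_setup(imports, length, count))
        # best of 7 one-shot runs, normalised to seconds per call
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    return results


def run_suite(scorers, lengths, count, csv_path):
    # scorers maps a column name to (import line, expression to time)
    data = {"length": lengths}
    for name, (imports, func) in scorers.items():
        data[name] = benchmark(name, func, imports, lengths, count)
    pandas.DataFrame(data=data).to_csv(csv_path, sep=",", index=False)


run_suite(
    {
        "rapidfuzz": (
            "from rapidfuzz.distance.DamerauLevenshtein import distance",
            "[distance(a, b) for b in b_list]",
        ),
        "jellyfish": (
            "from jellyfish import damerau_levenshtein_distance",
            "[damerau_levenshtein_distance(a, b) for b in b_list]",
        ),
    },
    lengths=list(range(1, 256, 2)),
    count=1000,
    csv_path="results/levenshtein_damerau.csv",
)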
@@ -1,62 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz import string_metric, process, fuzz
import Levenshtein
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 512, 2))
count = 3000

# weights=(1, 1, 2) prices a substitution like an insert plus a delete,
# i.e. the indel distance underlying Levenshtein.ratio
time_rapidfuzz = benchmark(
    "rapidfuzz",
    "[string_metric.levenshtein(a, b, weights=(1,1,2)) for b in b_list]",
    setup,
    lengths,
    count,
)

# this gets very slow, so only benchmark it for smaller values;
# pad with NaN so the column still has one entry per length
time_python_levenshtein = (
    benchmark(
        "python-Levenshtein",
        "[Levenshtein.ratio(a, b) for b in b_list]",
        setup,
        list(range(1, 256, 2)),
        count,
    )
    + [np.NaN] * 128
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "python-Levenshtein": time_python_levenshtein,
    }
)

df.to_csv("results/levenshtein_indel.csv", sep=",", index=False)
@@ -1,60 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance import Jaro
import jellyfish
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 4))
count = 4000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[Jaro.similarity(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_jellyfish = (
    benchmark(
        "jellyfish",
        "[jellyfish.jaro_similarity(a, b) for b in b_list]",
        setup,
        list(range(1, 128, 4)),
        count,
    )
    + [np.NaN] * 32
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "jellyfish": time_jellyfish,
    }
)

df.to_csv("results/jaro.csv", sep=",", index=False)
@@ -1,60 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance import JaroWinkler
import jellyfish
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 4))
count = 4000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[JaroWinkler.similarity(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_jellyfish = (
    benchmark(
        "jellyfish",
        "[jellyfish.jaro_winkler_similarity(a, b) for b in b_list]",
        setup,
        list(range(1, 128, 4)),
        count,
    )
    + [np.NaN] * 32
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "jellyfish": time_jellyfish,
    }
)

df.to_csv("results/jaro_winkler.csv", sep=",", index=False)
@@ -1,56 +0,0 @@
# todo combine benchmarks of scorers into common code base
import timeit

import numpy as np
import pandas
from tqdm import tqdm


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz.distance.OSA import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 256, 2))
count = 1000

time_rapidfuzz = benchmark(
    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
)

# pyxdameraulevenshtein (which, despite its name, computes the same
# optimal string alignment metric) gets very slow, so only benchmark it
# for small lengths and pad the rest of the column with NaN
time_pyxdameraulevenshtein = benchmark(
    "pyxdameraulevenshtein",
    "[damerau_levenshtein_distance(a, b) for b in b_list]",
    setup,
    list(range(1, 16, 2)),
    count,
) + [np.NaN] * int((256 - 16) / 2)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "pyxdameraulevenshtein": time_pyxdameraulevenshtein,
    }
)

df.to_csv("results/osa.csv", sep=",", index=False)
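A recurring wart in these scripts is the hand-computed NaN padding ([np.NaN] * 128, [np.NaN] * int((256 - 16) / 2)): when a slow library is only measured up to a shorter length, its results column must still contain one entry per value in lengths for the DataFrame to line up. A small helper, sketched here under the hypothetical name pad_results, would derive the pad length instead of hard-coding it:

import numpy as np


def pad_results(results, lengths):
    # hypothetical helper: extend a partially measured column with NaN
    # so it matches len(lengths) row for row
    return results + [np.nan] * (len(lengths) - len(results))

The call above would then read pad_results(benchmark(...), lengths) rather than appending [np.NaN] * int((256 - 16) / 2) by hand (8 measured points padded out to the 128 rows of lengths).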
@@ -1,79 +0,0 @@
import timeit

import numpy as np
import pandas


def benchmark(name, func, setup, lengths, count):
    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in lengths:
        test = timeit.Timer(func, setup=setup.format(length, count))
        results.append(min(test.timeit(number=1) for _ in range(7)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results


setup = """
from rapidfuzz import string_metric
import Levenshtein
import polyleven
import edlib
import editdistance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

lengths = list(range(1, 512, 2))
count = 2000

time_rapidfuzz = benchmark(
    "rapidfuzz",
    "[string_metric.levenshtein(a, b) for b in b_list]",
    setup,
    lengths,
    count,
)

time_polyleven = benchmark(
    "polyleven", "[polyleven.levenshtein(a, b) for b in b_list]", setup, lengths, count
)

# this gets very slow, so only benchmark it for smaller values
time_python_levenshtein = (
    benchmark(
        "python-Levenshtein",
        "[Levenshtein.distance(a, b) for b in b_list]",
        setup,
        list(range(1, 256, 2)),
        count,
    )
    + [np.NaN] * 128
)

time_edlib = benchmark(
    "edlib", "[edlib.align(a, b) for b in b_list]", setup, lengths, count
)

time_editdistance = benchmark(
    "editdistance", "[editdistance.eval(a, b) for b in b_list]", setup, lengths, count
)

df = pandas.DataFrame(
    data={
        "length": lengths,
        "rapidfuzz": time_rapidfuzz,
        "polyleven": time_polyleven,
        "python-Levenshtein": time_python_levenshtein,
        "edlib": time_edlib,
        "editdistance": time_editdistance,
    }
)

df.to_csv("results/levenshtein_uniform.csv", sep=",", index=False)