add benchmark
This commit is contained in:
parent
5b11f9ca26
commit
54609c7508
Binary file not shown.
After Width: | Height: | Size: 28 KiB |
Binary file not shown.
After Width: | Height: | Size: 28 KiB |
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
|
@ -1 +1,18 @@
|
|||
Will be added soon
|
||||
# Benchmarks
|
||||
|
||||
To compare the speed of FuzzyWuzzy and RapidFuzz, the benchmark suite of FuzzyWuzzy is used.
|
||||
Each benchmark is therefore always executed for both FuzzyWuzzy and RapidFuzz.
|
||||
Afterwards, the ratio between the two runtimes is calculated. The benchmark can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/python/bench). The results of the benchmarks are visualised below.
|
||||
|
||||
## fuzz.ratio
|
||||
|
||||
![](.github/fuzz.ratio.png)
|
||||
|
||||
|
||||
## fuzz.partial_ratio
|
||||
|
||||
![](.github/fuzz.partial_ratio.png)
|
||||
|
||||
## fuzz.WRatio
|
||||
|
||||
![](.github/fuzz.WRatio.png)
|
|
@ -34,7 +34,7 @@
|
|||
## Description
|
||||
RapidFuzz is a fast string matching library for Python and C++, which is using the string similarity calculations from [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy). However there are two aspects that set RapidFuzz apart from FuzzyWuzzy:
|
||||
1) It is MIT licensed, so it can be used with whichever license you choose for your project, while you're forced to adopt the GPLv2 license when using FuzzyWuzzy
|
||||
2) It is mostly written in C++ and on top of this comes with a lot of Algorithmic improvements to make string matching even faster, while still providing the same results. These changes result in a 5-100x Speedup in String Matching. More details on benchmark results can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/Benchmarks.md)
|
||||
2) It is mostly written in C++ and on top of this comes with a lot of Algorithmic improvements to make string matching even faster, while still providing the same results. These changes result in a 2-100x Speedup in String Matching. More details on benchmark results can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/Benchmarks.md)
|
||||
|
||||
|
||||
## Installation
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
from timeit import timeit
|
||||
import math
|
||||
import csv
|
||||
|
||||
# Base number of timeit repetitions; individual benchmarks scale it down.
iterations = 100000

# Load the sample titles used by the "real world" benchmark. The file is
# pipe-delimited and must provide a 'custom_title' column. The ``with`` block
# closes the file handle as soon as the rows are read, and newline='' is what
# the csv module asks for so that line breaks inside quoted fields are
# handled by the parser rather than by the file object.
with open('titledata.csv', newline='') as title_file:
    reader = csv.DictReader(title_file, delimiter='|')
    titles = [row['custom_title'] for row in reader]
title_blob = '\n'.join(titles)
|
||||
|
||||
|
||||
# Candidate strings that the core scorer benchmarks compare against the
# fixed query 'cirque du soleil'.
cirque_strings = [
    'cirque du soleil - zarkana - las vegas',
    'cirque du soleil ',
    'cirque du soleil',
    'cirque du soleil las vegas',
    'zarkana las vegas',
    'las vegas cirque du soleil at the bellagio',
    'zarakana - cirque du soleil - bellagio',
]
|
||||
|
||||
# Sample choice list mixing empty strings and None entries with real titles.
# NOTE(review): nothing visible in this script reads this list -- the
# process.* benchmarks build their own ``choices`` inside the timeit setup.
choices = [
    '',
    'new york yankees vs boston red sox',
    '',
    'zarakana - cirque du soleil - bellagio',
    None,
    'cirque du soleil las vegas',
    None,
]
|
||||
|
||||
# Strings mixing plain ASCII, an escaped quote and non-ASCII code points.
# NOTE(review): not referenced by any benchmark visible in this script.
mixed_strings = [
    'Lorem Ipsum is simply dummy text of the printing and typesetting industry.',
    "C\\'est la vie",
    'Ça va?',
    'Cães danados',
    '\xacCamarões assados',
    'a\xac\u1234\u20ac\U00008000',
]
|
||||
|
||||
# Setup-code template for timeit: '{}' is filled with the package name
# ("fuzzywuzzy" or "rapidfuzz") so the same statement runs against either.
common_setup = "from {} import fuzz, process; "
|
||||
|
||||
|
||||
def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
    """
    Time one statement against FuzzyWuzzy and RapidFuzz and print the results.

    ``setup`` is a template: ``setup.format(<package name>)`` is used as the
    timeit setup code, first with "fuzzywuzzy" and then with "rapidfuzz".
    For each package one line with the total and the average runtime is
    printed, followed by the FuzzyWuzzy/RapidFuzz runtime ratio and an
    empty line.

    :param stmt: statement handed to ``timeit``
    :param setup: setup-code template; a ``{}`` placeholder is replaced by
        the package name
    :param number: how often ``stmt`` is executed per package; truncated to
        an int, so float values such as ``iterations / 100`` are accepted
    :raises ZeroDivisionError: if ``number`` truncates to 0
    """
    units = ["s", "ms", "us", "ns"]
    # Use the truncated count for the average as well, so the reported
    # average matches the number of runs that were actually timed.
    runs = int(number)

    def measure(package):
        """Time ``stmt`` for one package, print its line, return the total."""
        total = timeit(stmt, setup.format(package), number=runs)
        average = total / float(runs)
        if average > 0:
            magnitude = -int(math.floor(math.log(average, 1000)))
        else:
            # log() is undefined for 0; report in the smallest unit instead.
            magnitude = len(units) - 1
        # Clamp so that averages of 1000s and more, or below 1ns, cannot
        # select a wrong unit or index past the end of the unit table.
        magnitude = min(max(magnitude, 0), len(units) - 1)
        print("Total time: %fs. Average run: %.3f%s." % (
            total, average * (1000 ** magnitude), units[magnitude]))
        return total

    duration_fuzzywuzzy = measure("fuzzywuzzy")
    duration_rapidfuzz = measure("rapidfuzz")

    # A timer with coarse resolution can report 0 for a very fast run.
    if duration_rapidfuzz > 0:
        relative_duration = duration_fuzzywuzzy / duration_rapidfuzz
    else:
        relative_duration = float("inf")
    print("RapidFuzz is %.3f times faster than FuzzyWuzzy" % relative_duration)

    print()
|
||||
|
||||
|
||||
# Benchmark the core scorers: every scorer is run against every candidate
# string, always with the same fixed query.
for scorer in ('ratio', 'partial_ratio', 'WRatio'):
    for s in cirque_strings:
        print('Test fuzz.%s for string: "%s"' % (scorer, s))
        print('-------------------------------')
        # %r emits a valid Python literal, so a candidate containing a quote
        # or a backslash cannot break the generated statement.
        print_result_from_timeit(
            'fuzz.%s(%r, %r)' % (scorer, 'cirque du soleil', s),
            common_setup, number=iterations // 100)
|
||||
|
||||
# Benchmark the process.* helpers against 5000 random 30-character choices.
# The labels print the query that is actually searched for, not whatever
# value an earlier loop variable happens to hold.
process_query = 'cirque du soleil'
# Appended to the common setup template; the fixed seed gives both packages
# the same generated choices.
process_setup = (
    common_setup + " import string,random; random.seed(18);"
    " choices = [\'\'.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]"
)
for call, extra_args in (
        ('process.extract', 'scorer = fuzz.QRatio'),
        ('process.extract', 'scorer = fuzz.WRatio'),
        ('process.extractOne', 'scorer = fuzz.WRatio'),
        ('process.extractOne', 'scorer = fuzz.WRatio, score_cutoff=90'),
):
    print('Test %s(%s) for string: "%s"' % (call, extra_args, process_query))
    print('-------------------------------')
    print_result_from_timeit(
        '%s(%r, choices, %s)' % (call, process_query, extra_args),
        process_setup, number=10)
|
||||
|
||||
|
||||
# "Real world" benchmark: sort the titles loaded from titledata.csv by their
# fuzz.ratio against a single query.
s = 'New York Yankees'

# The title data is embedded with %r so it reaches timeit as a single-line
# literal. timeit re-indents multi-line statements by inserting spaces after
# every newline; with a raw triple-quoted blob those spaces would also land
# inside the string and be prepended to the titles. %r additionally keeps
# quotes and backslashes in the titles from breaking the generated source.
test = ''.join([
    'import functools\n',
    'title_blob = %r\n' % title_blob,
    'title_blob = title_blob.strip()\n',
    'titles = title_blob.split("\\n")\n',
    'prepared_ratio = functools.partial(fuzz.ratio, %r)\n' % s,
    'titles.sort(key=prepared_ratio)\n',
])

print('Real world ratio(): "%s"' % s)
print('-------------------------------')
print_result_from_timeit(test,
                         common_setup,
                         number=100)
|
|
@ -0,0 +1,156 @@
|
|||
Test fuzz.ratio for string: "cirque du soleil - zarkana - las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.002205s. Average run: 2.205us.
|
||||
Total time: 0.000522s. Average run: 522.423ns.
|
||||
RapidFuzz is 4.221 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "cirque du soleil "
|
||||
-------------------------------
|
||||
Total time: 0.002163s. Average run: 2.163us.
|
||||
Total time: 0.000516s. Average run: 515.595ns.
|
||||
RapidFuzz is 4.194 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "cirque du soleil"
|
||||
-------------------------------
|
||||
Total time: 0.000332s. Average run: 331.925ns.
|
||||
Total time: 0.000452s. Average run: 451.798ns.
|
||||
RapidFuzz is 0.735 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "cirque du soleil las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.002166s. Average run: 2.166us.
|
||||
Total time: 0.000481s. Average run: 481.355ns.
|
||||
RapidFuzz is 4.500 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "zarkana las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.002404s. Average run: 2.404us.
|
||||
Total time: 0.000701s. Average run: 701.098ns.
|
||||
RapidFuzz is 3.428 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "las vegas cirque du soleil at the bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.002753s. Average run: 2.753us.
|
||||
Total time: 0.001151s. Average run: 1.151us.
|
||||
RapidFuzz is 2.393 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.ratio for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.002731s. Average run: 2.731us.
|
||||
Total time: 0.001092s. Average run: 1.092us.
|
||||
RapidFuzz is 2.500 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "cirque du soleil - zarkana - las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.004042s. Average run: 4.042us.
|
||||
Total time: 0.000834s. Average run: 833.545ns.
|
||||
RapidFuzz is 4.850 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "cirque du soleil "
|
||||
-------------------------------
|
||||
Total time: 0.003888s. Average run: 3.888us.
|
||||
Total time: 0.000595s. Average run: 594.831ns.
|
||||
RapidFuzz is 6.536 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "cirque du soleil"
|
||||
-------------------------------
|
||||
Total time: 0.000336s. Average run: 335.991ns.
|
||||
Total time: 0.000587s. Average run: 586.752ns.
|
||||
RapidFuzz is 0.573 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "cirque du soleil las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.003847s. Average run: 3.847us.
|
||||
Total time: 0.000746s. Average run: 746.272ns.
|
||||
RapidFuzz is 5.155 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "zarkana las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.010170s. Average run: 10.170us.
|
||||
Total time: 0.002544s. Average run: 2.544us.
|
||||
RapidFuzz is 3.998 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "las vegas cirque du soleil at the bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.005372s. Average run: 5.372us.
|
||||
Total time: 0.002900s. Average run: 2.900us.
|
||||
RapidFuzz is 1.852 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.partial_ratio for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.005208s. Average run: 5.208us.
|
||||
Total time: 0.002584s. Average run: 2.584us.
|
||||
RapidFuzz is 2.016 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "cirque du soleil - zarkana - las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.040031s. Average run: 40.031us.
|
||||
Total time: 0.001491s. Average run: 1.491us.
|
||||
RapidFuzz is 26.851 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "cirque du soleil "
|
||||
-------------------------------
|
||||
Total time: 0.008991s. Average run: 8.991us.
|
||||
Total time: 0.000563s. Average run: 562.574ns.
|
||||
RapidFuzz is 15.981 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "cirque du soleil"
|
||||
-------------------------------
|
||||
Total time: 0.008759s. Average run: 8.759us.
|
||||
Total time: 0.000503s. Average run: 502.509ns.
|
||||
RapidFuzz is 17.430 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "cirque du soleil las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.038304s. Average run: 38.304us.
|
||||
Total time: 0.001285s. Average run: 1.285us.
|
||||
RapidFuzz is 29.812 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "zarkana las vegas"
|
||||
-------------------------------
|
||||
Total time: 0.021282s. Average run: 21.282us.
|
||||
Total time: 0.001889s. Average run: 1.889us.
|
||||
RapidFuzz is 11.264 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "las vegas cirque du soleil at the bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.043843s. Average run: 43.843us.
|
||||
Total time: 0.004232s. Average run: 4.232us.
|
||||
RapidFuzz is 10.359 times faster than FuzzyWuzzy
|
||||
|
||||
Test fuzz.WRatio for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.038362s. Average run: 38.362us.
|
||||
Total time: 0.003854s. Average run: 3.854us.
|
||||
RapidFuzz is 9.953 times faster than FuzzyWuzzy
|
||||
|
||||
Test process.extract(scorer = fuzz.QRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 0.373829s. Average run: 37.383ms.
|
||||
Total time: 0.069651s. Average run: 6.965ms.
|
||||
RapidFuzz is 5.367 times faster than FuzzyWuzzy
|
||||
|
||||
Test process.extract(scorer = fuzz.WRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 2.659891s. Average run: 265.989ms.
|
||||
Total time: 0.262201s. Average run: 26.220ms.
|
||||
RapidFuzz is 10.144 times faster than FuzzyWuzzy
|
||||
|
||||
Test process.extractOne(scorer = fuzz.WRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 2.659970s. Average run: 265.997ms.
|
||||
Total time: 0.370528s. Average run: 37.053ms.
|
||||
RapidFuzz is 7.179 times faster than FuzzyWuzzy
|
||||
|
||||
Test process.extractOne(scorer = fuzz.WRatio, score_cutoff=90) for string: "zarakana - cirque du soleil - bellagio"
|
||||
-------------------------------
|
||||
Total time: 2.651922s. Average run: 265.192ms.
|
||||
Total time: 0.014315s. Average run: 1.431ms.
|
||||
RapidFuzz is 185.256 times faster than FuzzyWuzzy
|
||||
|
||||
Real world ratio(): "New York Yankees"
|
||||
-------------------------------
|
||||
Total time: 0.993305s. Average run: 9.933ms.
|
||||
Total time: 0.440019s. Average run: 4.400ms.
|
||||
RapidFuzz is 2.257 times faster than FuzzyWuzzy
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue