add benchmark
This commit is contained in:
parent
5b11f9ca26
commit
54609c7508
Binary file not shown.
After Width: | Height: | Size: 28 KiB |
Binary file not shown.
After Width: | Height: | Size: 28 KiB |
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
|
@ -1 +1,18 @@
|
||||||
Will be added soon
|
# Benchmarks
|
||||||
|
|
||||||
|
To compare the speed of FuzzyWuzzy and RapidFuzz, the benchmark from FuzzyWuzzy is used.
|
||||||
|
The benchmark is therefore always executed for both FuzzyWuzzy and RapidFuzz.
|
||||||
|
Afterwards, the ratio between the two runtimes is calculated. The benchmark can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/python/bench). The results of the benchmarks are visualised below.
|
||||||
|
|
||||||
|
## fuzz.ratio
|
||||||
|
|
||||||
|
![](.github/fuzz.ratio.png)
|
||||||
|
|
||||||
|
|
||||||
|
## fuzz.partial_ratio
|
||||||
|
|
||||||
|
![](.github/fuzz.partial_ratio.png)
|
||||||
|
|
||||||
|
## fuzz.WRatio
|
||||||
|
|
||||||
|
![](.github/fuzz.WRatio.png)
|
|
@ -34,7 +34,7 @@
|
||||||
## Description
|
## Description
|
||||||
RapidFuzz is a fast string matching library for Python and C++, which is using the string similarity calculations from [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy). However there are two aspects that set RapidFuzz apart from FuzzyWuzzy:
|
RapidFuzz is a fast string matching library for Python and C++, which is using the string similarity calculations from [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy). However there are two aspects that set RapidFuzz apart from FuzzyWuzzy:
|
||||||
1) It is MIT licensed so it can be used whichever License you might want to choose for your project, while you're forced to adopt the GPLv2 license when using FuzzyWuzzy
|
1) It is MIT licensed so it can be used whichever License you might want to choose for your project, while you're forced to adopt the GPLv2 license when using FuzzyWuzzy
|
||||||
2) It is mostly written in C++ and on top of this comes with a lot of Algorithmic improvements to make string matching even faster, while still providing the same results. These changes result in a 5-100x Speedup in String Matching. More details on benchmark results can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/Benchmarks.md)
|
2) It is mostly written in C++ and on top of this comes with a lot of Algorithmic improvements to make string matching even faster, while still providing the same results. These changes result in a 2-100x Speedup in String Matching. More details on benchmark results can be found [here](https://github.com/rhasspy/rapidfuzz/blob/master/Benchmarks.md)
|
||||||
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
|
@ -0,0 +1,137 @@
|
||||||
|
from timeit import timeit
|
||||||
|
import math
|
||||||
|
import csv
|
||||||
|
|
||||||
|
# Base repetition count; the scorer benchmarks below run iterations / 100
# repetitions per measurement.
iterations = 100000

# Load the 'custom_title' column of the pipe-delimited titledata.csv.
# The file is opened in a `with` block so the handle is closed once the rows
# have been read, and with newline='' as the csv module requires so that
# line endings inside quoted fields are handled by the csv parser itself.
with open('titledata.csv', newline='') as title_file:
    reader = csv.DictReader(title_file, delimiter='|')
    titles = [i['custom_title'] for i in reader]

# All titles joined into one newline-separated string; the real-world
# benchmark embeds this blob into its timed statement.
title_blob = '\n'.join(titles)
|
||||||
|
|
||||||
|
|
||||||
|
# Strings that the scorer benchmarks compare against the fixed query
# 'cirque du soleil'.
cirque_strings = [
    "cirque du soleil - zarkana - las vegas",
    "cirque du soleil ",
    "cirque du soleil",
    "cirque du soleil las vegas",
    "zarkana las vegas",
    "las vegas cirque du soleil at the bellagio",
    "zarakana - cirque du soleil - bellagio",
]

# Small choice list containing empty strings and None entries.
# NOTE(review): nothing in the visible part of this script reads it; the
# process.* benchmarks build their own `choices` inside the timeit setup.
choices = [
    "",
    "new york yankees vs boston red sox",
    "",
    "zarakana - cirque du soleil - bellagio",
    None,
    "cirque du soleil las vegas",
    None,
]

# ASCII and non-ASCII sample strings.
# NOTE(review): not referenced by the visible part of this script.
mixed_strings = [
    "Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
    "C\\'est la vie",
    u"Ça va?",
    u"Cães danados",
    u"\xacCamarões assados",
    u"a\xac\u1234\u20ac\U00008000",
]

# timeit setup template; the '{}' placeholder is filled with the name of the
# package to import ("fuzzywuzzy" or "rapidfuzz") via str.format.
common_setup = "from {} import fuzz, process; "
|
||||||
|
|
||||||
|
|
||||||
|
def _format_timing(total, runs):
    """Return the 'Total time ... Average run ...' report line for one library."""
    units = ["s", "ms", "us", "ns"]
    average = total / float(runs)
    if average > 0:
        # Number of factor-1000 steps below one second (2 -> microseconds).
        exponent = -int(math.floor(math.log(average, 1000)))
    else:
        # log(0) is undefined; a zero measurement is reported in seconds.
        exponent = 0
    # Clamp to the known units: averages of 1000s or more stay in seconds,
    # averages below one nanosecond stay in nanoseconds.
    exponent = max(0, min(exponent, len(units) - 1))
    return "Total time: %fs. Average run: %.3f%s." % (
        total, average * (1000 ** exponent), units[exponent])


def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
    """
    Time one statement with FuzzyWuzzy and with RapidFuzz and print a report.

    stmt   -- source code of the statement to time.
    setup  -- timeit setup code; it is passed through str.format with the
              package name ("fuzzywuzzy", then "rapidfuzz"), so it should
              contain a '{}' placeholder where the package is imported.
    number -- how often stmt is executed per library; truncated to an int.
              Must be at least 1.

    Prints the total and average duration for each library, then the ratio
    duration_fuzzywuzzy / duration_rapidfuzz, then an empty line.
    Returns None.
    """
    # Use the truncated count for both the run and the average so the two
    # always agree, even when a float such as iterations / 100 is passed.
    runs = int(number)

    durations = {}
    for library in ("fuzzywuzzy", "rapidfuzz"):
        durations[library] = timeit(stmt, setup.format(library), number=runs)
        print(_format_timing(durations[library], runs))

    if durations["rapidfuzz"] > 0:
        relative_duration = durations["fuzzywuzzy"] / durations["rapidfuzz"]
    else:
        # A zero measurement would otherwise raise ZeroDivisionError.
        relative_duration = float("inf")
    print("RapidFuzz is %.3f times faster than FuzzyWuzzy" % relative_duration)

    print()
|
||||||
|
|
||||||
|
|
||||||
|
# Benchmark the core matching methods: every scorer compares the fixed query
# 'cirque du soleil' against each entry of cirque_strings.
# NOTE: the inner loop variable stays named `s` because statements further
# down this script still read it after the loops have finished.
for scorer in ('ratio', 'partial_ratio', 'WRatio'):
    for s in cirque_strings:
        print('Test fuzz.%s for string: "%s"' % (scorer, s))
        print('-------------------------------')
        # %r renders each string as a properly escaped literal, so a test
        # string containing quotes or backslashes cannot break the statement.
        # Floor division keeps the repetition count an integer.
        print_result_from_timeit(
            'fuzz.%s(%r, %r)' % (scorer, u'cirque du soleil', s),
            common_setup, number=iterations // 100)
|
||||||
|
|
||||||
|
# Benchmark the process.* helpers. The query is fixed; the labels print this
# query rather than a leftover loop variable, so the report names the string
# that is actually being matched.
query = u'cirque du soleil'

# Setup shared by all process benchmarks: 5000 random 30-character strings of
# uppercase letters and digits. random.seed(18) is executed in every setup,
# so both libraries are timed against the same list of choices.
process_setup = (
    common_setup + " import string,random; random.seed(18);"
    " choices = [''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(30)) for s in range(5000)]"
)

# (label shown in the report, extra keyword arguments of the timed call)
process_benchmarks = [
    ('process.extract(scorer = fuzz.QRatio)',
     'process.extract(%r, choices, scorer = fuzz.QRatio)'),
    ('process.extract(scorer = fuzz.WRatio)',
     'process.extract(%r, choices, scorer = fuzz.WRatio)'),
    ('process.extractOne(scorer = fuzz.WRatio)',
     'process.extractOne(%r, choices, scorer = fuzz.WRatio)'),
    ('process.extractOne(scorer = fuzz.WRatio, score_cutoff=90)',
     'process.extractOne(%r, choices, scorer = fuzz.WRatio, score_cutoff=90)'),
]

for label, call_template in process_benchmarks:
    print('Test %s for string: "%s"' % (label, query))
    print('-------------------------------')
    print_result_from_timeit(call_template % query, process_setup, number=10)
|
||||||
|
|
||||||
|
|
||||||
|
# Real-world scenario: sort every title loaded from titledata.csv by its
# fuzz.ratio similarity to a query string.

s = 'New York Yankees'

# Everything appended to `test` is part of the timed statement, so each run
# re-splits the blob into a fresh list before sorting it.
test = 'import functools\n'
# %r embeds the blob as an escaped string literal; wrapping the raw text in
# triple quotes would break on titles containing quotes or backslashes.
test += 'title_blob = %r\n' % title_blob
test += 'title_blob = title_blob.strip()\n'
test += 'titles = title_blob.split("\\n")\n'

print('Real world ratio(): "%s"' % s)
print('-------------------------------')
test += 'prepared_ratio = functools.partial(fuzz.ratio, %r)\n' % s
test += 'titles.sort(key=prepared_ratio)\n'
print_result_from_timeit(test,
                         common_setup,
                         number=100)
|
|
@ -0,0 +1,156 @@
|
||||||
|
Test fuzz.ratio for string: "cirque du soleil - zarkana - las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002205s. Average run: 2.205us.
|
||||||
|
Total time: 0.000522s. Average run: 522.423ns.
|
||||||
|
RapidFuzz is 4.221 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "cirque du soleil "
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002163s. Average run: 2.163us.
|
||||||
|
Total time: 0.000516s. Average run: 515.595ns.
|
||||||
|
RapidFuzz is 4.194 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "cirque du soleil"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.000332s. Average run: 331.925ns.
|
||||||
|
Total time: 0.000452s. Average run: 451.798ns.
|
||||||
|
RapidFuzz is 0.735 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "cirque du soleil las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002166s. Average run: 2.166us.
|
||||||
|
Total time: 0.000481s. Average run: 481.355ns.
|
||||||
|
RapidFuzz is 4.500 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "zarkana las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002404s. Average run: 2.404us.
|
||||||
|
Total time: 0.000701s. Average run: 701.098ns.
|
||||||
|
RapidFuzz is 3.428 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "las vegas cirque du soleil at the bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002753s. Average run: 2.753us.
|
||||||
|
Total time: 0.001151s. Average run: 1.151us.
|
||||||
|
RapidFuzz is 2.393 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.ratio for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.002731s. Average run: 2.731us.
|
||||||
|
Total time: 0.001092s. Average run: 1.092us.
|
||||||
|
RapidFuzz is 2.500 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "cirque du soleil - zarkana - las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.004042s. Average run: 4.042us.
|
||||||
|
Total time: 0.000834s. Average run: 833.545ns.
|
||||||
|
RapidFuzz is 4.850 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "cirque du soleil "
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.003888s. Average run: 3.888us.
|
||||||
|
Total time: 0.000595s. Average run: 594.831ns.
|
||||||
|
RapidFuzz is 6.536 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "cirque du soleil"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.000336s. Average run: 335.991ns.
|
||||||
|
Total time: 0.000587s. Average run: 586.752ns.
|
||||||
|
RapidFuzz is 0.573 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "cirque du soleil las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.003847s. Average run: 3.847us.
|
||||||
|
Total time: 0.000746s. Average run: 746.272ns.
|
||||||
|
RapidFuzz is 5.155 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "zarkana las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.010170s. Average run: 10.170us.
|
||||||
|
Total time: 0.002544s. Average run: 2.544us.
|
||||||
|
RapidFuzz is 3.998 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "las vegas cirque du soleil at the bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.005372s. Average run: 5.372us.
|
||||||
|
Total time: 0.002900s. Average run: 2.900us.
|
||||||
|
RapidFuzz is 1.852 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.partial_ratio for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.005208s. Average run: 5.208us.
|
||||||
|
Total time: 0.002584s. Average run: 2.584us.
|
||||||
|
RapidFuzz is 2.016 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "cirque du soleil - zarkana - las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.040031s. Average run: 40.031us.
|
||||||
|
Total time: 0.001491s. Average run: 1.491us.
|
||||||
|
RapidFuzz is 26.851 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "cirque du soleil "
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.008991s. Average run: 8.991us.
|
||||||
|
Total time: 0.000563s. Average run: 562.574ns.
|
||||||
|
RapidFuzz is 15.981 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "cirque du soleil"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.008759s. Average run: 8.759us.
|
||||||
|
Total time: 0.000503s. Average run: 502.509ns.
|
||||||
|
RapidFuzz is 17.430 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "cirque du soleil las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.038304s. Average run: 38.304us.
|
||||||
|
Total time: 0.001285s. Average run: 1.285us.
|
||||||
|
RapidFuzz is 29.812 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "zarkana las vegas"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.021282s. Average run: 21.282us.
|
||||||
|
Total time: 0.001889s. Average run: 1.889us.
|
||||||
|
RapidFuzz is 11.264 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "las vegas cirque du soleil at the bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.043843s. Average run: 43.843us.
|
||||||
|
Total time: 0.004232s. Average run: 4.232us.
|
||||||
|
RapidFuzz is 10.359 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test fuzz.WRatio for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.038362s. Average run: 38.362us.
|
||||||
|
Total time: 0.003854s. Average run: 3.854us.
|
||||||
|
RapidFuzz is 9.953 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test process.extract(scorer = fuzz.QRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.373829s. Average run: 37.383ms.
|
||||||
|
Total time: 0.069651s. Average run: 6.965ms.
|
||||||
|
RapidFuzz is 5.367 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test process.extract(scorer = fuzz.WRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 2.659891s. Average run: 265.989ms.
|
||||||
|
Total time: 0.262201s. Average run: 26.220ms.
|
||||||
|
RapidFuzz is 10.144 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test process.extractOne(scorer = fuzz.WRatio) for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 2.659970s. Average run: 265.997ms.
|
||||||
|
Total time: 0.370528s. Average run: 37.053ms.
|
||||||
|
RapidFuzz is 7.179 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Test process.extractOne(scorer = fuzz.WRatio, score_cutoff=90) for string: "zarakana - cirque du soleil - bellagio"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 2.651922s. Average run: 265.192ms.
|
||||||
|
Total time: 0.014315s. Average run: 1.431ms.
|
||||||
|
RapidFuzz is 185.256 times faster than FuzzyWuzzy
|
||||||
|
|
||||||
|
Real world ratio(): "New York Yankees"
|
||||||
|
-------------------------------
|
||||||
|
Total time: 0.993305s. Average run: 9.933ms.
|
||||||
|
Total time: 0.440019s. Average run: 4.400ms.
|
||||||
|
RapidFuzz is 2.257 times faster than FuzzyWuzzy
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue