release v2.6.0

This commit is contained in:
Max Bachmann 2022-08-20 04:04:18 +02:00
parent 397d7dea7b
commit dcf6746767
13 changed files with 1694 additions and 12 deletions

View File

@ -1,6 +1,6 @@
## Changelog
### [2.6.0] - 2022-08-
### [2.6.0] - 2022-08-20
#### Fixed
- fix hashing for custom classes

View File

@ -37,7 +37,7 @@ else()
add_library(Taskflow::Taskflow ALIAS Taskflow)
endif()
find_package(rapidfuzz 1.1.1 QUIET)
find_package(rapidfuzz 1.2.0 QUIET)
if (rapidfuzz_FOUND)
message("Using system supplied version of rapidfuzz-cpp")
else()

View File

@ -0,0 +1,46 @@
# todo combine benchmarks of scorers into common code base
import timeit
import pandas
def benchmark(name, func, setup, lengths, count):
    """Time ``func`` once per entry in ``lengths`` and return per-call runtimes.

    Args:
        name: Label used in the progress messages printed to stdout.
        func: Statement handed to ``timeit.Timer`` as the code to time.
        setup: Setup-code template; ``{0}`` is replaced by the current length
            and ``{1}`` by ``count`` via ``str.format``.
        lengths: Iterable of values substituted for ``{0}``; one measurement
            is taken per value.
        count: Value substituted for ``{1}``; every measurement is divided
            by it.

    Returns:
        A list with one float per entry of ``lengths``: the fastest of 7
        single executions of ``func``, divided by ``count``.
    """
    # tqdm only draws a progress bar; fall back to plain iteration when it is
    # not installed so the benchmark itself can still run.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        # Keep the fastest of 7 runs (each run executes the statement once).
        results.append(min(test.repeat(repeat=7, number=1)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results
# Setup code that timeit runs before each measurement: ``{0}`` is filled with
# the string length and ``{1}`` with the number of strings in ``b_list``.
setup ="""
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

count = 1000
lengths = list(range(1, 256, 2))

# Both libraries are timed with the same setup code, lengths and count.
time_rapidfuzz = benchmark(
    "rapidfuzz", '[distance(a, b) for b in b_list]', setup, lengths, count
)
time_jellyfish = benchmark(
    "jellyfish", '[damerau_levenshtein_distance(a, b) for b in b_list]', setup, lengths, count
)

# One row per string length, one runtime column per library.
columns = {
    "length": lengths,
    "rapidfuzz": time_rapidfuzz,
    "jellyfish": time_jellyfish,
}
df = pandas.DataFrame(data=columns)
df.to_csv("results/levenshtein_damerau.csv", sep=',', index=False)

View File

@ -0,0 +1,20 @@
import pandas as pd
import matplotlib.pyplot as plt
# Plot the benchmark results stored in results/levenshtein_damerau.csv.
df = pd.read_csv("results/levenshtein_damerau.csv")

# Scale only the runtime columns by 10^6 so they match the microsecond label
# on the y-axis; the "length" column is the x-axis and must stay untouched.
SCALE_TO_MICROSECONDS = 1000 * 1000
runtime_columns = [column for column in df.columns if column != "length"]
df[runtime_columns] = df[runtime_columns] * SCALE_TO_MICROSECONDS

ax = df.plot(x="length")

plt.xticks(list(range(0, 257, 64)))
plt.title("Performance comparison of the \nDamerauLevenshtein similarity in different libraries")
plt.xlabel("string length [in characters]")
plt.ylabel("runtime [μs]")

# Anchor both axes at the origin.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.grid()
plt.show()

129
bench/results/levenshtein_damerau.csv vendored Normal file
View File

@ -0,0 +1,129 @@
length,rapidfuzz,jellyfish
1,1.3186500291340052e-07,2.2917869937373324e-06
3,1.5496299602091313e-07,2.3576690000481903e-06
5,1.9305800378788262e-07,2.6081079995492475e-06
7,2.5326300237793474e-07,2.7545159973669795e-06
9,3.2603699946776033e-07,2.835453997249715e-06
11,4.1582799167372286e-07,3.0755670013604687e-06
13,5.237079894868657e-07,3.348548008943908e-06
15,6.503320037154481e-07,3.5760050086537376e-06
17,8.075779915088788e-07,4.094708987395279e-06
19,9.830609924392775e-07,4.471113003091886e-06
21,1.1693169944919646e-06,4.878691994235851e-06
23,1.3880559999961407e-06,5.320055002812296e-06
25,1.6101539949886501e-06,5.812328003230505e-06
27,1.8436819955240934e-06,6.365544002619572e-06
29,2.1035520039731635e-06,7.1151800075313074e-06
31,2.3763119970681144e-06,7.682306997594423e-06
33,2.6762380002764983e-06,8.129071007715539e-06
35,3.0065760074649004e-06,8.772588000283577e-06
37,3.334196007926948e-06,9.501428008661605e-06
39,3.7296579976100476e-06,1.0552032996201888e-05
41,4.054303994053043e-06,1.1043670005165041e-05
43,4.501789997448213e-06,1.1927387007744983e-05
45,4.832107006222941e-06,1.2792505003744736e-05
47,5.278729004203342e-06,1.3642278994666412e-05
49,5.832850991282612e-06,1.5023024010588416e-05
51,6.299954999121837e-06,1.601715901051648e-05
53,6.834098006947898e-06,1.6562082004384137e-05
55,8.818272995995358e-06,1.7615651988307947e-05
57,8.89724399894476e-06,1.8693470992729998e-05
59,9.524909997708164e-06,1.9757230998948215e-05
61,9.858479999820703e-06,2.0921860006637873e-05
63,1.0892339007114061e-05,2.2059237002395092e-05
65,1.1153511994052679e-05,2.3281863002921454e-05
67,1.1815436999313534e-05,2.4583477003034204e-05
69,1.2513476991443895e-05,2.5820324997766875e-05
71,1.2912972990307025e-05,2.7160885001649148e-05
73,1.3612305003334767e-05,2.8512088989373295e-05
75,1.4474694995442406e-05,2.994079899508506e-05
77,1.5050637011881918e-05,3.1362107998575085e-05
79,1.6278285998851062e-05,3.2830506999744104e-05
81,1.7094506009016185e-05,3.4311275012441914e-05
83,1.7455348002840766e-05,3.5927311007981186e-05
85,1.8258039010106587e-05,3.744289500173181e-05
87,1.910153501376044e-05,3.909152599226218e-05
89,1.9952228001784534e-05,4.0773032989818604e-05
91,2.1386908003478312e-05,4.260684800101444e-05
93,2.1697513002436608e-05,4.424196299805771e-05
95,2.2654768006759697e-05,4.599233198678121e-05
97,2.3587388001033103e-05,4.7841726001934146e-05
99,2.4578259995905682e-05,4.9719257003744136e-05
101,2.555219200439751e-05,5.1562903987360186e-05
103,2.6544245003606194e-05,5.344945400429424e-05
105,2.7561848997720517e-05,5.546425998909399e-05
107,2.8663237986620516e-05,5.749664599716198e-05
109,2.9656843005795963e-05,5.9485555000719614e-05
111,3.071375100989826e-05,6.161657799384556e-05
113,3.182354199816473e-05,6.362505399738438e-05
115,3.29423270013649e-05,6.59617450000951e-05
117,3.406323600211181e-05,6.807827700686175e-05
119,3.523617400787771e-05,7.033873800537548e-05
121,3.641733099357225e-05,7.259436600725166e-05
123,3.760053600126412e-05,7.488666100834962e-05
125,3.8775731009081935e-05,7.721088700054678e-05
127,4.053250300057698e-05,7.95256010023877e-05
129,4.242038099619094e-05,8.207386899448466e-05
131,4.2548033001367e-05,8.45927530026529e-05
133,4.381964699132368e-05,8.702767000067979e-05
135,4.640341601043474e-05,8.967516900156625e-05
137,4.6470957007841206e-05,9.215094700630289e-05
139,4.785054900276009e-05,9.477479199995287e-05
141,4.920196099556051e-05,9.737256199878174e-05
143,5.0588134006829936e-05,0.00010015238399500959
145,5.2004214012413286e-05,0.00010274529000162146
147,5.339522199938074e-05,0.0001055051699950127
149,5.4872838998562654e-05,0.00010825543300597928
151,5.630636200658046e-05,0.00011108420000527986
153,5.778362399723846e-05,0.00011400010000215843
155,5.929304200981278e-05,0.0001169092579948483
157,6.082800000149291e-05,0.00011980927299009637
159,6.234696898900438e-05,0.00012275251699611544
161,6.389497400959953e-05,0.00012580490300024395
163,6.546421999519225e-05,0.00012879740999778731
165,6.706594899878838e-05,0.00013189049999346025
167,6.86899949942017e-05,0.00013508607700350695
169,7.03255730040837e-05,0.0001384543679887429
171,7.194524399528745e-05,0.0001415441660064971
173,7.364011400204617e-05,0.00014470038999570535
175,7.530159399902914e-05,0.00014812490799522493
177,7.703056299942545e-05,0.00015151044201047625
179,7.877000598818994e-05,0.0001550259229989024
181,8.056354499422014e-05,0.00015840976098843384
183,8.230255900707561e-05,0.00016146046300127636
185,8.411060999787878e-05,0.0001648237220069859
187,8.593102199665736e-05,0.00016880192600365263
189,8.775954999146052e-05,0.00017214770999271421
191,8.957593599916436e-05,0.0001756833649997134
193,9.143969698925502e-05,0.00017933695799729322
195,9.335860000282991e-05,0.00018320062900602352
197,9.528399800183252e-05,0.00018721782499051188
199,9.717094000370707e-05,0.00019091358900186605
201,9.915663600258995e-05,0.00019457660699845293
203,0.00010111445700749755,0.00019843029799812938
205,0.0001030436420114711,0.00020250573099474421
207,0.00010511462000431493,0.0002057212560030166
209,0.00010707404400454834,0.00020976610900834202
211,0.00010923221000120975,0.00021393066599557643
213,0.00011123345700616483,0.000218106002008426
215,0.00011331091000465676,0.0002226826880068984
217,0.00011541820199636276,0.00022656015200482217
219,0.00011745439701189752,0.00023122245600097814
221,0.00011965750799572561,0.00023512257800030055
223,0.0001217683919967385,0.00023922805799520575
225,0.00012395016000664327,0.00024466573499375957
227,0.00012613116498687304,0.00024804236000636594
229,0.00012834949900570792,0.000252487404999556
231,0.00013057007901079486,0.0002572519809909863
233,0.00013282599799276795,0.0002619540010055061
235,0.00013514574500732125,0.00026779574999818577
237,0.00013738959899637848,0.0002701820999936899
239,0.00013969385100062937,0.00027524539300065955
241,0.00014208142399729695,0.0002794181229983224
243,0.00014448049099883065,0.00028530672899796625
245,0.00014690193100250326,0.00028930913399381096
247,0.0001492562710045604,0.00029400941300264095
249,0.0001517066380038159,0.00029908111700206063
251,0.00015404325399140362,0.0003059594859951176
253,0.0001565057249972597,0.0003120929349970538
255,0.00015890150600171183,0.00031580220701289365
1 length rapidfuzz jellyfish
2 1 1.3186500291340052e-07 2.2917869937373324e-06
3 3 1.5496299602091313e-07 2.3576690000481903e-06
4 5 1.9305800378788262e-07 2.6081079995492475e-06
5 7 2.5326300237793474e-07 2.7545159973669795e-06
6 9 3.2603699946776033e-07 2.835453997249715e-06
7 11 4.1582799167372286e-07 3.0755670013604687e-06
8 13 5.237079894868657e-07 3.348548008943908e-06
9 15 6.503320037154481e-07 3.5760050086537376e-06
10 17 8.075779915088788e-07 4.094708987395279e-06
11 19 9.830609924392775e-07 4.471113003091886e-06
12 21 1.1693169944919646e-06 4.878691994235851e-06
13 23 1.3880559999961407e-06 5.320055002812296e-06
14 25 1.6101539949886501e-06 5.812328003230505e-06
15 27 1.8436819955240934e-06 6.365544002619572e-06
16 29 2.1035520039731635e-06 7.1151800075313074e-06
17 31 2.3763119970681144e-06 7.682306997594423e-06
18 33 2.6762380002764983e-06 8.129071007715539e-06
19 35 3.0065760074649004e-06 8.772588000283577e-06
20 37 3.334196007926948e-06 9.501428008661605e-06
21 39 3.7296579976100476e-06 1.0552032996201888e-05
22 41 4.054303994053043e-06 1.1043670005165041e-05
23 43 4.501789997448213e-06 1.1927387007744983e-05
24 45 4.832107006222941e-06 1.2792505003744736e-05
25 47 5.278729004203342e-06 1.3642278994666412e-05
26 49 5.832850991282612e-06 1.5023024010588416e-05
27 51 6.299954999121837e-06 1.601715901051648e-05
28 53 6.834098006947898e-06 1.6562082004384137e-05
29 55 8.818272995995358e-06 1.7615651988307947e-05
30 57 8.89724399894476e-06 1.8693470992729998e-05
31 59 9.524909997708164e-06 1.9757230998948215e-05
32 61 9.858479999820703e-06 2.0921860006637873e-05
33 63 1.0892339007114061e-05 2.2059237002395092e-05
34 65 1.1153511994052679e-05 2.3281863002921454e-05
35 67 1.1815436999313534e-05 2.4583477003034204e-05
36 69 1.2513476991443895e-05 2.5820324997766875e-05
37 71 1.2912972990307025e-05 2.7160885001649148e-05
38 73 1.3612305003334767e-05 2.8512088989373295e-05
39 75 1.4474694995442406e-05 2.994079899508506e-05
40 77 1.5050637011881918e-05 3.1362107998575085e-05
41 79 1.6278285998851062e-05 3.2830506999744104e-05
42 81 1.7094506009016185e-05 3.4311275012441914e-05
43 83 1.7455348002840766e-05 3.5927311007981186e-05
44 85 1.8258039010106587e-05 3.744289500173181e-05
45 87 1.910153501376044e-05 3.909152599226218e-05
46 89 1.9952228001784534e-05 4.0773032989818604e-05
47 91 2.1386908003478312e-05 4.260684800101444e-05
48 93 2.1697513002436608e-05 4.424196299805771e-05
49 95 2.2654768006759697e-05 4.599233198678121e-05
50 97 2.3587388001033103e-05 4.7841726001934146e-05
51 99 2.4578259995905682e-05 4.9719257003744136e-05
52 101 2.555219200439751e-05 5.1562903987360186e-05
53 103 2.6544245003606194e-05 5.344945400429424e-05
54 105 2.7561848997720517e-05 5.546425998909399e-05
55 107 2.8663237986620516e-05 5.749664599716198e-05
56 109 2.9656843005795963e-05 5.9485555000719614e-05
57 111 3.071375100989826e-05 6.161657799384556e-05
58 113 3.182354199816473e-05 6.362505399738438e-05
59 115 3.29423270013649e-05 6.59617450000951e-05
60 117 3.406323600211181e-05 6.807827700686175e-05
61 119 3.523617400787771e-05 7.033873800537548e-05
62 121 3.641733099357225e-05 7.259436600725166e-05
63 123 3.760053600126412e-05 7.488666100834962e-05
64 125 3.8775731009081935e-05 7.721088700054678e-05
65 127 4.053250300057698e-05 7.95256010023877e-05
66 129 4.242038099619094e-05 8.207386899448466e-05
67 131 4.2548033001367e-05 8.45927530026529e-05
68 133 4.381964699132368e-05 8.702767000067979e-05
69 135 4.640341601043474e-05 8.967516900156625e-05
70 137 4.6470957007841206e-05 9.215094700630289e-05
71 139 4.785054900276009e-05 9.477479199995287e-05
72 141 4.920196099556051e-05 9.737256199878174e-05
73 143 5.0588134006829936e-05 0.00010015238399500959
74 145 5.2004214012413286e-05 0.00010274529000162146
75 147 5.339522199938074e-05 0.0001055051699950127
76 149 5.4872838998562654e-05 0.00010825543300597928
77 151 5.630636200658046e-05 0.00011108420000527986
78 153 5.778362399723846e-05 0.00011400010000215843
79 155 5.929304200981278e-05 0.0001169092579948483
80 157 6.082800000149291e-05 0.00011980927299009637
81 159 6.234696898900438e-05 0.00012275251699611544
82 161 6.389497400959953e-05 0.00012580490300024395
83 163 6.546421999519225e-05 0.00012879740999778731
84 165 6.706594899878838e-05 0.00013189049999346025
85 167 6.86899949942017e-05 0.00013508607700350695
86 169 7.03255730040837e-05 0.0001384543679887429
87 171 7.194524399528745e-05 0.0001415441660064971
88 173 7.364011400204617e-05 0.00014470038999570535
89 175 7.530159399902914e-05 0.00014812490799522493
90 177 7.703056299942545e-05 0.00015151044201047625
91 179 7.877000598818994e-05 0.0001550259229989024
92 181 8.056354499422014e-05 0.00015840976098843384
93 183 8.230255900707561e-05 0.00016146046300127636
94 185 8.411060999787878e-05 0.0001648237220069859
95 187 8.593102199665736e-05 0.00016880192600365263
96 189 8.775954999146052e-05 0.00017214770999271421
97 191 8.957593599916436e-05 0.0001756833649997134
98 193 9.143969698925502e-05 0.00017933695799729322
99 195 9.335860000282991e-05 0.00018320062900602352
100 197 9.528399800183252e-05 0.00018721782499051188
101 199 9.717094000370707e-05 0.00019091358900186605
102 201 9.915663600258995e-05 0.00019457660699845293
103 203 0.00010111445700749755 0.00019843029799812938
104 205 0.0001030436420114711 0.00020250573099474421
105 207 0.00010511462000431493 0.0002057212560030166
106 209 0.00010707404400454834 0.00020976610900834202
107 211 0.00010923221000120975 0.00021393066599557643
108 213 0.00011123345700616483 0.000218106002008426
109 215 0.00011331091000465676 0.0002226826880068984
110 217 0.00011541820199636276 0.00022656015200482217
111 219 0.00011745439701189752 0.00023122245600097814
112 221 0.00011965750799572561 0.00023512257800030055
113 223 0.0001217683919967385 0.00023922805799520575
114 225 0.00012395016000664327 0.00024466573499375957
115 227 0.00012613116498687304 0.00024804236000636594
116 229 0.00012834949900570792 0.000252487404999556
117 231 0.00013057007901079486 0.0002572519809909863
118 233 0.00013282599799276795 0.0002619540010055061
119 235 0.00013514574500732125 0.00026779574999818577
120 237 0.00013738959899637848 0.0002701820999936899
121 239 0.00013969385100062937 0.00027524539300065955
122 241 0.00014208142399729695 0.0002794181229983224
123 243 0.00014448049099883065 0.00028530672899796625
124 245 0.00014690193100250326 0.00028930913399381096
125 247 0.0001492562710045604 0.00029400941300264095
126 249 0.0001517066380038159 0.00029908111700206063
127 251 0.00015404325399140362 0.0003059594859951176
128 253 0.0001565057249972597 0.0003120929349970538
129 255 0.00015890150600171183 0.00031580220701289365

View File

@ -0,0 +1,31 @@
Damerau Levenshtein
-------------------
Functions
^^^^^^^^^
distance
~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.distance
normalized_distance
~~~~~~~~~~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_distance
similarity
~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.similarity
normalized_similarity
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_similarity
Performance
^^^^^^^^^^^
The following image shows a benchmark of the Damerau Levenshtein distance in
RapidFuzz and jellyfish. Both have a time complexity of ``O(NM)``. However, RapidFuzz
only requires ``O(N + M)`` memory, while the implementation in jellyfish
has a memory usage of ``O(NM)``.
.. image:: img/damerau_levenshtein.svg
:align: center

View File

@ -30,12 +30,6 @@ opcodes
Performance
^^^^^^^^^^^
Since the Levenshtein module uses different implementations based on the weights
used, this leads to different performance characteristics. The following sections
show the performance for the different possible weights.
Indel
~~~~~
The following image shows a benchmark of the Indel distance in RapidFuzz
and python-Levenshtein. Similar to the normal Levenshtein distance
python-Levenshtein uses an implementation with a time complexity of ``O(NM)``,

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -25,6 +25,7 @@ Opcodes
:maxdepth: 1
Levenshtein
DamerauLevenshtein
Indel
Hamming
Jaro

View File

@ -22,7 +22,7 @@ copyright = '2021, Max Bachmann'
author = 'Max Bachmann'
# The full version, including alpha/beta/rc tags
release = '2.5.0'
release = '2.6.0'
# -- General configuration ---------------------------------------------------

@ -1 +1 @@
Subproject commit 4fba002adcd18b8847d2ff8ab96add16959ec004
Subproject commit f860f64052db553351613ac57bc12a843675e660

View File

@ -11,7 +11,7 @@ with open('README.md', 'rt', encoding="utf8") as f:
setup_args = {
"name": "rapidfuzz",
"version": "2.5.0",
"version": "2.6.0",
"install_requires": ["jarowinkler >= 1.2.0, < 2.0.0"],
"extras_require": {'full': ['numpy']},
"url": "https://github.com/maxbachmann/RapidFuzz",

View File

@ -3,6 +3,6 @@ rapid string matching library
"""
__author__: str = "Max Bachmann"
__license__: str = "MIT"
__version__: str = "2.5.0"
__version__: str = "2.6.0"
from rapidfuzz import process, distance, fuzz, string_metric, utils