release v2.6.0
This commit is contained in:
parent
397d7dea7b
commit
dcf6746767
|
@ -1,6 +1,6 @@
|
|||
## Changelog
|
||||
|
||||
### [2.6.0] - 2022-08-
|
||||
### [2.6.0] - 2022-08-20
|
||||
#### Fixed
|
||||
- fix hashing for custom classes
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ else()
|
|||
add_library(Taskflow::Taskflow ALIAS Taskflow)
|
||||
endif()
|
||||
|
||||
find_package(rapidfuzz 1.1.1 QUIET)
|
||||
find_package(rapidfuzz 1.2.0 QUIET)
|
||||
if (rapidfuzz_FOUND)
|
||||
message("Using system supplied version of rapidfuzz-cpp")
|
||||
else()
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
# todo combine benchmarks of scorers into common code base
|
||||
import timeit
|
||||
import pandas
|
||||
|
||||
def benchmark(name, func, setup, lengths, count):
    """Time ``func`` for every input length and return per-call runtimes.

    Args:
        name: Label used in the progress messages printed to stdout.
        func: Statement to time; handed to ``timeit.Timer`` unchanged.
        setup: Setup code template. ``{0}`` is replaced with the current
            length and ``{1}`` with ``count`` via ``str.format``.
        lengths: Iterable of input lengths to benchmark.
        count: Divisor applied to every measurement, so that one execution
            of ``func`` which performs ``count`` calls yields a per-call time.

    Returns:
        A list with one float per entry in ``lengths``: the fastest of seven
        single executions of ``func``, divided by ``count``.
    """
    try:
        # tqdm only draws a progress bar; the benchmark must still run
        # when it is not installed.
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        timer = timeit.Timer(func, setup=setup.format(length, count))
        # The timeit documentation recommends the minimum of repeated runs
        # as the most representative value; higher values are interference.
        results.append(min(timer.repeat(repeat=7, number=1)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results
|
||||
|
||||
# Setup code executed by timeit before each measurement. ``{0}`` receives the
# string length and ``{1}`` the number of comparison strings (filled in by
# ``benchmark`` through ``str.format``). The fixed seed makes the generated
# inputs reproducible between runs.
setup = """
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

# Odd string lengths 1, 3, ..., 255; every length is compared against
# ``count`` random strings.
lengths = [*range(1, 256, 2)]
count = 1000

# Run both libraries against the same setup, rapidfuzz first.
time_rapidfuzz, time_jellyfish = (
    benchmark(label, statement, setup, lengths, count)
    for label, statement in (
        ("rapidfuzz", "[distance(a, b) for b in b_list]"),
        ("jellyfish", "[damerau_levenshtein_distance(a, b) for b in b_list]"),
    )
)

# One row per length, one column per library.
columns = {
    "length": lengths,
    "rapidfuzz": time_rapidfuzz,
    "jellyfish": time_jellyfish,
}
df = pandas.DataFrame(data=columns)

df.to_csv("results/levenshtein_damerau.csv", sep=",", index=False)
|
|
@ -0,0 +1,20 @@
|
|||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Load the measurements written by the Damerau-Levenshtein benchmark script.
df = pd.read_csv("results/levenshtein_damerau.csv")

# Scale only the timing columns by 10^6 (the y axis is labelled in μs);
# "length" is the x axis and is left untouched instead of being scaled and
# then divided back.
timing_columns = df.columns.drop("length")
df[timing_columns] = df[timing_columns] * (1000 * 1000)

ax = df.plot(x="length")

plt.xticks(list(range(0, 257, 64)))

# The benchmark times the ``distance`` function, so the title names it.
plt.title("Performance comparison of the \nDamerauLevenshtein distance in different libraries")
plt.xlabel("string length [in characters]")
plt.ylabel("runtime [μs]")
# Anchor both axes at the origin.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.grid()
plt.show()
|
|
@ -0,0 +1,129 @@
|
|||
length,rapidfuzz,jellyfish
|
||||
1,1.3186500291340052e-07,2.2917869937373324e-06
|
||||
3,1.5496299602091313e-07,2.3576690000481903e-06
|
||||
5,1.9305800378788262e-07,2.6081079995492475e-06
|
||||
7,2.5326300237793474e-07,2.7545159973669795e-06
|
||||
9,3.2603699946776033e-07,2.835453997249715e-06
|
||||
11,4.1582799167372286e-07,3.0755670013604687e-06
|
||||
13,5.237079894868657e-07,3.348548008943908e-06
|
||||
15,6.503320037154481e-07,3.5760050086537376e-06
|
||||
17,8.075779915088788e-07,4.094708987395279e-06
|
||||
19,9.830609924392775e-07,4.471113003091886e-06
|
||||
21,1.1693169944919646e-06,4.878691994235851e-06
|
||||
23,1.3880559999961407e-06,5.320055002812296e-06
|
||||
25,1.6101539949886501e-06,5.812328003230505e-06
|
||||
27,1.8436819955240934e-06,6.365544002619572e-06
|
||||
29,2.1035520039731635e-06,7.1151800075313074e-06
|
||||
31,2.3763119970681144e-06,7.682306997594423e-06
|
||||
33,2.6762380002764983e-06,8.129071007715539e-06
|
||||
35,3.0065760074649004e-06,8.772588000283577e-06
|
||||
37,3.334196007926948e-06,9.501428008661605e-06
|
||||
39,3.7296579976100476e-06,1.0552032996201888e-05
|
||||
41,4.054303994053043e-06,1.1043670005165041e-05
|
||||
43,4.501789997448213e-06,1.1927387007744983e-05
|
||||
45,4.832107006222941e-06,1.2792505003744736e-05
|
||||
47,5.278729004203342e-06,1.3642278994666412e-05
|
||||
49,5.832850991282612e-06,1.5023024010588416e-05
|
||||
51,6.299954999121837e-06,1.601715901051648e-05
|
||||
53,6.834098006947898e-06,1.6562082004384137e-05
|
||||
55,8.818272995995358e-06,1.7615651988307947e-05
|
||||
57,8.89724399894476e-06,1.8693470992729998e-05
|
||||
59,9.524909997708164e-06,1.9757230998948215e-05
|
||||
61,9.858479999820703e-06,2.0921860006637873e-05
|
||||
63,1.0892339007114061e-05,2.2059237002395092e-05
|
||||
65,1.1153511994052679e-05,2.3281863002921454e-05
|
||||
67,1.1815436999313534e-05,2.4583477003034204e-05
|
||||
69,1.2513476991443895e-05,2.5820324997766875e-05
|
||||
71,1.2912972990307025e-05,2.7160885001649148e-05
|
||||
73,1.3612305003334767e-05,2.8512088989373295e-05
|
||||
75,1.4474694995442406e-05,2.994079899508506e-05
|
||||
77,1.5050637011881918e-05,3.1362107998575085e-05
|
||||
79,1.6278285998851062e-05,3.2830506999744104e-05
|
||||
81,1.7094506009016185e-05,3.4311275012441914e-05
|
||||
83,1.7455348002840766e-05,3.5927311007981186e-05
|
||||
85,1.8258039010106587e-05,3.744289500173181e-05
|
||||
87,1.910153501376044e-05,3.909152599226218e-05
|
||||
89,1.9952228001784534e-05,4.0773032989818604e-05
|
||||
91,2.1386908003478312e-05,4.260684800101444e-05
|
||||
93,2.1697513002436608e-05,4.424196299805771e-05
|
||||
95,2.2654768006759697e-05,4.599233198678121e-05
|
||||
97,2.3587388001033103e-05,4.7841726001934146e-05
|
||||
99,2.4578259995905682e-05,4.9719257003744136e-05
|
||||
101,2.555219200439751e-05,5.1562903987360186e-05
|
||||
103,2.6544245003606194e-05,5.344945400429424e-05
|
||||
105,2.7561848997720517e-05,5.546425998909399e-05
|
||||
107,2.8663237986620516e-05,5.749664599716198e-05
|
||||
109,2.9656843005795963e-05,5.9485555000719614e-05
|
||||
111,3.071375100989826e-05,6.161657799384556e-05
|
||||
113,3.182354199816473e-05,6.362505399738438e-05
|
||||
115,3.29423270013649e-05,6.59617450000951e-05
|
||||
117,3.406323600211181e-05,6.807827700686175e-05
|
||||
119,3.523617400787771e-05,7.033873800537548e-05
|
||||
121,3.641733099357225e-05,7.259436600725166e-05
|
||||
123,3.760053600126412e-05,7.488666100834962e-05
|
||||
125,3.8775731009081935e-05,7.721088700054678e-05
|
||||
127,4.053250300057698e-05,7.95256010023877e-05
|
||||
129,4.242038099619094e-05,8.207386899448466e-05
|
||||
131,4.2548033001367e-05,8.45927530026529e-05
|
||||
133,4.381964699132368e-05,8.702767000067979e-05
|
||||
135,4.640341601043474e-05,8.967516900156625e-05
|
||||
137,4.6470957007841206e-05,9.215094700630289e-05
|
||||
139,4.785054900276009e-05,9.477479199995287e-05
|
||||
141,4.920196099556051e-05,9.737256199878174e-05
|
||||
143,5.0588134006829936e-05,0.00010015238399500959
|
||||
145,5.2004214012413286e-05,0.00010274529000162146
|
||||
147,5.339522199938074e-05,0.0001055051699950127
|
||||
149,5.4872838998562654e-05,0.00010825543300597928
|
||||
151,5.630636200658046e-05,0.00011108420000527986
|
||||
153,5.778362399723846e-05,0.00011400010000215843
|
||||
155,5.929304200981278e-05,0.0001169092579948483
|
||||
157,6.082800000149291e-05,0.00011980927299009637
|
||||
159,6.234696898900438e-05,0.00012275251699611544
|
||||
161,6.389497400959953e-05,0.00012580490300024395
|
||||
163,6.546421999519225e-05,0.00012879740999778731
|
||||
165,6.706594899878838e-05,0.00013189049999346025
|
||||
167,6.86899949942017e-05,0.00013508607700350695
|
||||
169,7.03255730040837e-05,0.0001384543679887429
|
||||
171,7.194524399528745e-05,0.0001415441660064971
|
||||
173,7.364011400204617e-05,0.00014470038999570535
|
||||
175,7.530159399902914e-05,0.00014812490799522493
|
||||
177,7.703056299942545e-05,0.00015151044201047625
|
||||
179,7.877000598818994e-05,0.0001550259229989024
|
||||
181,8.056354499422014e-05,0.00015840976098843384
|
||||
183,8.230255900707561e-05,0.00016146046300127636
|
||||
185,8.411060999787878e-05,0.0001648237220069859
|
||||
187,8.593102199665736e-05,0.00016880192600365263
|
||||
189,8.775954999146052e-05,0.00017214770999271421
|
||||
191,8.957593599916436e-05,0.0001756833649997134
|
||||
193,9.143969698925502e-05,0.00017933695799729322
|
||||
195,9.335860000282991e-05,0.00018320062900602352
|
||||
197,9.528399800183252e-05,0.00018721782499051188
|
||||
199,9.717094000370707e-05,0.00019091358900186605
|
||||
201,9.915663600258995e-05,0.00019457660699845293
|
||||
203,0.00010111445700749755,0.00019843029799812938
|
||||
205,0.0001030436420114711,0.00020250573099474421
|
||||
207,0.00010511462000431493,0.0002057212560030166
|
||||
209,0.00010707404400454834,0.00020976610900834202
|
||||
211,0.00010923221000120975,0.00021393066599557643
|
||||
213,0.00011123345700616483,0.000218106002008426
|
||||
215,0.00011331091000465676,0.0002226826880068984
|
||||
217,0.00011541820199636276,0.00022656015200482217
|
||||
219,0.00011745439701189752,0.00023122245600097814
|
||||
221,0.00011965750799572561,0.00023512257800030055
|
||||
223,0.0001217683919967385,0.00023922805799520575
|
||||
225,0.00012395016000664327,0.00024466573499375957
|
||||
227,0.00012613116498687304,0.00024804236000636594
|
||||
229,0.00012834949900570792,0.000252487404999556
|
||||
231,0.00013057007901079486,0.0002572519809909863
|
||||
233,0.00013282599799276795,0.0002619540010055061
|
||||
235,0.00013514574500732125,0.00026779574999818577
|
||||
237,0.00013738959899637848,0.0002701820999936899
|
||||
239,0.00013969385100062937,0.00027524539300065955
|
||||
241,0.00014208142399729695,0.0002794181229983224
|
||||
243,0.00014448049099883065,0.00028530672899796625
|
||||
245,0.00014690193100250326,0.00028930913399381096
|
||||
247,0.0001492562710045604,0.00029400941300264095
|
||||
249,0.0001517066380038159,0.00029908111700206063
|
||||
251,0.00015404325399140362,0.0003059594859951176
|
||||
253,0.0001565057249972597,0.0003120929349970538
|
||||
255,0.00015890150600171183,0.00031580220701289365
|
|
|
@ -0,0 +1,31 @@
|
|||
Damerau Levenshtein
|
||||
-------------------
|
||||
|
||||
Functions
|
||||
^^^^^^^^^
|
||||
|
||||
distance
|
||||
~~~~~~~~
|
||||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.distance
|
||||
|
||||
normalized_distance
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_distance
|
||||
|
||||
similarity
|
||||
~~~~~~~~~~
|
||||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.similarity
|
||||
|
||||
normalized_similarity
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_similarity
|
||||
|
||||
Performance
|
||||
^^^^^^^^^^^
|
||||
The following image shows a benchmark of the Damerau Levenshtein distance in
|
||||
RapidFuzz and jellyfish. Both have a time complexity of ``O(NM)``. However, RapidFuzz
|
||||
only requires ``O(N + M)`` memory, while the implementation in jellyfish
|
||||
has a memory usage of ``O(NM)``.
|
||||
|
||||
.. image:: img/damerau_levenshtein.svg
|
||||
:align: center
|
|
@ -30,12 +30,6 @@ opcodes
|
|||
|
||||
Performance
|
||||
^^^^^^^^^^^
|
||||
Since the Levenshtein module uses different implementations based on the weights
|
||||
used, this leads to different performance characteristics. The following sections
|
||||
show the performance for the different possible weights.
|
||||
|
||||
Indel
|
||||
~~~~~
|
||||
The following image shows a benchmark of the Indel distance in RapidFuzz
|
||||
and python-Levenshtein. Similar to the normal Levenshtein distance
|
||||
python-Levenshtein uses an implementation with a time complexity of ``O(NM)``,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
After Width: | Height: | Size: 38 KiB |
|
@ -25,6 +25,7 @@ Opcodes
|
|||
:maxdepth: 1
|
||||
|
||||
Levenshtein
|
||||
DamerauLevenshtein
|
||||
Indel
|
||||
Hamming
|
||||
Jaro
|
||||
|
|
|
@ -22,7 +22,7 @@ copyright = '2021, Max Bachmann'
|
|||
author = 'Max Bachmann'
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = '2.5.0'
|
||||
release = '2.6.0'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 4fba002adcd18b8847d2ff8ab96add16959ec004
|
||||
Subproject commit f860f64052db553351613ac57bc12a843675e660
|
2
setup.py
2
setup.py
|
@ -11,7 +11,7 @@ with open('README.md', 'rt', encoding="utf8") as f:
|
|||
|
||||
setup_args = {
|
||||
"name": "rapidfuzz",
|
||||
"version": "2.5.0",
|
||||
"version": "2.6.0",
|
||||
"install_requires": ["jarowinkler >= 1.2.0, < 2.0.0"],
|
||||
"extras_require": {'full': ['numpy']},
|
||||
"url": "https://github.com/maxbachmann/RapidFuzz",
|
||||
|
|
|
@ -3,6 +3,6 @@ rapid string matching library
|
|||
"""
|
||||
__author__: str = "Max Bachmann"
|
||||
__license__: str = "MIT"
|
||||
__version__: str = "2.5.0"
|
||||
__version__: str = "2.6.0"
|
||||
|
||||
from rapidfuzz import process, distance, fuzz, string_metric, utils
|
||||
|
|
Loading…
Reference in New Issue