diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3ba1e5f..c179db3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | - python -m pip install Sphinx sphinx_rtd_theme numpy + python -m pip install Sphinx furo numpy python -m pip install . - name: Build Site diff --git a/LICENSE b/LICENSE index 5b55a39..a1313a3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright © 2020 maxbachmann +Copyright © 2020-present Max Bachmann Copyright © 2011 Adam Cohen Permission is hereby granted, free of charge, to any person obtaining diff --git a/docs/contributing.rst b/docs/Contributing.rst similarity index 100% rename from docs/contributing.rst rename to docs/Contributing.rst diff --git a/docs/installation.rst b/docs/Installation.rst similarity index 100% rename from docs/installation.rst rename to docs/Installation.rst diff --git a/docs/license.rst b/docs/License.rst similarity index 100% rename from docs/license.rst rename to docs/License.rst diff --git a/docs/Usage/distance/Hamming.rst b/docs/Usage/distance/Hamming.rst new file mode 100644 index 0000000..aaf2749 --- /dev/null +++ b/docs/Usage/distance/Hamming.rst @@ -0,0 +1,10 @@ +Hamming +-------------------------- + +distance +~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Hamming.distance + +normalized_distance +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Hamming.normalized_distance diff --git a/docs/Usage/distance/Indel.rst b/docs/Usage/distance/Indel.rst new file mode 100644 index 0000000..3c6c406 --- /dev/null +++ b/docs/Usage/distance/Indel.rst @@ -0,0 +1,18 @@ +Indel +------------------------ + +distance +~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Indel.distance + +normalized_distance +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Indel.normalized_distance + +editops +~~~~~~~ +.. autofunction:: rapidfuzz.distance.Indel.editops + +opcodes +~~~~~~~ +.. 
autofunction:: rapidfuzz.distance.Indel.opcodes diff --git a/docs/Usage/distance/Jaro.rst b/docs/Usage/distance/Jaro.rst new file mode 100644 index 0000000..e03250a --- /dev/null +++ b/docs/Usage/distance/Jaro.rst @@ -0,0 +1,6 @@ +Jaro +----------------------- + +similarity +~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Jaro.similarity diff --git a/docs/Usage/distance/JaroWinkler.rst b/docs/Usage/distance/JaroWinkler.rst new file mode 100644 index 0000000..0c852ea --- /dev/null +++ b/docs/Usage/distance/JaroWinkler.rst @@ -0,0 +1,6 @@ +JaroWinkler +------------------------------ + +similarity +~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.JaroWinkler.similarity \ No newline at end of file diff --git a/docs/Usage/distance/Levenshtein.rst b/docs/Usage/distance/Levenshtein.rst new file mode 100644 index 0000000..28c7a7c --- /dev/null +++ b/docs/Usage/distance/Levenshtein.rst @@ -0,0 +1,25 @@ +Levenshtein +------------------------------ +distance +~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Levenshtein.distance + +normalized_distance +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Levenshtein.normalized_distance + +similarity +~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Levenshtein.similarity + +normalized_similarity +~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: rapidfuzz.distance.Levenshtein.normalized_similarity + +editops +~~~~~~~ +.. autofunction:: rapidfuzz.distance.Levenshtein.editops + +opcodes +~~~~~~~ +.. 
autofunction:: rapidfuzz.distance.Levenshtein.opcodes \ No newline at end of file diff --git a/docs/img/indel_levenshtein.svg b/docs/Usage/distance/img/indel_levenshtein.svg similarity index 100% rename from docs/img/indel_levenshtein.svg rename to docs/Usage/distance/img/indel_levenshtein.svg diff --git a/docs/img/uniform_levenshtein.svg b/docs/Usage/distance/img/uniform_levenshtein.svg similarity index 100% rename from docs/img/uniform_levenshtein.svg rename to docs/Usage/distance/img/uniform_levenshtein.svg diff --git a/docs/Usage/distance/index.rst b/docs/Usage/distance/index.rst new file mode 100644 index 0000000..85fe325 --- /dev/null +++ b/docs/Usage/distance/index.rst @@ -0,0 +1,21 @@ +distance +================== + +rapidfuzz.distance.Editops +------------------------------ +.. autoclass:: rapidfuzz.distance.Editops + :members: + +rapidfuzz.distance.Opcodes +------------------------------ +.. autoclass:: rapidfuzz.distance.Opcodes + :members: + +.. toctree:: + :maxdepth: 1 + + Levenshtein + Indel + Hamming + Jaro + JaroWinkler diff --git a/docs/fuzz.rst b/docs/Usage/fuzz.rst similarity index 96% rename from docs/fuzz.rst rename to docs/Usage/fuzz.rst index b53fb94..bf74f2b 100644 --- a/docs/fuzz.rst +++ b/docs/Usage/fuzz.rst @@ -1,5 +1,5 @@ -fuzz module -=========== +rapidfuzz.fuzz +============== ratio ----- diff --git a/docs/img/RapidFuzz.svg b/docs/Usage/img/RapidFuzz.svg similarity index 100% rename from docs/img/RapidFuzz.svg rename to docs/Usage/img/RapidFuzz.svg diff --git a/docs/img/WRatio.svg b/docs/Usage/img/WRatio.svg similarity index 100% rename from docs/img/WRatio.svg rename to docs/Usage/img/WRatio.svg diff --git a/docs/img/extractOne.svg b/docs/Usage/img/extractOne.svg similarity index 100% rename from docs/img/extractOne.svg rename to docs/Usage/img/extractOne.svg diff --git a/docs/Usage/img/indel_levenshtein.svg b/docs/Usage/img/indel_levenshtein.svg new file mode 100644 index 0000000..3977ae5 --- /dev/null +++ 
b/docs/Usage/img/indel_levenshtein.svg @@ -0,0 +1,1597 @@ + + + + + + + + 2021-09-15T00:32:08.614768 + image/svg+xml + + + Matplotlib v3.4.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/img/partial_ratio_long_needle.svg b/docs/Usage/img/partial_ratio_long_needle.svg similarity index 100% rename from docs/img/partial_ratio_long_needle.svg rename to docs/Usage/img/partial_ratio_long_needle.svg diff --git a/docs/img/partial_ratio_short_needle.svg b/docs/Usage/img/partial_ratio_short_needle.svg similarity index 100% rename from docs/img/partial_ratio_short_needle.svg rename to docs/Usage/img/partial_ratio_short_needle.svg diff --git a/docs/img/partial_token_ratio.svg b/docs/Usage/img/partial_token_ratio.svg similarity index 100% rename from docs/img/partial_token_ratio.svg 
rename to docs/Usage/img/partial_token_ratio.svg diff --git a/docs/img/partial_token_set_ratio.svg b/docs/Usage/img/partial_token_set_ratio.svg similarity index 100% rename from docs/img/partial_token_set_ratio.svg rename to docs/Usage/img/partial_token_set_ratio.svg diff --git a/docs/img/partial_token_sort_ratio.svg b/docs/Usage/img/partial_token_sort_ratio.svg similarity index 100% rename from docs/img/partial_token_sort_ratio.svg rename to docs/Usage/img/partial_token_sort_ratio.svg diff --git a/docs/img/ratio.svg b/docs/Usage/img/ratio.svg similarity index 100% rename from docs/img/ratio.svg rename to docs/Usage/img/ratio.svg diff --git a/docs/img/scorer.svg b/docs/Usage/img/scorer.svg similarity index 100% rename from docs/img/scorer.svg rename to docs/Usage/img/scorer.svg diff --git a/docs/img/token_ratio.svg b/docs/Usage/img/token_ratio.svg similarity index 100% rename from docs/img/token_ratio.svg rename to docs/Usage/img/token_ratio.svg diff --git a/docs/img/token_set_ratio.svg b/docs/Usage/img/token_set_ratio.svg similarity index 100% rename from docs/img/token_set_ratio.svg rename to docs/Usage/img/token_set_ratio.svg diff --git a/docs/img/token_sort_ratio.svg b/docs/Usage/img/token_sort_ratio.svg similarity index 100% rename from docs/img/token_sort_ratio.svg rename to docs/Usage/img/token_sort_ratio.svg diff --git a/docs/Usage/img/uniform_levenshtein.svg b/docs/Usage/img/uniform_levenshtein.svg new file mode 100644 index 0000000..36d6685 --- /dev/null +++ b/docs/Usage/img/uniform_levenshtein.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/Usage/index.rst b/docs/Usage/index.rst new file mode 100644 index 0000000..e9e46db --- /dev/null +++ b/docs/Usage/index.rst @@ -0,0 +1,12 @@ +Usage +===== + +.. 
toctree:: + :maxdepth: 3 + + process + distance/index + fuzz + string_metric + utils + diff --git a/docs/process.rst b/docs/Usage/process.rst similarity index 88% rename from docs/process.rst rename to docs/Usage/process.rst index 314cd20..95a6a6f 100644 --- a/docs/process.rst +++ b/docs/Usage/process.rst @@ -1,5 +1,5 @@ -process module -============== +rapidfuzz.process +================= cdist ---------- diff --git a/docs/string_metric.rst b/docs/Usage/string_metric.rst similarity index 91% rename from docs/string_metric.rst rename to docs/Usage/string_metric.rst index 5bb756c..e9d730a 100644 --- a/docs/string_metric.rst +++ b/docs/Usage/string_metric.rst @@ -1,5 +1,5 @@ -string_metric module -==================== +rapidfuzz.string_metric +======================= levenshtein ----------- diff --git a/docs/utils.rst b/docs/Usage/utils.rst similarity index 72% rename from docs/utils.rst rename to docs/Usage/utils.rst index e8a0872..b1f6493 100644 --- a/docs/utils.rst +++ b/docs/Usage/utils.rst @@ -1,5 +1,5 @@ -utils module -============== +rapidfuzz.utils +=============== default_process --------------- diff --git a/docs/conf.py b/docs/conf.py index 6869e4f..7eddbba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,7 +48,9 @@ exclude_patterns = [] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = 'furo' +pygments_style = "monokai" +pygments_dark_style = "monokai" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/index.rst b/docs/index.rst index 48af0ea..34fd81d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,37 +1,25 @@ -.. RapidFuzz documentation master file, created by - sphinx-quickstart on Fri Jan 1 19:02:29 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. 
- Welcome to RapidFuzz's documentation! ===================================== .. toctree:: :maxdepth: 2 - :caption: Installation: - installation + Installation .. toctree:: - :maxdepth: 2 - :caption: Usage: + :maxdepth: 3 - fuzz - string_metric - process - utils + Usage/index .. toctree:: :maxdepth: 1 - :caption: Contributing: - contributing + Contributing .. toctree:: :maxdepth: 2 - :caption: License: - license + License Indices and tables ================== diff --git a/src/cython/cpp_process_cdist.pyx b/src/cython/cpp_process_cdist.pyx index 0eed650..991e170 100644 --- a/src/cython/cpp_process_cdist.pyx +++ b/src/cython/cpp_process_cdist.pyx @@ -207,10 +207,12 @@ def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, scorer : Callable, optional Optional callable that is used to calculate the matching score between the query and each choice. This can be: + - a scorer using the RapidFuzz C-API like the builtin scorers in RapidFuzz, which can return a distance or similarity between two strings. Further details can be found here. - a Python function which returns a similarity between two strings in the range 0-100. This is not recommended, since it is far slower than a scorer using the RapidFuzz C-API. + fuzz.ratio is used by default. 
processor : Callable, optional Optional callable that is used to preprocess the strings before @@ -224,11 +226,13 @@ def cdist(queries, choices, *, scorer=ratio, processor=None, score_cutoff=None, dtype : data-type, optional The desired data-type for the result array.Depending on the scorer type the following dtypes are supported: + - similarity: - - np.float32, np.float64 - - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100 + - np.float32, np.float64 + - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100 - distance: - - np.int8, np.int16, np.int32, np.int64 + - np.int8, np.int16, np.int32, np.int64 + If not given, then the type will be np.float32 for similarities and np.int32 for distances. workers : int, optional The calculation is subdivided into workers sections and evaluated in parallel. diff --git a/src/cython/distance/Indel.pyx b/src/cython/distance/Indel.pyx index bb87b01..0dd73ac 100644 --- a/src/cython/distance/Indel.pyx +++ b/src/cython/distance/Indel.pyx @@ -108,14 +108,14 @@ def distance(s1, s2, *, weights=(1,1,1), processor=None, score_cutoff=None): - If the length of the shorter string is ≤ 64 after removing the common affix Hyyrös' lcs algorithm is used, which calculates the Indel distance in - parallel. The algorithm is described by [1]_ and is extended with support + parallel. The algorithm is described by [5]_ and is extended with support for UTF32 in this implementation. The time complexity of this algorithm is ``O(N)``. - If the length of the shorter string is ≥ 64 after removing the common affix a blockwise implementation of the Hyyrös' lcs algorithm is used, which calculates the Levenshtein distance in parallel (64 characters at a time). - The algorithm is described by [1]_. The time complexity of this + The algorithm is described by [5]_. The time complexity of this algorithm is ``O([N/64]M)``. 
The following image shows a benchmark of the Indel distance in RapidFuzz @@ -128,7 +128,7 @@ def distance(s1, s2, *, weights=(1,1,1), processor=None, score_cutoff=None): References ---------- - .. [4] Hyyrö, Heikki. "Bit-Parallel LCS-length Computation Revisited" + .. [5] Hyyrö, Heikki. "Bit-Parallel LCS-length Computation Revisited" Proc. 15th Australasian Workshop on Combinatorial Algorithms (AWOCA 2004). Examples @@ -243,11 +243,11 @@ def editops(s1, s2, *, processor=None): Notes ----- The alignment is calculated using an algorithm of Heikki Hyyrö, which is - described [1]_. It has a time complexity and memory usage of ``O([N/64] * M)``. + described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``. References ---------- - .. [1] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." + .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." Stringology (2004). Examples @@ -289,11 +289,11 @@ def opcodes(s1, s2, *, processor=None): Notes ----- The alignment is calculated using an algorithm of Heikki Hyyrö, which is - described [1]_. It has a time complexity and memory usage of ``O([N/64] * M)``. + described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``. References ---------- - .. [1] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." + .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." Stringology (2004). Examples diff --git a/src/cython/distance/Levenshtein.pyx b/src/cython/distance/Levenshtein.pyx index 5a79af3..351a153 100644 --- a/src/cython/distance/Levenshtein.pyx +++ b/src/cython/distance/Levenshtein.pyx @@ -450,11 +450,11 @@ def editops(s1, s2, *, processor=None): Notes ----- The alignment is calculated using an algorithm of Heikki Hyyrö, which is - described [1]_. It has a time complexity and memory usage of ``O([N/64] * M)``. + described in [8]_. It has a time complexity and memory usage of ``O([N/64] * M)``. References ---------- - .. [1] Hyyrö, Heikki. 
"A Note on Bit-Parallel Alignment Computation." + .. [8] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." Stringology (2004). Examples @@ -496,11 +496,11 @@ def opcodes(s1, s2, *, processor=None): Notes ----- The alignment is calculated using an algorithm of Heikki Hyyrö, which is - described [1]_. It has a time complexity and memory usage of ``O([N/64] * M)``. + described in [9]_. It has a time complexity and memory usage of ``O([N/64] * M)``. References ---------- - .. [1] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." + .. [9] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation." Stringology (2004). Examples diff --git a/src/cython/distance/_initialize.pyx b/src/cython/distance/_initialize.pyx index 0fd7eeb..a16d6cd 100644 --- a/src/cython/distance/_initialize.pyx +++ b/src/cython/distance/_initialize.pyx @@ -174,9 +174,10 @@ cdef class Editops: Each tuple is of the form (tag, src_pos, dest_pos). The tags are strings, with these meanings: - 'replace': s1[src_pos] should be replaced by s2[dest_pos] - 'delete': s1[src_pos] should be deleted. - 'insert': s2[dest_pos] should be inserted at s1[src_pos]. + + 'replace': s1[src_pos] should be replaced by s2[dest_pos] + 'delete': s1[src_pos] should be deleted + 'insert': s2[dest_pos] should be inserted at s1[src_pos] """ def __init__(self, editops=None, src_len=0, dest_len=0): @@ -308,15 +309,14 @@ cdef class Opcodes: tuple preceding it, and likewise for j1 == the previous j2. The tags are strings, with these meanings: - 'replace': s1[i1:i2] should be replaced by s2[j1:j2] - 'delete': s1[i1:i2] should be deleted. - Note that j1==j2 in this case. - 'insert': s2[j1:j2] should be inserted at s1[i1:i1]. - Note that i1==i2 in this case. - 'equal': s1[i1:i2] == s2[j1:j2] + + 'replace': s1[i1:i2] should be replaced by s2[j1:j2] + 'delete': s1[i1:i2] should be deleted. Note that j1==j2 in this case. + 'insert': s2[j1:j2] should be inserted at s1[i1:i1]. Note that i1==i2 in this case. 
+ 'equal': s1[i1:i2] == s2[j1:j2] Note - -------- + ---- Opcodes uses tuples similar to difflib's SequenceMatcher to make them interoperable """