complete basic implementation of rapidfuzz
This commit is contained in:
parent
b8aa20d838
commit
e157e11fa7
|
@ -0,0 +1 @@
|
|||
cpp/test/catch2/catch.hpp -linguist-vendored
|
|
@ -0,0 +1,195 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Generated by Microsoft Visio, SVG Export RapidFuzz.svg Page-1 -->
|
||||
|
||||
<svg
|
||||
xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="5.2963533in"
|
||||
height="1.7699279in"
|
||||
viewBox="0 0 381.33799 127.43484"
|
||||
xml:space="preserve"
|
||||
class="st6"
|
||||
version="1.1"
|
||||
id="svg933"
|
||||
sodipodi:docname="RapidFuzz.svg"
|
||||
style="font-size:12px;overflow:visible;color-interpolation-filters:sRGB;fill:none;fill-rule:evenodd;stroke-linecap:square;stroke-miterlimit:3"
|
||||
inkscape:version="0.92.4 (5da689c313, 2019-01-14)"><metadata
|
||||
id="metadata939"><rdf:RDF><cc:Work
|
||||
rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" /></cc:Work></rdf:RDF></metadata><defs
|
||||
id="defs937" /><sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1137"
|
||||
id="namedview935"
|
||||
showgrid="false"
|
||||
inkscape:zoom="0.60980132"
|
||||
inkscape:cx="165.11513"
|
||||
inkscape:cy="-104.98284"
|
||||
inkscape:window-x="1592"
|
||||
inkscape:window-y="-8"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="svg933" />
|
||||
<v:documentProperties
|
||||
v:langID="1033"
|
||||
v:metric="true"
|
||||
v:viewMarkup="false">
|
||||
<v:userDefs>
|
||||
<v:ud
|
||||
v:nameU="msvNoAutoConnect"
|
||||
v:val="VT0(1):26" />
|
||||
</v:userDefs>
|
||||
</v:documentProperties>
|
||||
|
||||
<style
|
||||
type="text/css"
|
||||
id="style899">
|
||||
<![CDATA[
|
||||
.st1 {fill:none;stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:10}
|
||||
.st2 {fill:#0070c0;stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:10}
|
||||
.st3 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
|
||||
.st4 {fill:#0070c0;font-family:Calibri;font-size:5.33334em}
|
||||
.st5 {fill:#ffffff;font-family:Calibri;font-size:5.33334em;font-style:italic}
|
||||
.st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
|
||||
]]>
|
||||
</style>
|
||||
|
||||
<g
|
||||
v:mID="0"
|
||||
v:index="1"
|
||||
v:groupContext="foregroundPage"
|
||||
id="g931"
|
||||
transform="translate(-142.402,-264.29138)">
|
||||
<title
|
||||
id="title901">Page-1</title>
|
||||
<v:pageProperties
|
||||
v:drawingScale="0.0393701"
|
||||
v:pageScale="0.0393701"
|
||||
v:drawingUnits="24"
|
||||
v:shadowOffsetX="8.50394"
|
||||
v:shadowOffsetY="-8.50394" />
|
||||
<g
|
||||
id="shape1-1"
|
||||
v:mID="1"
|
||||
v:groupContext="shape"
|
||||
transform="translate(147.402,-208.549)">
|
||||
<title
|
||||
id="title903">Rectangle</title>
|
||||
<v:userDefs>
|
||||
<v:ud
|
||||
v:nameU="visVersion"
|
||||
v:val="VT0(15):26" />
|
||||
</v:userDefs>
|
||||
<rect
|
||||
x="0"
|
||||
y="477.84"
|
||||
width="185.994"
|
||||
height="117.435"
|
||||
class="st1"
|
||||
id="rect905"
|
||||
style="fill:none;stroke:#0070c0;stroke-width:10;stroke-linecap:round;stroke-linejoin:round" />
|
||||
</g>
|
||||
<g
|
||||
id="shape2-3"
|
||||
v:mID="2"
|
||||
v:groupContext="shape"
|
||||
transform="translate(332.746,-208.549)">
|
||||
<title
|
||||
id="title908">Rectangle.2</title>
|
||||
<v:userDefs>
|
||||
<v:ud
|
||||
v:nameU="visVersion"
|
||||
v:val="VT0(15):26" />
|
||||
</v:userDefs>
|
||||
<rect
|
||||
x="0"
|
||||
y="477.84"
|
||||
width="185.994"
|
||||
height="117.435"
|
||||
class="st2"
|
||||
id="rect910"
|
||||
style="fill:#0070c0;stroke:#0070c0;stroke-width:10;stroke-linecap:round;stroke-linejoin:round" />
|
||||
</g>
|
||||
<g
|
||||
id="shape3-5"
|
||||
v:mID="3"
|
||||
v:groupContext="shape"
|
||||
transform="translate(162.076,-256.961)">
|
||||
<title
|
||||
id="title913">Sheet.3</title>
|
||||
<desc
|
||||
id="desc915">Rapid</desc>
|
||||
<v:textBlock
|
||||
v:margins="rect(4,4,4,4)"
|
||||
v:tabSpace="42.5197" />
|
||||
<v:textRect
|
||||
cx="78.322"
|
||||
cy="584.97"
|
||||
width="156.65"
|
||||
height="20.611" />
|
||||
<rect
|
||||
x="0"
|
||||
y="574.66498"
|
||||
width="156.644"
|
||||
height="20.611"
|
||||
class="st3"
|
||||
id="rect917"
|
||||
style="fill:none;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:round" />
|
||||
<text
|
||||
x="4.6500001"
|
||||
y="604.16998"
|
||||
class="st4"
|
||||
v:langID="1031"
|
||||
id="text919"
|
||||
style="font-size:64.00008392px;font-family:Calibri;fill:#0070c0"><v:paragraph
|
||||
v:horizAlign="1" /><v:tabList />Rapid</text>
|
||||
</g>
|
||||
<g
|
||||
id="shape4-8"
|
||||
v:mID="4"
|
||||
v:groupContext="shape"
|
||||
transform="translate(347.421,-256.961)">
|
||||
<title
|
||||
id="title922">Sheet.4</title>
|
||||
<desc
|
||||
id="desc924">Fuzz</desc>
|
||||
<v:textBlock
|
||||
v:margins="rect(4,4,4,4)"
|
||||
v:tabSpace="42.5197" />
|
||||
<v:textRect
|
||||
cx="78.322"
|
||||
cy="584.97"
|
||||
width="156.65"
|
||||
height="20.611" />
|
||||
<rect
|
||||
x="0"
|
||||
y="574.66498"
|
||||
width="156.644"
|
||||
height="20.611"
|
||||
class="st3"
|
||||
id="rect926"
|
||||
style="fill:none;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:round" />
|
||||
<text
|
||||
x="21.879999"
|
||||
y="604.16998"
|
||||
class="st5"
|
||||
v:langID="1031"
|
||||
id="text928"
|
||||
style="font-style:italic;font-size:64.00008392px;font-family:Calibri;fill:#ffffff"><v:paragraph
|
||||
v:horizAlign="1" /><v:tabList />Fuzz</text>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 5.4 KiB |
|
@ -0,0 +1,40 @@
|
|||
name: Python package
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
branches: [ master ]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [3.5, 3.6, 3.7, 3.8]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install wheel pybind11
|
||||
#pip install -r requirements.txt
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
pip install flake8
|
||||
# stop the build if there are Python syntax errors or undefined names
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
- name: build package
|
||||
run: python3 setup.py bdist_wheel
|
||||
#- name: Test with pytest
|
||||
# run: |
|
||||
# pip install pytest
|
||||
# pytest
|
|
@ -1,4 +1,8 @@
|
|||
.vscode/
|
||||
rapidfuzz.egg-info/
|
||||
dist/
|
||||
*.data
|
||||
*.so
|
||||
*.o
|
||||
*.out
|
||||
test.py
|
|
@ -0,0 +1,3 @@
|
|||
include README.md
|
||||
include VERSION
|
||||
include cpp/src/*.hpp
|
71
README.md
71
README.md
|
@ -1,11 +1,64 @@
|
|||
# RapidFuzz
|
||||
<h1 align="center">
|
||||
<img src="https://raw.githubusercontent.com/maxbachmann/rapidfuzz/master/.github/RapidFuzz.svg?sanitize=true" alt="RapidFuzz" width="400">
|
||||
</h1>
|
||||
<h4 align="center">Rapid fuzzy string matching in Python and C++ using the Levenshtein Distance</h4>
|
||||
|
||||
# Roadmap
|
||||
- [ ] add string matching for strings with a big length difference
|
||||
- [ ] add Python wrapper
|
||||
- [ ] add Rust version of the code
|
||||
<p align="center">
|
||||
<a href="https://github.com/maxbachmann/rapidfuzz/actions">
|
||||
<img src="https://github.com/maxbachmann/rapidfuzz/workflows/Python%20package/badge.svg"
|
||||
alt="Continous Integration">
|
||||
</a>
|
||||
<a href="https://github.com/maxbachmann/rapidfuzz/blob/dev/LICENSE">
|
||||
<img src="https://img.shields.io/github/license/rhasspy/rapidfuzz">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
# License
|
||||
RapidFuzz itself is licensed under the MIT license, so feel free to do what you want with the code.
|
||||
It is based on an old version of fuzzywuzzy that was Licensed under a MIT License.
|
||||
A Fork of this old version can be found [here](https://github.com/rhasspy/fuzzywuzzy).
|
||||
<p align="center">
|
||||
<a href="#why-should-you-care">Why Should You Care?</a> •
|
||||
<a href="#installation">Installation</a> •
|
||||
<a href="#usage">Usage</a> •
|
||||
<a href="#roadmap">Roadmap</a> •
|
||||
<a href="#license">License</a>
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
## Why Should You Care?
|
||||
Since there is already [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy) that implements the same string similarity calculations you might wonder why you would want to use RapidFuzz. There are mainly two reasons:
|
||||
1) It is MIT licensed so in contrast to FuzzyWuzzy it can be used in projects where you do not want to adopt the GPL License
|
||||
2) While FuzzyWuzzy only used python-Levenshtein for the levenshtein calculations and implements the other functionalities in Python, RapidFuzz's implementation is mostly written in C++ and on Top of this comes with a lot of Algorithmic improvements. This results in a 5-300x Speedup in String Matching.
|
||||
|
||||
|
||||
## Installation
|
||||
RapidFuzz can be installed using [pip](https://pypi.org/project/rapidfuzz/)
|
||||
```bash
|
||||
$ pip install rapidfuzz
|
||||
```
|
||||
it requires Python 3.5 or later and a C++ Compiler with C++17 support, which should be given on all current systems
|
||||
|
||||
|
||||
## Usage
|
||||
```
|
||||
> from rapidfuzz import fuzz
|
||||
> from rapidfuzz import process
|
||||
```
|
||||
|
||||
### Simple Ratio
|
||||
|
||||
### Partial Ratio
|
||||
|
||||
### Token Sort Ratio
|
||||
|
||||
### Token Set Ratio
|
||||
|
||||
### Process
|
||||
|
||||
|
||||
## Roadmap
|
||||
- [ ] build python wheels using manylinux container in CI
|
||||
- [ ] add more Unit tests and run them in CI
|
||||
- [ ] add more Benchmarks and run them in CI
|
||||
|
||||
## License
|
||||
RapidFuzz is licensed under the MIT license since we believe that everyone should be able to use it without being forced to adopt our license. Thats why the library is based on an older version of fuzzywuzzy that was MIT licensed aswell.
|
||||
A Fork of this old version of fuzzywuzzy can be found [here](https://github.com/rhasspy/fuzzywuzzy).
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
#include <benchmark/benchmark.h>
|
||||
#include "../src/levenshtein.hpp"
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
// Define another benchmark
|
||||
static void BM_LevWeightedDist1(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::weighted_distance(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_LevWeightedDist2(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
std::string_view b = "bbbbbbbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::weighted_distance(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
static void BM_LevWeightedDist3(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
std::string_view b = "bbbbbbbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::weighted_distance(a, b, 20));
|
||||
}
|
||||
state.SetLabel("Different Strings with max distance (no early exit)");
|
||||
}
|
||||
|
||||
static void BM_LevWeightedDist4(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
std::string_view b = "bbbbbbbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::weighted_distance(a, b, 5));
|
||||
}
|
||||
state.SetLabel("Different Strings with max distance (early exit)");
|
||||
}
|
||||
|
||||
static void BM_LevNormWeightedDist1(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::normalized_weighted_distance(a, a));
|
||||
}
|
||||
state.SetLabel("Similar Strings");
|
||||
}
|
||||
|
||||
static void BM_LevNormWeightedDist2(benchmark::State &state) {
|
||||
std::string_view a = "aaaaaaaaaa";
|
||||
std::string_view b = "bbbbbbbbbb";
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(levenshtein::normalized_weighted_distance(a, b));
|
||||
}
|
||||
state.SetLabel("Different Strings");
|
||||
}
|
||||
|
||||
|
||||
BENCHMARK(BM_LevWeightedDist1);
|
||||
BENCHMARK(BM_LevWeightedDist2);
|
||||
BENCHMARK(BM_LevWeightedDist3);
|
||||
BENCHMARK(BM_LevWeightedDist4);
|
||||
|
||||
BENCHMARK(BM_LevNormWeightedDist1);
|
||||
BENCHMARK(BM_LevNormWeightedDist2);
|
||||
|
||||
BENCHMARK_MAIN();
|
104
cpp/fuzz.hpp
104
cpp/fuzz.hpp
|
@ -1,104 +0,0 @@
|
|||
#pragma once
|
||||
#include "levenshtein.hpp"
|
||||
#include "utils.hpp"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
float token_ratio(const std::string &a, const std::string &b) {
|
||||
std::vector<std::string_view> tokens_a = splitSV(a);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::string_view> tokens_b = splitSV(b);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
float result = normalized_levenshtein(tokens_a, tokens_b);
|
||||
|
||||
tokens_a.erase(std::unique(tokens_a.begin(), tokens_a.end()), tokens_a.end());
|
||||
tokens_b.erase(std::unique(tokens_b.begin(), tokens_b.end()), tokens_b.end());
|
||||
|
||||
auto intersection = intersection_count_sorted_vec(tokens_a, tokens_b);
|
||||
|
||||
size_t ab_len = joinedStringViewLength(intersection.ab);
|
||||
size_t ba_len = joinedStringViewLength(intersection.ba);
|
||||
|
||||
if (!ab_len || !ba_len) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
size_t double_prefix = 2 * joinedStringViewLength(intersection.ba);
|
||||
if (double_prefix) {
|
||||
++ab_len;
|
||||
++ba_len;
|
||||
}
|
||||
|
||||
result = std::max(result,
|
||||
(float)1.0 - (float)ab_len / (float)(ab_len + double_prefix));
|
||||
result = std::max(result,
|
||||
(float)1.0 - (float)ba_len / (float)(ba_len + double_prefix));
|
||||
size_t lensum = ab_len + ba_len + double_prefix;
|
||||
return std::max(result,
|
||||
(float)1.0 - levenshtein(intersection.ab, intersection.ba) / (float)lensum);
|
||||
}
|
||||
|
||||
|
||||
uint8_t full_ratio(const std::string &query, const std::string &choice,
|
||||
uint8_t score_cutoff) {
|
||||
float sratio = normalized_levenshtein(query, choice);
|
||||
const float UNBASE_SCALE = 0.95;
|
||||
float min_ratio = std::max((float)score_cutoff / (float)100.0, sratio);
|
||||
if (min_ratio < UNBASE_SCALE) {
|
||||
sratio = std::max(sratio, token_ratio(query, choice) * UNBASE_SCALE);
|
||||
}
|
||||
return static_cast<uint8_t>(std::round(sratio * 100.0));
|
||||
}
|
||||
|
||||
|
||||
/*uint8_t partial_ratio(const std::string &query, const std::string &choice,
|
||||
uint8_t partial_scale, uint8_t score_cutoff)
|
||||
{
|
||||
float sratio = normalized_levenshtein(query, choice);
|
||||
float min_ratio = std::max(sratio, (float)score_cutoff / (float)100);
|
||||
if (min_ratio < partial_scale) {
|
||||
sratio = std::max(sratio, partial_string_ratio(query, choice) * partial_scale);
|
||||
min_ratio = std::max(sratio, min_ratio);
|
||||
const float UNBASE_SCALE = 0.95;
|
||||
if (min_ratio < UNBASE_SCALE * partial_scale) {
|
||||
sratio = std::max(sratio, partial_token_ratio(query, choice) * UNBASE_SCALE * partial_scale );
|
||||
}
|
||||
}
|
||||
return static_cast<uint8_t>(std::round(sratio * 100.0));
|
||||
}*/
|
||||
|
||||
|
||||
uint8_t ratio(const std::string &query, const std::string &choice,
|
||||
uint8_t score_cutoff) {
|
||||
if (query == choice) {
|
||||
return 100;
|
||||
}
|
||||
|
||||
if (query.empty() || choice.empty() || score_cutoff == 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t len_a = query.length();
|
||||
size_t len_b = choice.length();
|
||||
float len_ratio;
|
||||
if (len_a > len_b) {
|
||||
len_ratio = (float)len_a / (float)len_b;
|
||||
} else {
|
||||
len_ratio = (float)len_b / (float)len_a;
|
||||
}
|
||||
|
||||
if (len_ratio < 1.5) {
|
||||
return full_ratio(query, choice, score_cutoff);
|
||||
// TODO: this is still missing
|
||||
} else if (len_ratio < 8.0) {
|
||||
return 0.0;
|
||||
// return partial_ratio(query, choice, 0.9, score_cutoff);
|
||||
} else {
|
||||
return 0.0;
|
||||
// return partial_ratio(query, choice, 0.6, score_cutoff);
|
||||
}
|
||||
}
|
|
@ -1,156 +0,0 @@
|
|||
#pragma once
|
||||
#include "utils.hpp"
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
|
||||
void levenshtein_word_cmp(const char &letter_cmp,
|
||||
const std::vector<std::string_view> &words,
|
||||
std::vector<size_t> &cache, size_t distance_b)
|
||||
{
|
||||
size_t result = distance_b + 1;
|
||||
auto cache_iter = cache.begin();
|
||||
auto word_iter = words.begin();
|
||||
|
||||
auto charCmp = [&] (const char &char2) {
|
||||
if (letter_cmp == char2) { result = distance_b - 1; }
|
||||
else { ++result; }
|
||||
|
||||
distance_b = *cache_iter;
|
||||
if (result > distance_b + 1) {
|
||||
result = distance_b + 1;
|
||||
}
|
||||
*cache_iter = result;
|
||||
++cache_iter;
|
||||
};
|
||||
|
||||
// words | view::join(' ') would be a bit nicer to write here but is a lot slower
|
||||
// might be worth a retry when it is added in c++20 since then compilers might
|
||||
// improve the runtime
|
||||
|
||||
// no whitespace should be added in front of the first word
|
||||
for (const auto &letter : *word_iter) {
|
||||
charCmp(letter);
|
||||
}
|
||||
++word_iter;
|
||||
|
||||
for (; word_iter != words.end(); ++word_iter) {
|
||||
// between every word there should be a whitespace
|
||||
charCmp(' ');
|
||||
// check following word
|
||||
for (const auto &letter : *word_iter) {
|
||||
charCmp(letter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
size_t levenshtein(std::vector<std::string_view> sentence1,
|
||||
std::vector<std::string_view> sentence2) {
|
||||
remove_common_affix(sentence1, sentence2);
|
||||
size_t sentence1_len = joinedStringViewLength(sentence1);
|
||||
size_t sentence2_len = joinedStringViewLength(sentence2);
|
||||
|
||||
// exit early when one sentence is empty
|
||||
// (empty sentence would cause undefinded behaviour)
|
||||
if (!sentence1_len) {
|
||||
return sentence2_len;
|
||||
}
|
||||
if (!sentence2_len) {
|
||||
return sentence1_len;
|
||||
}
|
||||
|
||||
std::vector<size_t> cache(sentence2_len);
|
||||
std::iota(cache.begin(), cache.end(), 1);
|
||||
|
||||
size_t range1_pos = 0;
|
||||
auto word1_iter = sentence1.begin();
|
||||
|
||||
// no whitespace in front of first word
|
||||
size_t distance_b = range1_pos;
|
||||
for (const auto &letter : *word1_iter) {
|
||||
distance_b = range1_pos;
|
||||
levenshtein_word_cmp(letter, sentence2, cache, distance_b);
|
||||
++range1_pos;
|
||||
}
|
||||
|
||||
++word1_iter;
|
||||
for (; word1_iter != sentence1.end(); ++word1_iter) {
|
||||
distance_b = range1_pos;
|
||||
|
||||
// whitespace between words
|
||||
distance_b = range1_pos;
|
||||
levenshtein_word_cmp(' ', sentence2, cache, distance_b);
|
||||
++range1_pos;
|
||||
|
||||
for (const auto &letter : *word1_iter) {
|
||||
distance_b = range1_pos;
|
||||
levenshtein_word_cmp(letter, sentence2, cache, distance_b);
|
||||
++range1_pos;
|
||||
}
|
||||
}
|
||||
return cache.back();
|
||||
}
|
||||
|
||||
|
||||
float normalized_levenshtein(std::vector<std::string_view> sentence1,
|
||||
std::vector<std::string_view> sentence2) {
|
||||
if (sentence1.empty() && sentence2.empty()) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
size_t lensum = joinedStringViewLength(sentence1) + joinedStringViewLength(sentence2);
|
||||
size_t distance = levenshtein(sentence1, sentence2);
|
||||
return 1.0 - (float)distance / (float)lensum;
|
||||
}
|
||||
|
||||
|
||||
size_t levenshtein(std::string_view sentence1, std::string_view sentence2) {
|
||||
remove_common_affix(sentence1, sentence2);
|
||||
|
||||
if (sentence1.empty()) {
|
||||
return sentence2.length();
|
||||
}
|
||||
if (sentence2.empty()) {
|
||||
return sentence1.length();
|
||||
}
|
||||
|
||||
std::vector<size_t> cache(sentence2.length());
|
||||
std::iota(cache.begin(), cache.end(), 1);
|
||||
|
||||
size_t sentence1_pos = 0;
|
||||
size_t result = sentence2.length();
|
||||
for (const auto &char1 : sentence1) {
|
||||
size_t distance_b = sentence1_pos;
|
||||
result = sentence1_pos + 1;
|
||||
auto cache_iter = cache.begin();
|
||||
for (const auto &char2 : sentence2) {
|
||||
if (char1 == char2) {
|
||||
result = distance_b - 1;
|
||||
} else {
|
||||
++result;
|
||||
}
|
||||
distance_b = *cache_iter;
|
||||
if (result > distance_b + 1) {
|
||||
result = distance_b + 1;
|
||||
}
|
||||
*cache_iter = result;
|
||||
++cache_iter;
|
||||
}
|
||||
++sentence1_pos;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
float normalized_levenshtein(std::string_view sentence1, std::string_view sentence2) {
|
||||
if (sentence1.empty() && sentence2.empty()) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
size_t lensum = sentence1.length() + sentence2.length();
|
||||
size_t distance = levenshtein(sentence1, sentence2);
|
||||
return 1.0 - (float)distance / (float)lensum;
|
||||
}
|
20
cpp/main.cpp
20
cpp/main.cpp
|
@ -1,20 +0,0 @@
|
|||
#include "process.hpp"
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Define another benchmark
|
||||
static void BM_StringCopy(benchmark::State &state) {
|
||||
std::string a =
|
||||
"please add bananas to my shopping list I am a really reeally cool guy";
|
||||
std::vector<std::string> b(
|
||||
1000,
|
||||
"whats the weather like in Paris I am john Peter the new guy in class");
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(extract_one(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK(BM_StringCopy);
|
||||
|
||||
BENCHMARK_MAIN();
|
|
@ -1,22 +0,0 @@
|
|||
#pragma once
|
||||
#include "fuzz.hpp"
|
||||
#include <execution>
|
||||
|
||||
uint8_t extract_one(std::string query, std::vector<std::string> choices,
|
||||
uint8_t score_cutoff = 0) {
|
||||
uint8_t max_score = 0;
|
||||
for (const auto &choice : choices) {
|
||||
uint8_t score = ratio(query, choice, score_cutoff);
|
||||
if (score > score_cutoff) {
|
||||
score_cutoff = score;
|
||||
max_score = score;
|
||||
}
|
||||
}
|
||||
return max_score;
|
||||
}
|
||||
|
||||
/*std::transform(std::execution::par,
|
||||
b.begin(), b.end(), out.begin(),
|
||||
[a](std::string elem) { return ratio(a, elem, 0); }
|
||||
|
||||
);*/
|
|
@ -0,0 +1,193 @@
|
|||
#include "fuzz.hpp"
|
||||
#include "levenshtein.hpp"
|
||||
#include "utils.hpp"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
|
||||
decimal partial_string_ratio(const std::string &a, const std::string &b, decimal score_cutoff=0.0) {
|
||||
if (a.empty() || b.empty()) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
std::string_view shorter;
|
||||
std::string_view longer;
|
||||
|
||||
if (a.length() > b.length()) {
|
||||
shorter = b;
|
||||
longer = a;
|
||||
} else {
|
||||
shorter = a;
|
||||
longer = b;
|
||||
}
|
||||
|
||||
auto blocks = levenshtein::matching_blocks(shorter, longer);
|
||||
float max_ratio = 0;
|
||||
for (const auto &block : blocks) {
|
||||
size_t long_start = (block.second_start > block.first_start) ? block.second_start - block.first_start : 0;
|
||||
std::string_view long_substr = longer.substr(long_start, shorter.length());
|
||||
|
||||
float ls_ratio = levenshtein::normalized_weighted_distance(shorter, long_substr, score_cutoff);
|
||||
|
||||
if (ls_ratio > 0.995) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
if (ls_ratio > max_ratio) {
|
||||
max_ratio = ls_ratio;
|
||||
}
|
||||
}
|
||||
|
||||
return max_ratio;
|
||||
}
|
||||
|
||||
|
||||
static percent full_ratio(const std::string &a, const std::string &b, percent score_cutoff=0) {
|
||||
float sratio = fuzz::ratio(a, b, score_cutoff);
|
||||
|
||||
const float UNBASE_SCALE = 95;
|
||||
float min_ratio = std::max(score_cutoff, sratio);
|
||||
if (min_ratio < UNBASE_SCALE) {
|
||||
sratio = std::max(sratio, fuzz::token_ratio(a, b, score_cutoff/UNBASE_SCALE) * UNBASE_SCALE);
|
||||
}
|
||||
|
||||
return sratio;
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::ratio(const std::string &a, const std::string &b, percent score_cutoff) {
|
||||
return levenshtein::normalized_weighted_distance(a, b, score_cutoff / 100) * 100;
|
||||
}
|
||||
|
||||
|
||||
decimal fuzz::token_ratio(const std::string &a, const std::string &b, decimal score_cutoff) {
|
||||
std::vector<std::string_view> tokens_a = splitSV(a);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::string_view> tokens_b = splitSV(b);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto decomposition = set_decomposition(tokens_a, tokens_b);
|
||||
|
||||
size_t ab_len = recursiveIterableSize(decomposition.difference_ab, 1);
|
||||
size_t ba_len = recursiveIterableSize(decomposition.difference_ba, 1);
|
||||
size_t double_prefix = 2 * recursiveIterableSize(decomposition.intersection, 1);
|
||||
|
||||
// fuzzywuzzy joined sect and ab/ba for comparisions
|
||||
// this is not done here as an optimisation, so the lengths get incremented by 1
|
||||
// since there would be a whitespace between the joined strings
|
||||
if (double_prefix) {
|
||||
// exit early since this will always result in a ratio of 1
|
||||
if (!ab_len || !ba_len) return 1.0;
|
||||
|
||||
++ab_len;
|
||||
++ba_len;
|
||||
}
|
||||
|
||||
float result = levenshtein::normalized_weighted_distance(tokens_a, tokens_b, score_cutoff, " ");
|
||||
size_t lensum = ab_len + ba_len + double_prefix;
|
||||
|
||||
// could add score cutoff aswell, but would need to copy most things from normalized_score_cutoff
|
||||
// as an alternative add another utility function to levenshtein for this case
|
||||
size_t sect_distance = levenshtein::weighted_distance(decomposition.difference_ab, decomposition.difference_ba, " ");
|
||||
if (sect_distance != std::numeric_limits<size_t>::max()) {
|
||||
result = std::max(result, (float)1.0 - sect_distance / (float)lensum);
|
||||
}
|
||||
|
||||
// exit early since the other ratios are 0
|
||||
// (when a or b was empty they would cause a segfault)
|
||||
if (!double_prefix) {
|
||||
return result;
|
||||
}
|
||||
|
||||
return std::max({
|
||||
result,
|
||||
// levenshtein distances sect+ab <-> sect and sect+ba <-> sect
|
||||
// would exit early after removing the prefix sect, so the distance can be directly calculated
|
||||
(float)1.0 - (float)ab_len / (float)(ab_len + double_prefix),
|
||||
(float)1.0 - (float)ba_len / (float)(ba_len + double_prefix)
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// combines token_set and token_sort ratio from fuzzywuzzy so it is only required to
|
||||
// do a lot of operations once
|
||||
decimal partial_token_ratio(const std::string &a, const std::string &b, decimal score_cutoff=0.0) {
|
||||
// probably faster to split the String view already sorted
|
||||
std::vector<std::string_view> tokens_a = splitSV(a);
|
||||
std::sort(tokens_a.begin(), tokens_a.end());
|
||||
std::vector<std::string_view> tokens_b = splitSV(b);
|
||||
std::sort(tokens_b.begin(), tokens_b.end());
|
||||
|
||||
auto unique_a = tokens_a;
|
||||
auto unique_b = tokens_b;
|
||||
unique_a.erase(std::unique(unique_a.begin(), unique_a.end()), unique_a.end());
|
||||
unique_b.erase(std::unique(unique_b.begin(), unique_b.end()), unique_b.end());
|
||||
|
||||
std::vector<std::string_view> difference_ab;
|
||||
std::vector<std::string_view> difference_ba;
|
||||
|
||||
std::set_difference(unique_a.begin(), unique_a.end(), unique_b.begin(), unique_b.end(),
|
||||
std::inserter(difference_ab, difference_ab.begin()));
|
||||
std::set_difference(unique_b.begin(), unique_b.end(), unique_a.begin(), unique_a.end(),
|
||||
std::inserter(difference_ba, difference_ba.begin()));
|
||||
|
||||
// exit early when there is a common word in both sequences
|
||||
if (difference_ab.size() < unique_a.size()) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
return std::max(
|
||||
partial_string_ratio(sentence_join(tokens_a), sentence_join(tokens_b), score_cutoff),
|
||||
partial_string_ratio(sentence_join(difference_ab), sentence_join(difference_ba), score_cutoff)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
percent partial_ratio(const std::string &query, const std::string &choice, decimal partial_scale, percent score_cutoff) {
|
||||
const float UNBASE_SCALE = 0.95;
|
||||
|
||||
float sratio = levenshtein::normalized_weighted_distance(query, choice, score_cutoff/100);
|
||||
|
||||
float min_ratio = std::max(score_cutoff/100, sratio);
|
||||
if (min_ratio < partial_scale) {
|
||||
min_ratio /= partial_scale;
|
||||
sratio = std::max(sratio, partial_string_ratio(query, choice, min_ratio) * partial_scale);
|
||||
min_ratio = std::max(min_ratio, sratio);
|
||||
|
||||
if (min_ratio < UNBASE_SCALE) {
|
||||
min_ratio /= UNBASE_SCALE;
|
||||
sratio = std::max(sratio, partial_token_ratio(query, choice, min_ratio) * UNBASE_SCALE * partial_scale );
|
||||
}
|
||||
}
|
||||
return sratio * 100;
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::QRatio(const std::string &a, const std::string &b, percent score_cutoff) {
|
||||
if (score_cutoff == 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ratio(a, b, score_cutoff);
|
||||
}
|
||||
|
||||
|
||||
percent fuzz::WRatio(const std::string &a, const std::string &b, percent score_cutoff) {
|
||||
if (score_cutoff == 100) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t len_a = a.length();
|
||||
size_t len_b = b.length();
|
||||
float len_ratio = (len_a > len_b) ? (float)len_a / (float)len_b : (float)len_b / (float)len_a;
|
||||
|
||||
if (len_ratio < 1.5) {
|
||||
return full_ratio(a, b, score_cutoff);
|
||||
} else if (len_ratio < 8.0) {
|
||||
return partial_ratio(a, b, 0.9, score_cutoff);
|
||||
} else {
|
||||
return partial_ratio(a, b, 0.6, score_cutoff);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
#pragma once
|
||||
#include <string>
|
||||
|
||||
// 0.0% - 100.0%
|
||||
using percent = float;
|
||||
|
||||
// 0.0 - 1.0
|
||||
using decimal = float;
|
||||
|
||||
namespace fuzz {
|
||||
float ratio(const std::string &a, const std::string &b, float score_cutoff=0.0);
|
||||
float token_ratio(const std::string &a, const std::string &b, float score_cutoff=0.0);
|
||||
|
||||
percent QRatio(const std::string &a, const std::string &b, percent score_cutoff = 0);
|
||||
percent WRatio(const std::string &a, const std::string &b, percent score_cutoff = 0);
|
||||
}
|
|
@ -0,0 +1,406 @@
|
|||
#include "levenshtein.hpp"
|
||||
#include <numeric>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
/**
 * Builds the full Levenshtein DP matrix for the two sentences.
 * The common prefix/suffix is stripped first, so the matrix only covers the
 * differing middle parts; `prefix_len` in the result records the offset.
 *
 * Layout: one flat vector of matrix_columns * matrix_rows cells, indexed as
 * [column * matrix_rows + row], where columns track sentence1 and rows track
 * sentence2.
 */
levenshtein::Matrix levenshtein::matrix(std::string_view sentence1, std::string_view sentence2) {
  Affix affix = remove_common_affix(sentence1, sentence2);

  size_t matrix_columns = sentence1.length() + 1;
  size_t matrix_rows = sentence2.length() + 1;

  std::vector<size_t> cache_matrix(matrix_rows*matrix_columns, 0);

  // first column: distance from the empty prefix of sentence1 (0,1,2,...)
  for (size_t i = 0; i < matrix_rows; ++i) {
    cache_matrix[i] = i;
  }

  // first cell of every column: distance from the empty prefix of sentence2
  for (size_t i = 1; i < matrix_columns; ++i) {
    cache_matrix[matrix_rows*i] = i;
  }

  size_t sentence1_pos = 0;
  for (const auto &char1 : sentence1) {
    // prev_cache walks the previously finished column; result_cache fills the
    // current one, skipping its first (pre-initialised) cell
    auto prev_cache = cache_matrix.begin() + sentence1_pos * matrix_rows;
    auto result_cache = cache_matrix.begin() + (sentence1_pos + 1) * matrix_rows + 1;
    size_t result = sentence1_pos + 1;
    for (const auto &char2 : sentence2) {
      // candidates: insertion, substitution (free on match), deletion.
      // note *(++prev_cache) also advances the diagonal pointer for the
      // next iteration
      result = std::min({
        result + 1,
        *prev_cache + (char1 != char2),
        *(++prev_cache) + 1
      });
      *result_cache = result;
      ++result_cache;
    }
    ++sentence1_pos;
  }

  return Matrix {
    affix.prefix_len,
    cache_matrix,
    matrix_columns,
    matrix_rows
  };
}
|
||||
|
||||
|
||||
/**
 * Extracts the edit operations (replace/insert/delete) that convert sentence1
 * into sentence2, by backtracking through the Levenshtein matrix from the
 * bottom-right corner. Keep steps are skipped, not recorded.
 * Reported positions are relative to the original strings: the common prefix
 * stripped inside matrix() is added back via prefix_len.
 *
 * Throws std::logic_error if no backtracking step applies (should be
 * impossible for a well-formed matrix).
 */
std::vector<levenshtein::EditOp> levenshtein::editops(std::string_view sentence1, std::string_view sentence2) {
  auto lev_matrix = matrix(sentence1, sentence2);
  size_t matrix_columns = lev_matrix.matrix_columns;
  size_t matrix_rows = lev_matrix.matrix_rows;
  size_t prefix_len = lev_matrix.prefix_len;
  auto matrix = lev_matrix.matrix;

  std::vector<EditOp> ops;
  // the bottom-right cell holds the edit distance == number of ops
  ops.reserve(matrix[matrix_columns * matrix_rows - 1]);

  size_t i = matrix_columns - 1;
  size_t j = matrix_rows - 1;
  size_t pos = matrix_columns * matrix_rows - 1;

  // predicates identifying which neighbour the current cell was derived from
  // (diagonal = replace/keep, left = delete, up = insert)
  auto is_replace = [=](size_t pos) {
    return matrix[pos - matrix_rows - 1] < matrix[pos];
  };
  auto is_insert = [=](size_t pos) {
    return matrix[pos - 1] < matrix[pos];
  };
  auto is_delete = [=](size_t pos) {
    return matrix[pos - matrix_rows] < matrix[pos];
  };
  auto is_keep = [=](size_t pos) {
    return matrix[pos - matrix_rows - 1] == matrix[pos];
  };

  while (i > 0 || j > 0) {
    EditType op_type;

    if (i && j && is_replace(pos)) {
      op_type = EditType::EditReplace;
      --i;
      --j;
      pos -= matrix_rows + 1;
    } else if (j && is_insert(pos)) {
      op_type = EditType::EditInsert;
      --j;
      --pos;
    } else if (i && is_delete(pos)) {
      op_type = EditType::EditDelete;
      --i;
      pos -= matrix_rows;
    } else if (is_keep(pos)) {
      --i;
      --j;
      pos -= matrix_rows + 1;
      // EditKeep does not have to be stored
      continue;
    } else {
      throw std::logic_error("something went wrong extracting the editops from the levenshtein matrix");
    }

    ops.emplace_back(op_type, i + prefix_len, j + prefix_len);
  }

  // ops were collected back-to-front while backtracking
  std::reverse(ops.begin(), ops.end());
  return ops;
}
|
||||
|
||||
|
||||
/**
 * Converts the edit-op list into runs of matching characters: for each gap
 * between consecutive edit operations a MatchingBlock (start in sentence1,
 * start in sentence2, length) is emitted. A terminating zero-length block at
 * (len1, len2) is always appended, mirroring difflib's get_matching_blocks.
 */
std::vector<levenshtein::MatchingBlock> levenshtein::matching_blocks(std::string_view sentence1, std::string_view sentence2) {
  auto edit_ops = editops(sentence1, sentence2);
  size_t first_start = 0;
  size_t second_start = 0;
  std::vector<MatchingBlock> mblocks;

  for (const auto &op : edit_ops) {
    // NOTE(review): editops() skips keep steps, so this branch looks
    // unreachable — kept as a defensive guard.
    if (op.op_type == EditType::EditKeep) {
      continue;
    }

    // characters between the previous op and this one matched -> emit a block
    if (first_start < op.first_start || second_start < op.second_start) {
      mblocks.emplace_back(first_start, second_start, op.first_start - first_start);
      first_start = op.first_start;
      second_start = op.second_start;
    }

    // advance past the edited character(s)
    switch (op.op_type) {
    case EditType::EditReplace:
      first_start += 1;
      second_start += 1;
      break;
    case EditType::EditDelete:
      first_start += 1;
      break;
    case EditType::EditInsert:
      second_start += 1;
      break;
    case EditType::EditKeep:
      break;
    }
  }

  // sentinel block marking the end of both sequences
  mblocks.emplace_back(sentence1.length(), sentence2.length(), 0);
  return mblocks;
}
|
||||
|
||||
|
||||
/**
 * Advances one row of the Levenshtein DP against `words` (a sentence split
 * into words, virtually re-joined with `delimiter`) for a single character
 * `letter_cmp` of the other string.
 *
 * `cache` holds the previous DP row and is updated in place;
 * `current_cache` is the cell diagonally up-left of the first cell.
 * NOTE(review): assumes `words` is non-empty — *word_iter is dereferenced
 * unconditionally; callers guard via their length checks. TODO confirm.
 */
void levenshtein_word_cmp(const char &letter_cmp, const std::vector<std::string_view> &words,
                          std::vector<size_t> &cache, size_t current_cache, std::string_view delimiter="")
{
  size_t result = current_cache + 1;
  auto cache_iter = cache.begin();
  auto word_iter = words.begin();

  // advances the DP by one cell; mutates result, current_cache and cache_iter
  auto charCmp = [&] (const char &char2) {
    if (letter_cmp == char2) { result = current_cache; }  // match: take diagonal
    else { ++result; }                                    // substitution cost

    current_cache = *cache_iter;
    if (result > current_cache + 1) {                     // deletion is cheaper
      result = current_cache + 1;
    }
    *cache_iter = result;
    ++cache_iter;
  };

  // no delimiter should be added in front of the first word
  for (const auto &letter : *word_iter) {
    charCmp(letter);
  }
  ++word_iter;

  for (; word_iter != words.end(); ++word_iter) {
    // between every word there should be a delimiter
    for (const auto &letter : delimiter) {
      charCmp(letter);
    }
    // check following word
    for (const auto &letter : *word_iter) {
      charCmp(letter);
    }
  }
}
|
||||
|
||||
|
||||
/**
 * Weighted Levenshtein distance between two word lists, treating each list
 * as the words joined by `delimiter`. Insert/delete cost 1; replace costs 2
 * (a replace never wins over an insert+delete pair in this recurrence).
 * Space: one DP row of the shorter sentence.
 */
size_t levenshtein::weighted_distance(std::vector<std::string_view> sentence1, std::vector<std::string_view> sentence2, std::string_view delimiter) {
  remove_common_affix(sentence1, sentence2);
  size_t sentence1_len = recursiveIterableSize(sentence1, delimiter.length());
  size_t sentence2_len = recursiveIterableSize(sentence2, delimiter.length());

  // keep the shorter sentence in sentence2 so the DP row is minimal
  if (sentence2_len > sentence1_len) {
    std::swap(sentence1, sentence2);
    std::swap(sentence1_len, sentence2_len);
  }

  // empty shorter side: distance is just the length of the other side
  // (this also guards the *word_iter dereference below)
  if (!sentence2_len) {
    return sentence1_len;
  }

  std::vector<size_t> cache(sentence2_len);
  std::iota(cache.begin(), cache.end(), 1);

  size_t range1_pos = 0;
  auto word_iter = sentence1.begin();

  // no delimiter in front of first word
  for (const auto &letter : *word_iter) {
    levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
    ++range1_pos;
  }

  ++word_iter;
  for (; word_iter != sentence1.end(); ++word_iter) {
    // delimiter between words
    for (const auto &letter : delimiter) {
      levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
      ++range1_pos;
    }

    for (const auto &letter : *word_iter) {
      levenshtein_word_cmp(letter, sentence2, cache, range1_pos, delimiter);
      ++range1_pos;
    }
  }

  // final cell of the last DP row is the distance
  return cache.back();
}
|
||||
|
||||
|
||||
/**
 * Same single-character DP row update as levenshtein_word_cmp, but
 * additionally returns the minimum cell value observed while updating the
 * row. The caller uses that minimum as a lower bound on the final distance
 * to abort early once it exceeds max_distance.
 * NOTE(review): like levenshtein_word_cmp, assumes `words` is non-empty.
 */
size_t levenshtein_word_cmp_limited(const char &letter_cmp,
                                    const std::vector<std::string_view> &words,
                                    std::vector<size_t> &cache, size_t current_cache,
                                    std::string_view delimiter="")
{
  size_t result = current_cache + 1;
  auto cache_iter = cache.begin();
  auto word_iter = words.begin();
  auto min_distance = std::numeric_limits<size_t>::max();

  // advances the DP by one cell; mutates result, current_cache, cache_iter,
  // and folds the row minimum into min_distance
  auto charCmp = [&] (const char &char2) {
    if (letter_cmp == char2) { result = current_cache; }  // match: take diagonal
    else { ++result; }                                    // substitution cost

    current_cache = *cache_iter;
    if (result > current_cache + 1) {                     // deletion is cheaper
      result = current_cache + 1;
    }

    if (current_cache < min_distance) {
      min_distance = current_cache;
    }
    *cache_iter = result;
    ++cache_iter;
  };

  // no delimiter should be added in front of the first word
  for (const auto &letter : *word_iter) {
    charCmp(letter);
  }
  ++word_iter;

  for (; word_iter != words.end(); ++word_iter) {
    // between every word there should be a delimiter
    for (const auto &letter : delimiter) {
      charCmp(letter);
    }
    // check following word
    for (const auto &letter : *word_iter) {
      charCmp(letter);
    }
  }
  return min_distance;
}
|
||||
|
||||
|
||||
/**
 * Weighted Levenshtein distance between two word lists with an early-exit
 * bound: once the row minimum exceeds max_distance the final distance cannot
 * get smaller, so size_t's max value is returned as a sentinel.
 * Otherwise identical to the unbounded vector overload above.
 */
size_t levenshtein::weighted_distance(std::vector<std::string_view> sentence1, std::vector<std::string_view> sentence2, size_t max_distance, std::string_view delimiter) {
  remove_common_affix(sentence1, sentence2);
  size_t sentence1_len = recursiveIterableSize(sentence1, delimiter.length());
  size_t sentence2_len = recursiveIterableSize(sentence2, delimiter.length());

  // keep the shorter sentence in sentence2 so the DP row is minimal
  if (sentence2_len > sentence1_len) {
    std::swap(sentence1, sentence2);
    std::swap(sentence1_len, sentence2_len);
  }

  // empty shorter side: distance is just the length of the other side
  if (!sentence2_len) {
    return sentence1_len;
  }

  std::vector<size_t> cache(sentence2_len);
  std::iota(cache.begin(), cache.end(), 1);

  size_t range1_pos = 0;
  auto word_iter = sentence1.begin();

  // no delimiter in front of first word
  for (const auto &letter : *word_iter) {
    auto min_distance = levenshtein_word_cmp_limited(letter, sentence2, cache, range1_pos, delimiter);
    if (min_distance > max_distance) {
      // bound exceeded -> sentinel
      return std::numeric_limits<size_t>::max();
    }
    ++range1_pos;
  }

  ++word_iter;
  for (; word_iter != sentence1.end(); ++word_iter) {
    // delimiter between words
    for (const auto &letter : delimiter) {
      auto min_distance = levenshtein_word_cmp_limited(letter, sentence2, cache, range1_pos, delimiter);
      if (min_distance > max_distance) {
        return std::numeric_limits<size_t>::max();
      }
      ++range1_pos;
    }

    for (const auto &letter : *word_iter) {
      auto min_distance = levenshtein_word_cmp_limited(letter, sentence2, cache, range1_pos, delimiter);
      if (min_distance > max_distance) {
        return std::numeric_limits<size_t>::max();
      }
      ++range1_pos;
    }
  }

  return cache.back();
}
|
||||
|
||||
|
||||
/**
 * Weighted Levenshtein distance between two strings (insert/delete cost 1,
 * replace effectively cost 2 under this recurrence). Strips the common
 * prefix/suffix first and keeps only a single DP row of the shorter string.
 * The delimiter parameter exists for signature symmetry with the word-list
 * overload and is not used here.
 */
size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, std::string_view delimiter) {
  remove_common_affix(sentence1, sentence2);

  // keep the shorter string in sentence2 so the DP row is minimal
  if (sentence2.length() > sentence1.length()) {
    std::swap(sentence1, sentence2);
  }
  if (sentence2.empty()) {
    return sentence1.length();
  }

  // row[j] = distance between the processed prefix of sentence1 and
  // sentence2[0..j]
  std::vector<size_t> row(sentence2.length());
  std::iota(row.begin(), row.end(), 1);

  for (size_t i = 0; i < sentence1.length(); ++i) {
    size_t diagonal = i;   // cell up-left of the current one
    size_t best = i + 1;   // running value of the current cell
    for (size_t j = 0; j < sentence2.length(); ++j) {
      size_t above = row[j];
      // match takes the diagonal for free, otherwise substitution
      best = (sentence1[i] == sentence2[j]) ? diagonal : best + 1;
      // deletion from sentence1 may still be cheaper
      if (best > above + 1) {
        best = above + 1;
      }
      diagonal = above;
      row[j] = best;
    }
  }

  return row.back();
}
|
||||
|
||||
|
||||
/**
 * Weighted Levenshtein distance between two strings with an early-exit bound:
 * after each row, the minimum value seen in that row is a lower bound on the
 * final distance; if it exceeds max_distance the maximal size_t value is
 * returned as a sentinel. Otherwise identical to the unbounded overload.
 */
size_t levenshtein::weighted_distance(std::string_view sentence1, std::string_view sentence2, size_t max_distance, std::string_view delimiter) {
  remove_common_affix(sentence1, sentence2);

  // keep the shorter string in sentence2 so the DP row is minimal
  if (sentence2.size() > sentence1.size()) std::swap(sentence1, sentence2);

  if (sentence2.empty()) {
    return sentence1.length();
  }

  std::vector<size_t> cache(sentence2.length());
  std::iota(cache.begin(), cache.end(), 1);

  size_t sentence1_pos = 0;
  for (const auto &char1 : sentence1) {
    auto cache_iter = cache.begin();
    size_t current_cache = sentence1_pos;   // diagonal cell
    size_t result = sentence1_pos+1;        // running value of current cell
    auto min_distance = std::numeric_limits<size_t>::max();
    for (const auto &char2 : sentence2) {
      if (char1 == char2) {
        result = current_cache;             // match: take diagonal for free
      } else {
        ++result;                           // substitution cost
      }
      current_cache = *cache_iter;
      if (result > current_cache + 1) {     // deletion is cheaper
        result = current_cache + 1;
      }

      // track the row minimum as a lower bound on the final distance
      if (current_cache < min_distance) {
        min_distance = current_cache;
      }
      *cache_iter = result;
      ++cache_iter;
    }
    if (min_distance > max_distance) {
      // bound exceeded -> sentinel
      return std::numeric_limits<size_t>::max();
    }
    ++sentence1_pos;
  }
  return cache.back();
}
|
|
@ -0,0 +1,119 @@
|
|||
#pragma once
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include "utils.hpp"
|
||||
|
||||
|
||||
namespace levenshtein {
|
||||
  // Kind of a single edit operation.
  enum EditType {
    EditKeep,
    EditReplace,
    EditInsert,
    EditDelete,
  };

  // One edit step: apply op_type at position first_start in the first
  // sequence / second_start in the second sequence.
  struct EditOp {
    EditType op_type;
    size_t first_start;
    size_t second_start;
    EditOp(EditType op_type, size_t first_start, size_t second_start)
      : op_type(op_type), first_start(first_start), second_start(second_start) {}
  };

  // Full Levenshtein DP matrix as one flat vector, indexed as
  // [column * matrix_rows + row]. prefix_len is the length of the common
  // prefix stripped before building the matrix.
  struct Matrix {
    size_t prefix_len;
    std::vector<size_t> matrix;
    size_t matrix_columns;
    size_t matrix_rows;
  };

  // Builds the full Levenshtein matrix for the two sentences.
  Matrix matrix(std::string_view sentence1, std::string_view sentence2);

  // Backtracks the matrix into the list of edit operations (keeps omitted).
  std::vector<EditOp> editops(std::string_view sentence1, std::string_view sentence2);

  // Run of matching characters: start positions in both sequences plus length.
  struct MatchingBlock {
    size_t first_start;
    size_t second_start;
    size_t len;
    MatchingBlock(size_t first_start, size_t second_start, size_t len)
      : first_start(first_start), second_start(second_start), len(len) {}
  };

  // Converts the editops into matching runs, terminated by a zero-length
  // sentinel block at (len1, len2).
  std::vector<MatchingBlock> matching_blocks(std::string_view sentence1, std::string_view sentence2);


  /**
   * Calculates the minimum number of insertions, deletions, and substitutions
   * required to change one sequence into the other according to Levenshtein.
   * Opposed to the normal distance function which has a cost of 1 for all edit operations,
   * it uses the following costs for edit operations:
   *
   * edit operation | cost
   * :------------- | :---
   * Insert         | 1
   * Remove         | 1
   * Replace        | 2
   */
  size_t weighted_distance(std::string_view sentence1, std::string_view sentence2,
                           std::string_view delimiter="");
  // Word-list variant: the words are treated as if joined with `delimiter`.
  size_t weighted_distance(std::vector<std::string_view> sentence1, std::vector<std::string_view> sentence2,
                           std::string_view delimiter="");


  /**
   * These functions allow providing a max_distance parameter that can be used to exit early when the
   * calculated levenshtein distance is at least as big as max_distance and will return the maximal
   * possible value for size_t.
   * This range check makes the levenshtein calculation about 20% slower, so it should be only used
   * when it can usually exit early.
   */
  size_t weighted_distance(std::string_view sentence1, std::string_view sentence2,
                           size_t max_distance, std::string_view delimiter="");
  size_t weighted_distance(std::vector<std::string_view> sentence1, std::vector<std::string_view> sentence2,
                           size_t max_distance, std::string_view delimiter="");
|
||||
|
||||
/**
|
||||
* Calculates a normalized score of the weighted Levenshtein algorithm between 0.0 and
|
||||
* 1.0 (inclusive), where 1.0 means the sequences are the same.
|
||||
*/
|
||||
template<typename Sentence1, typename Sentence2>
|
||||
float normalized_weighted_distance(const Sentence1 &sentence1, const Sentence2 &sentence2,
|
||||
float min_ratio=0.0, std::string_view delimiter="")
|
||||
{
|
||||
if (sentence1.empty() && sentence2.empty()) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
if (sentence1.empty() || sentence1.empty()) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
size_t sentence1_len = recursiveIterableSize(sentence1, delimiter.size());
|
||||
size_t sentence2_len = recursiveIterableSize(sentence2, delimiter.size());
|
||||
size_t lensum = sentence1_len + sentence2_len;
|
||||
|
||||
// constant time calculation to find a string ratio based on the string length
|
||||
// so it can exit early without running any levenshtein calculations
|
||||
size_t min_distance = (sentence1_len > sentence2_len)
|
||||
? sentence1_len - sentence2_len
|
||||
: sentence2_len - sentence1_len;
|
||||
|
||||
float len_ratio = 1.0 - (float)min_distance / (float)lensum;
|
||||
if (len_ratio < min_ratio) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// TODO: this needs more thoughts when to start using score cutoff, since it performs slower when it can not exit early
|
||||
// -> just because it has a smaller ratio does not mean levenshtein can always exit early
|
||||
// has to be tested with some more real examples
|
||||
size_t distance = (min_ratio > 0.7)
|
||||
? weighted_distance(sentence1, sentence2, std::ceil((float)lensum - min_ratio * lensum), delimiter)
|
||||
: weighted_distance(sentence1, sentence2, delimiter);
|
||||
|
||||
if (distance == std::numeric_limits<size_t>::max()) {
|
||||
return 0.0;
|
||||
}
|
||||
return 1.0 - (float)distance / (float)lensum;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
#include <benchmark/benchmark.h>
|
||||
#include <iostream>
|
||||
#include "levenshtein.hpp"
|
||||
#include "fuzz.hpp"
|
||||
#include "process.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Define another benchmark
|
||||
// Benchmarks a single ratio computation on two short, dissimilar sentences.
static void BM_LevMatrix(benchmark::State &state) {
  std::string a = "please add bananas to my shopping list";
  std::string b = "whats the weather like in Paris";
  for (auto _ : state) {
    // Fixed: ratio lives in namespace fuzz (see fuzz.hpp); the unqualified
    // call did not compile.
    benchmark::DoNotOptimize(fuzz::ratio(a, b, 0.0));
  }
}
|
||||
|
||||
static void BM_LevMatrix2(benchmark::State &state) {
|
||||
std::string a = "please add bananas to my shopping list peter";
|
||||
std::vector<std::string> b(1000, "can you add bananas to my shopping list please");
|
||||
for (auto _ : state) {
|
||||
//28
|
||||
benchmark::DoNotOptimize(extract_one(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK(BM_LevMatrix2);
|
||||
|
||||
//BENCHMARK(BM_LevMatrix);
|
||||
BENCHMARK_MAIN();
|
|
@ -0,0 +1,37 @@
|
|||
#include "process.hpp"
|
||||
#include "fuzz.hpp"
|
||||
|
||||
std::vector<std::pair<float, std::string>>
|
||||
process::extract(std::string query, std::vector<std::string> choices, uint8_t score_cutoff) {
|
||||
std::vector<std::pair<float, std::string>> results;
|
||||
results.reserve(choices.size());
|
||||
|
||||
for (const auto &choice : choices) {
|
||||
float score = fuzz::WRatio(query, choice, score_cutoff);
|
||||
if (score > score_cutoff) {
|
||||
results.emplace_back(std::make_pair(score, choice));
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
std::optional<std::pair<float, std::string>>
|
||||
process::extract_one(std::string query, std::vector<std::string> choices, uint8_t score_cutoff) {
|
||||
float max_score = 0;
|
||||
std::string result_choice;
|
||||
for (const auto &choice : choices) {
|
||||
float score = fuzz::WRatio(query, choice, score_cutoff);
|
||||
if (score > score_cutoff) {
|
||||
score_cutoff = score;
|
||||
max_score = score;
|
||||
result_choice = choice;
|
||||
}
|
||||
}
|
||||
|
||||
if (!max_score) {
|
||||
return {};
|
||||
}
|
||||
return std::make_pair(max_score, result_choice);
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
#include <cstdint>   // fixed: uint8_t was used below without this include
#include <utility>   // fixed: std::pair was used without this include
#include <optional>
#include <vector>
#include <string>

namespace process {
  /**
   * Scores every choice against the query with fuzz::WRatio and returns all
   * (score, choice) pairs scoring strictly above score_cutoff, in input order.
   */
  std::vector<std::pair<float, std::string>>
  extract(std::string query, std::vector<std::string> choices, uint8_t score_cutoff = 0);

  /**
   * Returns the best-scoring (score, choice) pair above score_cutoff,
   * or an empty optional when nothing qualifies.
   */
  std::optional<std::pair<float, std::string>>
  extract_one(std::string query, std::vector<std::string> choices, uint8_t score_cutoff = 0);
}
|
||||
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
#pragma once
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
/**
 * Splits `str` into non-empty tokens separated by any character in `delims`.
 * The returned string_views point into `str`'s storage — the caller must keep
 * the underlying buffer alive.
 */
inline std::vector<std::string_view> splitSV(std::string_view str,
                                             std::string_view delims = " ") {
  std::vector<std::string_view> output;
  // heuristic reservation: average word of 6 chars + 1 separator
  output.reserve(str.size() / 7);

  size_t start = 0;
  while (start < str.size()) {
    size_t end = str.find_first_of(delims, start);
    if (end == std::string_view::npos) {
      end = str.size();
    }
    // consecutive delimiters produce no empty tokens
    if (end != start) {
      output.emplace_back(str.substr(start, end - start));
    }
    start = end + 1;
  }

  return output;
}
|
||||
|
||||
// Result of set_decomposition: tokens common to both inputs, tokens only in
// the first input (a \ b), and tokens only in the second input (b \ a).
struct Decomposition {
  std::vector<std::string_view> intersection;
  std::vector<std::string_view> difference_ab;
  std::vector<std::string_view> difference_ba;
};
|
||||
|
||||
|
||||
/**
 * Splits two token lists into intersection, a-only and b-only parts.
 * NOTE(review): std::unique only collapses *adjacent* duplicates, so the
 * inputs are presumably expected to arrive sorted or duplicate-free —
 * confirm against the callers.
 */
inline Decomposition set_decomposition(std::vector<std::string_view> a, std::vector<std::string_view> b) {
  a.erase(std::unique(a.begin(), a.end()), a.end());
  b.erase(std::unique(b.begin(), b.end()), b.end());

  std::vector<std::string_view> intersection;
  std::vector<std::string_view> difference_ab;
  for (const auto &token : a) {
    auto match = std::find(b.begin(), b.end(), token);
    if (match == b.end()) {
      difference_ab.emplace_back(token);
    } else {
      // matched tokens are consumed from b so duplicates pair up one-to-one
      b.erase(match);
      intersection.emplace_back(token);
    }
  }

  // whatever is left in b was never matched -> difference b \ a
  return Decomposition{intersection, difference_ab, b};
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds the longest common prefix between two ranges
|
||||
*/
|
||||
/**
 * Finds the length of the longest common prefix of two ranges.
 * Returns the iterator difference type of the first range.
 */
template <typename InputIterator1, typename InputIterator2>
inline auto common_prefix_length(InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2)
{
  // the four-iterator mismatch overload stops at the shorter range's end
  auto mismatch_pair = std::mismatch(first1, last1, first2, last2);
  return std::distance(first1, mismatch_pair.first);
}
|
||||
|
||||
/**
|
||||
* Removes common prefix of two string views
|
||||
*/
|
||||
inline size_t remove_common_prefix(std::string_view& a, std::string_view& b) {
|
||||
auto prefix = common_prefix_length(a.begin(), a.end(), b.begin(), b.end());
|
||||
a.remove_prefix(prefix);
|
||||
b.remove_prefix(prefix);
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes common suffix of two string views
|
||||
*/
|
||||
inline size_t remove_common_suffix(std::string_view& a, std::string_view& b) {
|
||||
auto suffix = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
|
||||
a.remove_suffix(suffix);
|
||||
b.remove_suffix(suffix);
|
||||
return suffix;
|
||||
}
|
||||
|
||||
// Lengths of the common prefix and suffix stripped by remove_common_affix.
struct Affix {
  size_t prefix_len;
  size_t suffix_len;
};
|
||||
|
||||
/**
|
||||
* Removes common affix of two string views
|
||||
*/
|
||||
/**
 * Strips the common prefix and then the common suffix from both views
 * (in place) and returns the removed lengths.
 */
inline Affix remove_common_affix(std::string_view& a, std::string_view& b) {
  // prefix must be removed first so the suffix search only sees the remainder
  size_t prefix_len = remove_common_prefix(a, b);
  size_t suffix_len = remove_common_suffix(a, b);
  return Affix{prefix_len, suffix_len};
}
|
||||
|
||||
|
||||
/**
 * Removes the longest common leading and trailing runs of *whole elements*
 * from two equally-typed containers, in place.
 */
template<typename T>
inline void vec_remove_common_affix(T &a, T &b) {
  // drop the shared leading elements
  auto first_mismatch = std::mismatch(a.begin(), a.end(), b.begin(), b.end());
  a.erase(a.begin(), first_mismatch.first);
  b.erase(b.begin(), first_mismatch.second);

  // drop the shared trailing elements (prefix search over reversed ranges)
  auto suffix_len = common_prefix_length(a.rbegin(), a.rend(), b.rbegin(), b.rend());
  a.erase(a.end() - suffix_len, a.end());
  b.erase(b.end() - suffix_len, b.end());
}
|
||||
|
||||
/**
 * Convenience wrapper over vec_remove_common_affix for vectors.
 * Fixed: previously called the non-existent `iterable_remove_common_affix`,
 * which failed to compile as soon as this template was instantiated.
 */
template<typename T>
inline void vec_common_affix(std::vector<T> &a, std::vector<T> &b) {
  vec_remove_common_affix(a, b);
}
|
||||
|
||||
/**
 * Removes the common affix from two token vectors: first whole shared
 * elements at both ends, then the character-level prefix of the first
 * remaining elements and the character-level suffix of the last ones.
 */
template<typename T>
inline void remove_common_affix(std::vector<T> &a, std::vector<T> &b)
{
  vec_remove_common_affix(a, b);
  // after whole-element trimming, the boundary elements may still share
  // characters — trim those too
  if (!a.empty() && !b.empty()) {
    remove_common_prefix(a.front(), b.front());
    remove_common_suffix(a.back(), b.back());
  }
}
|
||||
|
||||
|
||||
// Base case: size of a flat iterable (e.g. string_view); delimiter_length is
// unused at this level but kept for a uniform call signature.
template<typename T>
inline size_t recursiveIterableSize(const T &x, size_t delimiter_length=0){
  return x.size();
}

// Vector case: total size of the elements as if joined by a delimiter of the
// given length, i.e. sum of element sizes plus (count - 1) delimiters.
template<typename T>
inline size_t recursiveIterableSize(const std::vector<T> &x, size_t delimiter_length=0){
  if (x.empty()) {
    return 0;
  }
  // one delimiter between each pair of neighbouring elements
  size_t result = (x.size() - 1) * delimiter_length;
  for (const auto &y: x) {
    result += recursiveIterableSize(y, delimiter_length);
  }
  return result;
}
|
||||
|
||||
|
||||
/**
 * Joins a list of words into a single space-separated string.
 * Returns an empty string for an empty list.
 */
inline std::string sentence_join(const std::vector<std::string_view> &sentence) {
  std::string joined;
  bool first = true;
  for (const auto &word : sentence) {
    if (!first) {
      joined += ' ';
    }
    joined.append(word.data(), word.size());
    first = false;
  }
  return joined;
}
|
|
@ -0,0 +1,23 @@
|
|||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,144 @@
|
|||
#include "catch2/catch.hpp"
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "../src/levenshtein.hpp"
|
||||
|
||||
// Covers the string_view overloads: distances, normalized ratios, the raw DP
// matrix and the extracted editops.
TEST_CASE( "levenshtein works with string_views", "[string_view]" ) {
    std::string_view test = "aaaa";
    std::string_view no_suffix = "aaa";
    std::string_view no_suffix2 = "aaab";
    std::string_view swapped1 = "abaa";
    std::string_view swapped2 = "baaa";
    std::string_view replace_all = "bbbb";

    SECTION( "weighted levenshtein calculates correct distances" ) {
        REQUIRE( levenshtein::weighted_distance(test, test) == 0 );
        REQUIRE( levenshtein::weighted_distance(test, no_suffix) == 1 );
        REQUIRE( levenshtein::weighted_distance(swapped1, swapped2) == 2 );
        REQUIRE( levenshtein::weighted_distance(test, no_suffix2) == 2 );
        REQUIRE( levenshtein::weighted_distance(test, replace_all) == 8 );
        // max_distance exceeded -> sentinel value
        REQUIRE( levenshtein::weighted_distance(test, replace_all, 3) == std::numeric_limits<size_t>::max() );
    }

    SECTION( "weighted levenshtein calculates correct ratios" ) {
        REQUIRE( levenshtein::normalized_weighted_distance(test, test) == 1.0 );
        REQUIRE( levenshtein::normalized_weighted_distance(test, no_suffix) == Approx(0.857).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(swapped1, swapped2) == Approx(0.75).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(test, no_suffix2) == Approx(0.75).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(test, replace_all) == 0.0 );
    }


    SECTION( "levenshtein calculates correct levenshtein matrix" ) {
        auto matrix_cmp = [](levenshtein::Matrix a, levenshtein::Matrix b) {
            REQUIRE( a.prefix_len == b.prefix_len);
            REQUIRE( a.matrix == b.matrix);
            REQUIRE( a.matrix_columns == b.matrix_columns);
            REQUIRE( a.matrix_rows == b.matrix_rows);
        };


        matrix_cmp(
            levenshtein::matrix(test, test),
            {4, std::vector<size_t>{0}, 1, 1});

        matrix_cmp(
            levenshtein::matrix(test, no_suffix),
            {3, std::vector<size_t>{0, 1}, 2, 1});

        matrix_cmp(
            levenshtein::matrix(swapped1, swapped2),
            {0, std::vector<size_t>{ 0, 1, 2, 1, 1, 1, 2, 1, 2 }, 3, 3});

        matrix_cmp(
            levenshtein::matrix(test, no_suffix2),
            {3, std::vector<size_t>{0, 1, 1, 1}, 2, 2});

        matrix_cmp(
            levenshtein::matrix(test, replace_all),
            {0, std::vector<size_t>{0, 1, 2, 3, 4, 1, 1, 2, 3, 4, 2, 2, 2, 3, 4, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4}, 5, 5});
    }

    SECTION( "levenshtein calculates correct levenshtein editops" ) {
        auto edit_op_cmp = [](std::vector<levenshtein::EditOp> res, std::vector<levenshtein::EditOp> check) {
            auto res_iter = res.begin();
            auto check_iter = check.begin();
            while (res_iter != res.end() && check_iter != check.end()) {
                REQUIRE(res_iter->op_type == check_iter->op_type);
                REQUIRE(res_iter->first_start == check_iter->first_start);
                // Fixed: this assertion used to re-check first_start, leaving
                // second_start completely unverified.
                REQUIRE(res_iter->second_start == check_iter->second_start);
                ++res_iter;
                ++check_iter;
            }
            REQUIRE(res_iter == res.end());
            REQUIRE(check_iter == check.end());
        };

        auto ed_replace = [](size_t pos1, size_t pos2) {
            return levenshtein::EditOp{levenshtein::EditType::EditReplace, pos1, pos2};
        };

        auto ed_delete = [](size_t pos1, size_t pos2) {
            return levenshtein::EditOp{levenshtein::EditType::EditDelete, pos1, pos2};
        };

        auto ed_insert = [](size_t pos1, size_t pos2) {
            return levenshtein::EditOp{levenshtein::EditType::EditInsert, pos1, pos2};
        };


        edit_op_cmp(levenshtein::editops(test, test), {});

        edit_op_cmp(
            levenshtein::editops(test, no_suffix),
            { ed_delete(3, 3) });

        edit_op_cmp(
            levenshtein::editops(no_suffix, test),
            { ed_insert(3, 3) });

        edit_op_cmp(
            levenshtein::editops(swapped1, swapped2),
            { ed_replace(0, 0), ed_replace(1, 1) });

        edit_op_cmp(
            levenshtein::editops(test, no_suffix2),
            { ed_replace(3, 3) });

        edit_op_cmp(
            levenshtein::editops(test, replace_all),
            { ed_replace(0, 0), ed_replace(1, 1), ed_replace(2, 2), ed_replace(3, 3) });
    }
}
|
||||
|
||||
// Covers the word-list overloads of weighted_distance and
// normalized_weighted_distance (sentences pre-split into words).
TEST_CASE( "levenshtein works with vectors of string_views", "[vector<string_view>]" ) {
    std::vector<std::string_view> test {"test", "test"};
    std::vector<std::string_view> combined {"testtest"};
    std::vector<std::string_view> insert {"tes", "test"};
    std::vector<std::string_view> replace {"test", "tess"};
    std::vector<std::string_view> replace_all {"xxxx", "xxxx"};
    std::vector<std::string_view> insert_delete {"etst", "test"};

    SECTION( "weighted levenshtein calculates correct distances") {
        REQUIRE( levenshtein::weighted_distance(test, test) == 0 );
        REQUIRE( levenshtein::weighted_distance(test, insert) == 1 );
        REQUIRE( levenshtein::weighted_distance(test, insert_delete) == 2 );
        REQUIRE( levenshtein::weighted_distance(test, replace) == 2 );
        REQUIRE( levenshtein::weighted_distance(test, replace_all) == 16 );
        // max_distance exceeded -> sentinel value
        REQUIRE( levenshtein::weighted_distance(test, replace_all, 7) == std::numeric_limits<size_t>::max() );
        // with an empty delimiter ["test","test"] equals ["testtest"]
        REQUIRE( levenshtein::weighted_distance(test, combined) == 0 );
        // NOTE(review): 0.0 here binds to the size_t max_distance parameter
        // (becoming 0) — the test relies on the early exit not triggering;
        // consider passing an explicit size_t. TODO confirm intent.
        REQUIRE( levenshtein::weighted_distance(test, combined, 0.0, " ") == 1 );
    }

    SECTION( "weighted levenshtein calculates correct ratio") {
        REQUIRE( levenshtein::normalized_weighted_distance(test, test) == 1.0 );
        REQUIRE( levenshtein::normalized_weighted_distance(test, insert) == Approx(0.93).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(test, insert_delete) == Approx(0.875).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(test, replace) == Approx(0.875).epsilon(0.01) );
        REQUIRE( levenshtein::normalized_weighted_distance(test, replace_all) == 0.0 );
        REQUIRE( levenshtein::normalized_weighted_distance(test, combined) == 1.0 );
        REQUIRE( levenshtein::normalized_weighted_distance(test, combined, 0.0, " ") == Approx(0.94).epsilon(0.01) );
    }
}
|
|
@ -0,0 +1,3 @@
|
|||
// test main file so catch2 does not have to be recompiled
|
||||
#define CATCH_CONFIG_MAIN
|
||||
#include "catch2/catch.hpp"
|
192
cpp/utils.hpp
192
cpp/utils.hpp
|
@ -1,192 +0,0 @@
|
|||
#pragma once
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
// Lengths of the common prefix and common suffix stripped from a pair
// of strings by MatchingPair::remove_affix().
struct Affix {
  size_t prefix_len;
  size_t suffix_len;
};

// Owns a pair of strings and can strip their common affixes in place.
class MatchingPair {
public:
  std::string a;
  std::string b;

  MatchingPair(std::string a, std::string b) : a(a), b(b) {}

  // Removes the longest common suffix and then the longest common prefix
  // from both `a` and `b`, and returns their lengths.
  Affix remove_affix() {
    size_t a_len = a.length();
    size_t b_len = b.length();

    // Walk back from the ends while the trailing characters match.
    while (a_len > 0 && b_len > 0 && a[a_len - 1] == b[b_len - 1]) {
      --a_len;
      --b_len;
    }
    // Record the suffix length NOW, before prefix stripping shrinks a_len
    // further. (The original computed this after the prefix loop, which
    // wrongly counted the prefix length into suffix_len.)
    size_t suffix_len = a.length() - a_len;

    // Walk forward from the start while the leading characters match.
    size_t prefix_len = 0;
    while (a_len > 0 && b_len > 0 && a[prefix_len] == b[prefix_len]) {
      --a_len;
      --b_len;
      ++prefix_len;
    }

    // Keep only the differing middle of each string.
    a = a.substr(prefix_len, a_len);
    b = b.substr(prefix_len, b_len);
    return Affix{prefix_len, suffix_len};
  }
};
|
||||
|
||||
// One-pass decomposition of two sorted ranges: result1 receives the
// elements present only in [first1,last1), result2 those present only in
// [first2,last2), and result3 the elements common to both. Modelled on
// the reference implementations of std::set_difference and
// std::set_intersection:
// http://www.cplusplus.com/reference/algorithm/set_difference/
// http://www.cplusplus.com/reference/algorithm/set_intersection/
template <class InputIterator1, class InputIterator2, class OutputIterator1,
          class OutputIterator2, class OutputIterator3>
OutputIterator3 set_decomposition(InputIterator1 first1, InputIterator1 last1,
                                  InputIterator2 first2, InputIterator2 last2,
                                  OutputIterator1 result1,
                                  OutputIterator2 result2,
                                  OutputIterator3 result3) {
  for (;;) {
    if (first1 == last1 || first2 == last2) {
      break;
    }
    if (*first1 < *first2) {
      // unique to the first range
      *result1++ = *first1++;
    } else if (*first2 < *first1) {
      // unique to the second range
      *result2++ = *first2++;
    } else {
      // equal: record once as an intersection element, advance both
      *result3++ = *first1++;
      ++first2;
    }
  }
  // Whatever remains of either input cannot match anything: drain it
  // into the corresponding difference output.
  std::copy(first1, last1, result1);
  std::copy(first2, last2, result2);
  return result3;
}
|
||||
|
||||
// Splits `str` into its non-empty tokens separated by any character of
// `delims`. The returned views alias `str`'s storage, so that storage
// must outlive the result. Empty tokens (adjacent, leading or trailing
// delimiters) are skipped.
std::vector<std::string_view> splitSV(std::string_view str,
                                      std::string_view delims = " ") {
  std::vector<std::string_view> output;
  // assume a word length of 6 + 1 whitespace
  output.reserve(str.size() / 7);

  const char *first = str.data();
  const char *last = first + str.size();
  while (first != last) {
    const char *second =
        std::find_first_of(first, last, std::cbegin(delims), std::cend(delims));
    if (first != second) {
      output.emplace_back(first, second - first);
    }
    if (second == last) {
      // No trailing delimiter left. The original advanced with
      // `first = second + 1` even here, forming a pointer beyond
      // one-past-the-end (undefined behavior); break instead.
      break;
    }
    first = second + 1;  // skip the delimiter itself
  }

  return output;
}
|
||||
|
||||
// Result of decomposing two word lists: the common words (`sect`), the
// words only in the first list (`ab`), and those only in the second (`ba`).
struct Intersection {
  std::vector<std::string_view> sect;
  std::vector<std::string_view> ab;
  std::vector<std::string_view> ba;
};

// Multiset-style decomposition of two word lists. `b` is taken by value:
// every matched word is erased from this local copy so duplicates pair up
// one-to-one, and the leftovers become the `ba` difference.
Intersection intersection_count_sorted_vec(std::vector<std::string_view> a,
                                           std::vector<std::string_view> b) {
  Intersection result;

  for (const auto &word : a) {
    auto match = std::find(b.begin(), b.end(), word);
    if (match == b.end()) {
      // no counterpart left in b: unique to a
      result.ab.emplace_back(word);
    } else {
      // counterpart found: consume it so it cannot match twice
      b.erase(match);
      result.sect.emplace_back(word);
    }
  }

  // whatever survived in the local copy of b is unique to b
  result.ba = b;
  return result;
}
|
||||
|
||||
// Strips the longest common suffix and then the longest common prefix
// from both views in place. Only the view bounds are adjusted (no
// characters are copied), so the underlying storage must stay alive.
void remove_common_affix(std::string_view &a, std::string_view &b) {
  size_t a_len = a.length();
  size_t b_len = b.length();

  // Shrink from the back while the trailing characters agree.
  while (a_len > 0 && b_len > 0 && a[a_len - 1] == b[b_len - 1]) {
    --a_len;
    --b_len;
  }

  // Shrink from the front while the leading characters agree.
  size_t prefix_len = 0;
  while (a_len > 0 && b_len > 0 && a[prefix_len] == b[prefix_len]) {
    --a_len;
    --b_len;
    ++prefix_len;
  }

  // (The original also computed a `suffix_len` here that was never
  // used; the dead variable has been removed.)
  a = a.substr(prefix_len, a_len);
  b = b.substr(prefix_len, b_len);
}
|
||||
|
||||
// Strips words shared at the front and at the back of two word lists in
// place: fully equal boundary words are removed from the vectors, and the
// first/last partially matching pair has its common characters trimmed.
void remove_common_affix(std::vector<std::string_view> &a,
                         std::vector<std::string_view> &b) {
  // remove common prefix
  // maybe erasing whole prefix at once is faster
  auto a_it = a.begin();
  auto b_it = b.begin();
  while (a_it != a.end() && b_it != b.end()) {
    // count how many leading characters the current word pair shares
    size_t common_len = 0;
    auto a_letter_it = a_it->begin();
    auto b_letter_it = b_it->begin();
    while (a_letter_it != a_it->end() && b_letter_it != b_it->end() &&
           *a_letter_it == *b_letter_it) {
      ++a_letter_it;
      ++b_letter_it;
      ++common_len;
    }
    if (a_letter_it != a_it->end() || b_letter_it != b_it->end()) {
      // words differ after a partial match: trim the shared prefix
      // characters from both and stop — nothing further can be common
      *a_it = a_it->substr(common_len);
      *b_it = b_it->substr(common_len);
      break;
    }
    // words are fully identical: drop them from both lists entirely
    a_it = a.erase(a_it);
    b_it = b.erase(b_it);
  }

  // remove common suffix
  auto a_it_rev = a.rbegin();
  auto b_it_rev = b.rbegin();
  while (a_it_rev != a.rend() && b_it_rev != b.rend()) {
    // count how many trailing characters the current word pair shares
    size_t common_len = 0;
    auto a_letter_it = a_it_rev->rbegin();
    auto b_letter_it = b_it_rev->rbegin();
    while (a_letter_it != a_it_rev->rend() && b_letter_it != b_it_rev->rend() &&
           *a_letter_it == *b_letter_it) {
      ++a_letter_it;
      ++b_letter_it;
      ++common_len;
    }
    if (a_letter_it != a_it_rev->rend() || b_letter_it != b_it_rev->rend()) {
      // words differ after a partial match: trim the shared suffix
      // characters from both and stop
      *a_it_rev = a_it_rev->substr(0, a_it_rev->size() - common_len);
      *b_it_rev = b_it_rev->substr(0, b_it_rev->size() - common_len);
      break;
    }
    // NOTE(review): incrementing the reverse iterator and then calling
    // pop_back() invalidates the iterator's base (it pointed at the
    // element being removed). This appears to work on typical vector
    // implementations but looks like undefined behavior — confirm and
    // consider switching to index-based trimming.
    ++a_it_rev;
    ++b_it_rev;
    a.pop_back();
    b.pop_back();
  }
}
|
||||
|
||||
// Length the words would have if joined with single spaces: the sum of
// all word lengths plus one separator between each adjacent pair.
size_t joinedStringViewLength(const std::vector<std::string_view> &words) {
  if (words.empty()) {
    return 0;
  }
  // start with the separator count (one fewer than the word count)
  size_t total = words.size() - 1;
  for (size_t i = 0; i < words.size(); ++i) {
    total += words[i].length();
  }
  return total;
}
|
|
@ -1,16 +0,0 @@
|
|||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
#include "process.hpp"
|
||||
|
||||
|
||||
// pybind11 entry point defining the `rapidfuzz` Python extension module.
// NOTE(review): `extract_one` is presumably declared in process.hpp, and
// VERSION_INFO supplied by the build system — confirm.
PYBIND11_MODULE(rapidfuzz, m) {
    m.doc() = R"pbdoc(
        rapid string matching library
    )pbdoc";

    // expose the best-match search to Python
    m.def("extract_one", &extract_one, R"pbdoc(
        Find the best match in a list of matches
    )pbdoc");

    m.attr("__version__") = VERSION_INFO;
}
|
|
@ -0,0 +1,35 @@
|
|||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
#include "process.hpp"
|
||||
#include "fuzz.hpp"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
// pybind11 entry point defining the `rapidfuzz` Python extension module,
// organised into `process` (batch matching) and `fuzz` (pairwise ratios)
// submodules. NOTE(review): the bound functions come from process.hpp /
// fuzz.hpp and VERSION_INFO presumably from the build system — confirm.
PYBIND11_MODULE(rapidfuzz, m) {
    m.doc() = R"pbdoc(
        rapid string matching library
    )pbdoc";

    // rapidfuzz.process: match a query against a list of choices
    auto mprocess = m.def_submodule("process");

    mprocess.def("extract", &process::extract,
        py::arg("query"), py::arg("choices"), py::arg("score_cutoff") = 0,
        R"pbdoc(Find all matches with a ratio above score_cutoff)pbdoc");

    mprocess.def("extractOne", &process::extract_one,
        py::arg("query"), py::arg("choices"), py::arg("score_cutoff") = 0,
        R"pbdoc(Find the best match in a list of matches)pbdoc");

    // rapidfuzz.fuzz: pairwise similarity ratios between two strings
    auto mfuzz = m.def_submodule("fuzz");
    mfuzz.def("ratio", &fuzz::ratio,
        py::arg("a"), py::arg("b"), py::arg("score_cutoff") = 0);

    mfuzz.def("QRatio", &fuzz::QRatio,
        py::arg("a"), py::arg("b"), py::arg("score_cutoff") = 0);

    mfuzz.def("WRatio", &fuzz::WRatio,
        py::arg("a"), py::arg("b"), py::arg("score_cutoff") = 0);

    m.attr("__version__") = VERSION_INFO;
}
|
51
setup.py
51
setup.py
|
@ -3,10 +3,16 @@ from setuptools.command.build_ext import build_ext
|
|||
import sys
|
||||
import setuptools
|
||||
|
||||
with open("VERSION", "r") as version_file:
|
||||
from os import path
|
||||
this_dir = path.abspath(path.dirname(__file__))
|
||||
with open(path.join(this_dir, "VERSION"), encoding='utf-8') as version_file:
|
||||
version = version_file.read().strip()
|
||||
|
||||
|
||||
with open(path.join(this_dir, 'README.md'), encoding='utf-8') as f:
|
||||
long_description = f.read()
|
||||
|
||||
|
||||
class get_pybind_include(object):
|
||||
"""Helper class to determine the pybind11 include path
|
||||
|
||||
|
@ -25,15 +31,19 @@ class get_pybind_include(object):
|
|||
ext_modules = [
|
||||
Extension(
|
||||
'rapidfuzz',
|
||||
['python/rapidfuzz.cpp'],
|
||||
[
|
||||
'python/src/rapidfuzz.cpp',
|
||||
'cpp/src/levenshtein.cpp',
|
||||
'cpp/src/fuzz.cpp',
|
||||
'cpp/src/process.cpp'
|
||||
],
|
||||
include_dirs=[
|
||||
# Path to pybind11 headers
|
||||
get_pybind_include(),
|
||||
get_pybind_include(user=True),
|
||||
"cpp"
|
||||
"cpp/src"
|
||||
],
|
||||
extra_compile_args = ["-O3"],
|
||||
language='c++'
|
||||
language='c++',
|
||||
),
|
||||
]
|
||||
|
||||
|
@ -55,11 +65,12 @@ def has_flag(compiler, flagname):
|
|||
|
||||
|
||||
def cpp_flag(compiler):
|
||||
"""Return the -std=c++17 compiler flag.
|
||||
|
||||
The newer version is preferred over c++17 (when it is available).
|
||||
"""
|
||||
if has_flag(compiler, '-std=c++17'): return '-std=c++17'
|
||||
Return the latest compiler flag supported by the compiler
|
||||
(at least c++17)
|
||||
"""
|
||||
if has_flag(compiler, '-std=c++2a'): return '-std=c++2a'
|
||||
elif has_flag(compiler, '-std=c++17'): return '-std=c++17'
|
||||
|
||||
raise RuntimeError('Unsupported compiler -- at least C++17 support '
|
||||
'is needed!')
|
||||
|
@ -68,8 +79,8 @@ def cpp_flag(compiler):
|
|||
class BuildExt(build_ext):
|
||||
"""A custom build extension for adding compiler-specific options."""
|
||||
c_opts = {
|
||||
'msvc': ['/EHsc'],
|
||||
'unix': ['-ffast-math'],
|
||||
'msvc': ['/EHsc', '-O3'],
|
||||
'unix': ['-O3'],
|
||||
}
|
||||
l_opts = {
|
||||
'msvc': [],
|
||||
|
@ -103,11 +114,23 @@ setup(
|
|||
author='Max Bachmann',
|
||||
author_email='contact@maxbachmann.de',
|
||||
url='https://github.com/rhasspy/rapidfuzz',
|
||||
description='rapid string matching library',
|
||||
long_description='',
|
||||
description='rapid fuzzy string matching',
|
||||
long_description=long_description,
|
||||
long_description_content_type='text/markdown',
|
||||
ext_modules=ext_modules,
|
||||
install_requires=['pybind11>=2.4'],
|
||||
setup_requires=['pybind11>=2.4'],
|
||||
cmdclass={'build_ext': BuildExt},
|
||||
package_data={'': ['LICENSE', 'VERSION']},
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
)
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
],
|
||||
python_requires=">=3.5",
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue