2020-08-22 19:07:08 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2020-05-24 07:57:08 +00:00
|
|
|
import unittest
|
Release v1.0.0 (#68)
- all normalized string_metrics can now be used as scorer for process.extract/extractOne
- Implementation of the C++ Wrapper completely refactored to make it easier to add more scorers, processors and string matching algorithms in the future.
- increased test coverage, that already helped to fix some bugs and help to prevent regressions in the future
- improved docstrings of functions
- Added bitparallel implementation of the Levenshtein distance for the weights (1,1,1) and (1,1,2).
- Added specialized implementation of the Levenshtein distance for cases with a small maximum edit distance, that is even faster, than the bitparallel implementation.
- Improved performance of `fuzz.partial_ratio`
-> Since `fuzz.ratio` and `fuzz.partial_ratio` are used in most scorers, this improves the overall performance.
- Improved performance of `process.extract` and `process.extractOne`
- the `rapidfuzz.levenshtein` module is now deprecated and will be removed in v2.0.0
These functions are now placed in `rapidfuzz.string_metric`. `distance`, `normalized_distance`, `weighted_distance` and `weighted_normalized_distance` are combined into `levenshtein` and `normalized_levenshtein`.
- added normalized version of the hamming distance in `string_metric.normalized_hamming`
- process.extract_iter as a generator, that yields the similarity of all elements, that have a similarity >= score_cutoff
- multiple bugs in extractOne when used with a scorer that is not from RapidFuzz
- fixed bug in `token_ratio`
- fixed bug in result normalisation causing zero division
2021-02-12 15:37:44 +00:00
|
|
|
import pytest
|
2020-05-24 07:57:08 +00:00
|
|
|
|
|
|
|
from rapidfuzz import process, fuzz, utils
|
|
|
|
|
|
|
|
class ProcessTest(unittest.TestCase):
    """Tests for ``process.extractOne``, ``process.extract`` and ``process.extract_iter``.

    The results of ``extractOne`` are indexed positionally in these tests:
    element 0 is compared against the expected choice and element 1 against
    the expected score.
    """

    def setUp(self):
        """Create the baseball fixture list used by the ``testGetBestChoice*`` tests."""
        self.baseball_strings = [
            "new york mets vs chicago cubs",
            "chicago cubs vs chicago white sox",
            "philladelphia phillies vs atlanta braves",
            "braves vs mets",
        ]

    def testExtractOneExceptions(self):
        """``extractOne`` must raise ``TypeError`` for each invalid argument set."""
        invalid_calls = (
            (),            # no arguments
            (1,),          # integer query, no choices
            (1, []),       # integer query, empty list of choices
            ('', [1]),     # integer inside a list of choices
            ('', {1: 1}),  # integer value inside a mapping of choices
        )
        for call_args in invalid_calls:
            with self.assertRaises(TypeError):
                process.extractOne(*call_args)

    def testExtractExceptions(self):
        """``extract`` must raise ``TypeError`` for each invalid argument set."""
        invalid_calls = (
            (),            # no arguments
            (1,),          # integer query, no choices
            (1, []),       # integer query, empty list of choices
            ('', [1]),     # integer inside a list of choices
            ('', {1: 1}),  # integer value inside a mapping of choices
        )
        for call_args in invalid_calls:
            with self.assertRaises(TypeError):
                process.extract(*call_args)

    def testExtractIterExceptions(self):
        """``extract_iter`` must raise ``TypeError`` for each invalid argument set."""
        # These two calls are expected to fail directly when extract_iter is called.
        for call_args in ((), (1,)):
            with self.assertRaises(TypeError):
                process.extract_iter(*call_args)

        # For the remaining calls the first element is requested as well, so a
        # TypeError raised either while creating the iterator or while
        # advancing it satisfies the assertion.
        for call_args in ((1, []), ('', [1]), ('', {1: 1})):
            with self.assertRaises(TypeError):
                next(process.extract_iter(*call_args))

    def testGetBestChoice1(self):
        """The best match for the mets/braves query is "braves vs mets"."""
        needle = "new york mets at atlanta braves"
        match = process.extractOne(needle, self.baseball_strings)
        self.assertEqual(match[0], "braves vs mets")

    def testGetBestChoice2(self):
        """A correctly spelled query still selects the misspelled phillies fixture."""
        needle = "philadelphia phillies at atlanta braves"
        match = process.extractOne(needle, self.baseball_strings)
        self.assertEqual(match[0], self.baseball_strings[2])

    def testGetBestChoice3(self):
        """Swapping the team order in the query still selects the phillies fixture."""
        needle = "atlanta braves at philadelphia phillies"
        match = process.extractOne(needle, self.baseball_strings)
        self.assertEqual(match[0], self.baseball_strings[2])

    def testGetBestChoice4(self):
        """A query with the teams reversed selects the mets/cubs fixture."""
        needle = "chicago cubs vs new york mets"
        match = process.extractOne(needle, self.baseball_strings)
        self.assertEqual(match[0], self.baseball_strings[0])

    def testWithProcessor(self):
        """
        extractOne should accept items of any type, provided the processor
        turns them into strings
        """
        events = [
            ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
            ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
            ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
        ]
        # The query is itself an event record; the processor reduces every
        # record to its first field before comparison.
        needle = events[0]

        match = process.extractOne(needle, events, processor=lambda record: record[0])
        self.assertEqual(match[0], events[0])

    def testWithScorer(self):
        """The default scorer and ``fuzz.QRatio`` pick different best matches."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]
        # The same choices keyed 1..4, to exercise the mapping code path.
        choices_mapping = dict(enumerate(choices, start=1))

        # in this hypothetical example we care about ordering, so we use quick ratio
        needle = "new york mets at chicago cubs"

        # with the default scorer the "more complete" match, choices[1], wins
        from_list = process.extractOne(needle, choices)
        self.assertEqual(from_list[0], choices[1])
        from_mapping = process.extractOne(needle, choices_mapping)
        self.assertEqual(from_mapping[0], choices_mapping[2])

        # with the custom scorer the choice with matching word order wins
        from_list = process.extractOne(needle, choices, scorer=fuzz.QRatio)
        self.assertEqual(from_list[0], choices[0])
        from_mapping = process.extractOne(needle, choices_mapping, scorer=fuzz.QRatio)
        self.assertEqual(from_mapping[0], choices_mapping[1])

    def testWithCutoff(self):
        """``score_cutoff`` suppresses a weak match that would otherwise be returned."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]
        needle = "los angeles dodgers vs san francisco giants"

        # this event is not in the list; a reasonable cutoff keeps us from
        # matching it to something arbitrary
        with_cutoff = process.extractOne(needle, choices, score_cutoff=50)
        self.assertIsNone(with_cutoff)

        # without a cutoff something is returned regardless
        without_cutoff = process.extractOne(needle, choices)
        self.assertIsNotNone(without_cutoff)

    def testWithCutoffEdgeCases(self):
        """Check the boundary scores: a cutoff of 100 and a score of 0."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]

        # a cutoff of 100 still lets an exact match through
        needle = "new york mets vs chicago cubs"
        exact = process.extractOne(needle, choices, score_cutoff=100)
        self.assertIsNotNone(exact)
        self.assertEqual(exact[0], choices[0])

        # a score of 0 is still a result, not None
        zero = process.extractOne("", choices)
        self.assertIsNotNone(zero)
        self.assertEqual(zero[1], 0)

    def testEmptyStrings(self):
        """Empty strings among the choices do not prevent finding the real match."""
        choices = [
            "",
            "new york mets vs chicago cubs",
            "new york yankees vs boston red sox",
            "",
            ""
        ]
        needle = "new york mets at chicago cubs"

        match = process.extractOne(needle, choices)
        self.assertEqual(match[0], choices[1])

    def testNullStrings(self):
        """``None`` entries among the choices do not prevent finding the real match."""
        choices = [
            None,
            "new york mets vs chicago cubs",
            "new york yankees vs boston red sox",
            None,
            None
        ]
        needle = "new york mets at chicago cubs"

        match = process.extractOne(needle, choices)
        self.assertEqual(match[0], choices[1])
|
|
|
|
|
Release v1.0.0 (#68)
- all normalized string_metrics can now be used as scorer for process.extract/extractOne
- Implementation of the C++ Wrapper completely refactored to make it easier to add more scorers, processors and string matching algorithms in the future.
- increased test coverage, that already helped to fix some bugs and help to prevent regressions in the future
- improved docstrings of functions
- Added bitparallel implementation of the Levenshtein distance for the weights (1,1,1) and (1,1,2).
- Added specialized implementation of the Levenshtein distance for cases with a small maximum edit distance, that is even faster, than the bitparallel implementation.
- Improved performance of `fuzz.partial_ratio`
-> Since `fuzz.ratio` and `fuzz.partial_ratio` are used in most scorers, this improves the overall performance.
- Improved performance of `process.extract` and `process.extractOne`
- the `rapidfuzz.levenshtein` module is now deprecated and will be removed in v2.0.0
These functions are now placed in `rapidfuzz.string_metric`. `distance`, `normalized_distance`, `weighted_distance` and `weighted_normalized_distance` are combined into `levenshtein` and `normalized_levenshtein`.
- added normalized version of the hamming distance in `string_metric.normalized_hamming`
- process.extract_iter as a generator, that yields the similarity of all elements, that have a similarity >= score_cutoff
- multiple bugs in extractOne when used with a scorer that is not from RapidFuzz
- fixed bug in `token_ratio`
- fixed bug in result normalisation causing zero division
2021-02-12 15:37:44 +00:00
|
|
|
|
|
|
|
def custom_scorer(s1, s2, processor=None, score_cutoff=0):
    """Score ``s1`` against ``s2`` by delegating to ``fuzz.ratio``.

    Used by the parametrized tests below as an alternative to passing
    ``fuzz.ratio`` directly (presumably to exercise ``extractOne`` with a
    scorer that is a plain Python function - confirm against the library).

    Both keyword arguments are forwarded to ``fuzz.ratio`` unchanged, and its
    result is returned as is.
    """
    forwarded = {"processor": processor, "score_cutoff": score_cutoff}
    return fuzz.ratio(s1, s2, **forwarded)
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("processor", [False, None, lambda s: s])
@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])
def test_extractOne_case_sensitive(processor, scorer):
    """A choice that differs from the query only by letter case must not score 100.

    Runs for every combination of the parametrized processors (none of which
    alter the strings) and scorers.
    """
    candidates = ["new", "new YORK mets"]
    result = process.extractOne(
        "new york mets", candidates, processor=processor, scorer=scorer
    )
    score = result[1]
    assert score != 100
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])
def test_extractOne_use_first_match(scorer):
    """With two identical choices, the result must refer to the first one (index 0)."""
    duplicates = ["new york mets", "new york mets"]
    result = process.extractOne("new york mets", duplicates, scorer=scorer)
    position = result[2]
    assert position == 0
|
|
|
|
|
2020-05-24 07:57:08 +00:00
|
|
|
# Allow running this file directly as a script; unittest.main() runs the
# unittest.TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main()
|