RapidFuzz/tests/test_process.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest
import pytest

from rapidfuzz import process, fuzz, utils
import pandas as pd

class ProcessTest(unittest.TestCase):
    """Tests for ``process.extractOne``, ``process.extract`` and ``process.extract_iter``."""

    def setUp(self):
        """Create the list of baseball fixtures used by the best-choice tests."""
        self.baseball_strings = [
            "new york mets vs chicago cubs",
            "chicago cubs vs chicago white sox",
            "philladelphia phillies vs atlanta braves",
            "braves vs mets",
        ]

    def testExtractOneExceptions(self):
        """extractOne raises TypeError for missing or non-string arguments."""
        self.assertRaises(TypeError, process.extractOne)
        self.assertRaises(TypeError, process.extractOne, 1)
        self.assertRaises(TypeError, process.extractOne, 1, [])
        self.assertRaises(TypeError, process.extractOne, '', [1])
        self.assertRaises(TypeError, process.extractOne, '', {1:1})

    def testExtractExceptions(self):
        """extract raises TypeError for missing or non-string arguments."""
        self.assertRaises(TypeError, process.extract)
        self.assertRaises(TypeError, process.extract, 1)
        self.assertRaises(TypeError, process.extract, 1, [])
        self.assertRaises(TypeError, process.extract, '', [1])
        self.assertRaises(TypeError, process.extract, '', {1:1})

    def testExtractIterExceptions(self):
        """extract_iter raises TypeError for missing or non-string arguments."""
        def first_result(*args, **kwargs):
            # NOTE(review): the iterator is advanced with next() before the
            # error is expected - presumably the argument checks only run once
            # iteration starts; confirm against the extract_iter implementation.
            return next(process.extract_iter(*args, **kwargs))

        # Missing positional arguments are expected to fail on the call itself.
        self.assertRaises(TypeError, process.extract_iter)
        self.assertRaises(TypeError, process.extract_iter, 1)

        # Wrong argument types are expected to fail when the first item is requested.
        self.assertRaises(TypeError, first_result, 1, [])
        self.assertRaises(TypeError, first_result, '', [1])
        self.assertRaises(TypeError, first_result, '', {1:1})

    def testGetBestChoice1(self):
        """The shortest fixture is expected to win for a query sharing both team names."""
        query = "new york mets at atlanta braves"
        best = process.extractOne(query, self.baseball_strings)
        self.assertEqual(best[0], "braves vs mets")

    def testGetBestChoice2(self):
        """A correctly spelled query still matches the misspelled fixture."""
        query = "philadelphia phillies at atlanta braves"
        best = process.extractOne(query, self.baseball_strings)
        self.assertEqual(best[0], self.baseball_strings[2])

    def testGetBestChoice3(self):
        """Swapping the team order in the query does not change the best match."""
        query = "atlanta braves at philadelphia phillies"
        best = process.extractOne(query, self.baseball_strings)
        self.assertEqual(best[0], self.baseball_strings[2])

    def testGetBestChoice4(self):
        """A query with the teams reversed matches the fixture with the same teams."""
        query = "chicago cubs vs new york mets"
        best = process.extractOne(query, self.baseball_strings)
        self.assertEqual(best[0], self.baseball_strings[0])

    def testWithProcessor(self):
        """
        extractOne should accept any type as long as it is a string
        after preprocessing
        """
        events = [
            ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
            ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
            ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
        ]
        query = events[0]

        # The processor reduces each event record to its title string; the
        # returned choice is expected to be the original (unprocessed) record.
        best = process.extractOne(query, events, processor=lambda event: event[0])
        self.assertEqual(best[0], events[0])

    def testWithScorer(self):
        """The scorer argument changes which choice wins, for lists and mappings alike."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]

        choices_mapping = {
            1: "new york mets vs chicago cubs",
            2: "chicago cubs at new york mets",
            3: "atlanta braves vs pittsbugh pirates",
            4: "new york yankees vs boston red sox"
        }

        # in this hypothetical example we care about word ordering, so we
        # later switch to fuzz.QRatio as the scorer
        query = "new york mets at chicago cubs"

        # first, as an example, the default scorer would select the
        # "more 'complete'" match of choices[1]
        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])
        best = process.extract(query, choices)[0]
        self.assertEqual(best[0], choices[1])
        # dict
        best = process.extractOne(query, choices_mapping)
        self.assertEqual(best[0], choices_mapping[2])
        best = process.extract(query, choices_mapping)[0]
        self.assertEqual(best[0], choices_mapping[2])

        # now, use the custom scorer
        best = process.extractOne(query, choices, scorer=fuzz.QRatio)
        self.assertEqual(best[0], choices[0])
        best = process.extract(query, choices, scorer=fuzz.QRatio)[0]
        self.assertEqual(best[0], choices[0])
        # dict
        best = process.extractOne(query, choices_mapping, scorer=fuzz.QRatio)
        self.assertEqual(best[0], choices_mapping[1])
        best = process.extract(query, choices_mapping, scorer=fuzz.QRatio)[0]
        self.assertEqual(best[0], choices_mapping[1])


    def testWithCutoff(self):
        """extractOne returns None when no choice reaches score_cutoff."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]

        query = "los angeles dodgers vs san francisco giants"

        # in this situation, this is an event that does not exist in the list
        # we don't want to randomly match to something, so we use a reasonable cutoff
        best = process.extractOne(query, choices, score_cutoff=50)
        self.assertIsNone(best)

        # however if we had no cutoff, something would get returned
        best = process.extractOne(query, choices)
        self.assertIsNotNone(best)

    def testWithCutoffEdgeCases(self):
        """Scores of exactly 100 and exactly 0 are both reported rather than dropped."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox"
        ]

        query = "new york mets vs chicago cubs"
        # Only find 100-score cases
        best = process.extractOne(query, choices, score_cutoff=100)
        self.assertIsNotNone(best)
        self.assertEqual(best[0], choices[0])

        # 0-score cases do not return None
        best = process.extractOne("", choices)
        self.assertIsNotNone(best)
        self.assertEqual(best[1], 0)

    def testNoneElements(self):
        """
        when a None element is used, it is skipped and the index is still correct
        """
        best = process.extractOne("test", [None, "tes"])
        self.assertEqual(best[2], 1)

        best = process.extract("test", [None, "tes"], limit=1)
        self.assertEqual(best[0][2], 1)

    def testResultOrder(self):
        """
        when multiple elements have the same score, the first one should be returned
        """
        best = process.extractOne("test", ["tes", "tes"])
        self.assertEqual(best[2], 0)

        best = process.extract("test", ["tes", "tes"], limit=1)
        self.assertEqual(best[0][2], 0)

    def testEmptyStrings(self):
        """Empty-string choices do not prevent the real best match from being found."""
        choices = [
            "",
            "new york mets vs chicago cubs",
            "new york yankees vs boston red sox",
            "",
            ""
        ]

        query = "new york mets at chicago cubs"

        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])

    def testNullStrings(self):
        """None choices do not prevent the real best match from being found."""
        choices = [
            None,
            "new york mets vs chicago cubs",
            "new york yankees vs boston red sox",
            None,
            None
        ]

        query = "new york mets at chicago cubs"

        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])

    def testIssue81(self):
        """extract on a pandas Series returns (choice, score, index label) tuples."""
        # this mostly tests whether this segfaults due to incorrect ref counting
        choices = pd.Series(['test color brightness', 'test lemon', 'test lavender'], index=[67478, 67479, 67480])
        matches = process.extract("test", choices)
        # assertEqual instead of a bare assert: it is not stripped under
        # ``python -O`` and reports a readable diff on failure.
        self.assertEqual(
            matches,
            [('test color brightness', 90.0, 67478), ('test lemon', 90.0, 67479), ('test lavender', 90.0, 67480)]
        )


def custom_scorer(s1, s2, processor=None, score_cutoff=0):
    """Return ``fuzz.ratio`` of *s1* and *s2*, forwarding both options unchanged.

    Used by the parametrized tests below as an alternative to passing
    ``fuzz.ratio`` itself as the scorer.
    """
    score = fuzz.ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)
    return score

@pytest.mark.parametrize("processor", [False, None, lambda s: s])
@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])
def test_extractOne_case_sensitive(processor, scorer):
    """With each of these processors, differently-cased text must not score 100."""
    candidates = ["new", "new YORK mets"]
    result = process.extractOne(
        "new york mets", candidates, processor=processor, scorer=scorer
    )
    score = result[1]
    assert score != 100

@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])
def test_extractOne_use_first_match(scorer):
    """When two choices are identical, the index of the first one is returned."""
    duplicates = ["new york mets", "new york mets"]
    result = process.extractOne("new york mets", duplicates, scorer=scorer)
    index = result[2]
    assert index == 0

# Allow running this module directly as a script via the unittest runner.
# NOTE: unittest.main() only discovers the TestCase-based tests in this module;
# the pytest-parametrized module-level functions are run by pytest only.
if __name__ == '__main__':
    unittest.main()
add python 2.7 support 2020-08-22 19:07:08 +00:00			`#!/usr/bin/env python`
			`# -*- coding: utf-8 -*-`

add unit tests 2020-05-24 07:57:08 +00:00			`import unittest`
Release v1.0.0 (#68) - all normalized string_metrics can now be used as scorer for process.extract/extractOne - Implementation of the C++ Wrapper completely refactored to make it easier to add more scorers, processors and string matching algorithms in the future. - increased test coverage, that already helped to fix some bugs and help to prevent regressions in the future - improved docstrings of functions - Added bitparallel implementation of the Levenshtein distance for the weights (1,1,1) and (1,1,2). - Added specialized implementation of the Levenshtein distance for cases with a small maximum edit distance, that is even faster, than the bitparallel implementation. - Improved performance of `fuzz.partial_ratio` -> Since `fuzz.ratio` and `fuzz.partial_ratio` are used in most scorers, this improves the overall performance. - Improved performance of `process.extract` and `process.extractOne` - the `rapidfuzz.levenshtein` module is now deprecated and will be removed in v2.0.0 These functions are now placed in `rapidfuzz.string_metric`. `distance`, `normalized_distance`, `weighted_distance` and `weighted_normalized_distance` are combined into `levenshtein` and `normalized_levenshtein`. - added normalized version of the hamming distance in `string_metric.normalized_hamming` - process.extract_iter as a generator, that yields the similarity of all elements, that have a similarity >= score_cutoff - multiple bugs in extractOne when used with a scorer, thats not from RapidFuzz - fixed bug in `token_ratio` - fixed bug in result normalisation causing zero division 2021-02-12 15:37:44 +00:00			`import pytest`
add unit tests 2020-05-24 07:57:08 +00:00
			`from rapidfuzz import process, fuzz, utils`
fix incorrect ref counting 2021-03-03 14:20:31 +00:00			`import pandas as pd`
add unit tests 2020-05-24 07:57:08 +00:00
			`class ProcessTest(unittest.TestCase):`
			`def setUp(self):`
			`self.baseball_strings = [`
			`"new york mets vs chicago cubs",`
			`"chicago cubs vs chicago white sox",`
			`"philladelphia phillies vs atlanta braves",`
			`"braves vs mets",`
			`]`

implement process.extractOne in C++ (#53) * start to simplify complexion * start implementation * add extractOne to C++ * fix a couple of bugs in the implementation * start adressing performance issues 2020-11-15 19:18:46 +00:00			`def testExtractOneExceptions(self):`
			`self.assertRaises(TypeError, process.extractOne)`
			`self.assertRaises(TypeError, process.extractOne, 1)`
			`self.assertRaises(TypeError, process.extractOne, 1, [])`
			`self.assertRaises(TypeError, process.extractOne, '', [1])`
			`self.assertRaises(TypeError, process.extractOne, '', {1:1})`

			`def testExtractExceptions(self):`
			`self.assertRaises(TypeError, process.extract)`
			`self.assertRaises(TypeError, process.extract, 1)`
			`self.assertRaises(TypeError, process.extract, 1, [])`
			`self.assertRaises(TypeError, process.extract, '', [1])`
			`self.assertRaises(TypeError, process.extract, '', {1:1})`

Release v1.1.0 (#75) ## Changed - string_metric.normalized_levenshtein supports now all weights - when different weights are used for Insertion and Deletion the strings can not be swapped inside the Levenshtein implementation anymore. So different weights for Insertion and Deletion are now supported. - replace C++ implementation with a Cython implementation. This has the following advantages: - The implementation is less error prone, since a lot of the complex things are done by Cython - slighly faster than the current implementation (up to 10% for some parts) - about 33% smaller binary size - reduced compile time - Added **kwargs argument to process.extract/extractOne/extract_iter that is passed to the scorer - Add max argument to hamming distance - Add support for whole Unicode range to utils.default_process ## Performance - replaced Wagner Fischer usage in the normal Levenshtein distance with a bitparallel implementation 2021-02-21 18:42:36 +00:00			`def testExtractIterExceptions(self):`
			`self.assertRaises(TypeError, process.extract_iter)`
			`self.assertRaises(TypeError, process.extract_iter, 1)`
			`self.assertRaises(TypeError,`
			`lambda *args, **kwargs: next(process.extract_iter(*args, **kwargs)),`
			`1, []`
			`)`
			`self.assertRaises(TypeError,`
			`lambda *args, **kwargs: next(process.extract_iter(*args, **kwargs)),`
			`'', [1]`
			`)`
			`self.assertRaises(TypeError,`
			`lambda *args, **kwargs: next(process.extract_iter(*args, **kwargs)),`
			`'', {1:1}`
			`)`

add unit tests 2020-05-24 07:57:08 +00:00			`def testGetBestChoice1(self):`
			`query = "new york mets at atlanta braves"`
			`best = process.extractOne(query, self.baseball_strings)`
			`self.assertEqual(best[0], "braves vs mets")`

			`def testGetBestChoice2(self):`
			`query = "philadelphia phillies at atlanta braves"`
			`best = process.extractOne(query, self.baseball_strings)`
			`self.assertEqual(best[0], self.baseball_strings[2])`

			`def testGetBestChoice3(self):`
			`query = "atlanta braves at philadelphia phillies"`
			`best = process.extractOne(query, self.baseball_strings)`
			`self.assertEqual(best[0], self.baseball_strings[2])`

			`def testGetBestChoice4(self):`
			`query = "chicago cubs vs new york mets"`
			`best = process.extractOne(query, self.baseball_strings)`
			`self.assertEqual(best[0], self.baseball_strings[0])`

			`def testWithProcessor(self):`
implement process.extractOne in C++ (#53) * start to simplify complexion * start implementation * add extractOne to C++ * fix a couple of bugs in the implementation * start adressing performance issues 2020-11-15 19:18:46 +00:00			`"""`
			`extractOne should accept any type as long as it is a string`
			`after preprocessing`
			`"""`
add unit tests 2020-05-24 07:57:08 +00:00			`events = [`
			`["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],`
			`["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],`
			`["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],`
			`]`
implement process.extractOne in C++ (#53) * start to simplify complexion * start implementation * add extractOne to C++ * fix a couple of bugs in the implementation * start adressing performance issues 2020-11-15 19:18:46 +00:00			`query = events[0]`
add unit tests 2020-05-24 07:57:08 +00:00
			`best = process.extractOne(query, events, processor=lambda event: event[0])`
			`self.assertEqual(best[0], events[0])`

			`def testWithScorer(self):`
			`choices = [`
			`"new york mets vs chicago cubs",`
			`"chicago cubs at new york mets",`
			`"atlanta braves vs pittsbugh pirates",`
			`"new york yankees vs boston red sox"`
			`]`

			`choices_mapping = {`
			`1: "new york mets vs chicago cubs",`
			`2: "chicago cubs at new york mets",`
			`3: "atlanta braves vs pittsbugh pirates",`
			`4: "new york yankees vs boston red sox"`
			`}`

			`# in this hypothetical example we care about ordering, so we use quick ratio`
			`query = "new york mets at chicago cubs"`

			`# first, as an example, the normal way would select the "more 'complete' match of choices[1]"`
			`best = process.extractOne(query, choices)`
			`self.assertEqual(best[0], choices[1])`
Fix result conversion process.extract 2021-02-23 13:58:44 +00:00			`best = process.extract(query, choices)[0]`
			`self.assertEqual(best[0], choices[1])`
			`# dict`
add unit tests 2020-05-24 07:57:08 +00:00			`best = process.extractOne(query, choices_mapping)`
			`self.assertEqual(best[0], choices_mapping[2])`
Fix result conversion process.extract 2021-02-23 13:58:44 +00:00			`best = process.extract(query, choices_mapping)[0]`
			`self.assertEqual(best[0], choices_mapping[2])`
add unit tests 2020-05-24 07:57:08 +00:00
			`# now, use the custom scorer`
			`best = process.extractOne(query, choices, scorer=fuzz.QRatio)`
			`self.assertEqual(best[0], choices[0])`
Fix result conversion process.extract 2021-02-23 13:58:44 +00:00			`best = process.extract(query, choices, scorer=fuzz.QRatio)[0]`
			`self.assertEqual(best[0], choices[0])`
			`# dict`
add unit tests 2020-05-24 07:57:08 +00:00			`best = process.extractOne(query, choices_mapping, scorer=fuzz.QRatio)`
			`self.assertEqual(best[0], choices_mapping[1])`
Fix result conversion process.extract 2021-02-23 13:58:44 +00:00			`best = process.extract(query, choices_mapping, scorer=fuzz.QRatio)[0]`
			`self.assertEqual(best[0], choices_mapping[1])`

add unit tests 2020-05-24 07:57:08 +00:00
			`def testWithCutoff(self):`
			`choices = [`
			`"new york mets vs chicago cubs",`
			`"chicago cubs at new york mets",`
			`"atlanta braves vs pittsbugh pirates",`
			`"new york yankees vs boston red sox"`
			`]`

			`query = "los angeles dodgers vs san francisco giants"`

			`# in this situation, this is an event that does not exist in the list`
			`# we don't want to randomly match to something, so we use a reasonable cutoff`
			`best = process.extractOne(query, choices, score_cutoff=50)`
			`self.assertIsNone(best)`

			`# however if we had no cutoff, something would get returned`
			`best = process.extractOne(query, choices)`
			`self.assertIsNotNone(best)`

			`def testWithCutoffEdgeCases(self):`
			`choices = [`
			`"new york mets vs chicago cubs",`
			`"chicago cubs at new york mets",`
			`"atlanta braves vs pittsbugh pirates",`
			`"new york yankees vs boston red sox"`
			`]`

			`query = "new york mets vs chicago cubs"`
			`# Only find 100-score cases`
			`best = process.extractOne(query, choices, score_cutoff=100)`
			`self.assertIsNotNone(best)`
			`self.assertEqual(best[0], choices[0])`

			`# 0-score cases do not return None`
			`best = process.extractOne("", choices)`
			`self.assertIsNotNone(best)`
			`self.assertEqual(best[1], 0)`

add distance support to process.* ## Changed - added processor support to `levenshtein` and `hamming` - added distance support to extract/extractOne/extract_iter ## Fixes - incorrect results of `normalized_hamming` and `normalized_levenshtein` when used with `utils.default_process` as processor 2021-03-29 17:09:22 +00:00			`def testNoneElements(self):`
			`"""`
			`when a None element is used, it is skipped and the index is still correct`
			`"""`
			`best = process.extractOne("test", [None, "tes"])`
			`self.assertEqual(best[2], 1)`

			`best = process.extract("test", [None, "tes"], limit=1)`
			`self.assertEqual(best[0][2], 1)`

			`def testResultOrder(self):`
			`"""`
			`when multiple elements have the same score, the first one should be returned`
			`"""`
			`best = process.extractOne("test", ["tes", "tes"])`
			`self.assertEqual(best[2], 0)`

			`best = process.extract("test", ["tes", "tes"], limit=1)`
			`self.assertEqual(best[0][2], 0)`

add unit tests 2020-05-24 07:57:08 +00:00			`def testEmptyStrings(self):`
			`choices = [`
			`"",`
			`"new york mets vs chicago cubs",`
			`"new york yankees vs boston red sox",`
			`"",`
			`""`
			`]`

			`query = "new york mets at chicago cubs"`

			`best = process.extractOne(query, choices)`
			`self.assertEqual(best[0], choices[1])`

			`def testNullStrings(self):`
			`choices = [`
			`None,`
			`"new york mets vs chicago cubs",`
			`"new york yankees vs boston red sox",`
			`None,`
			`None`
			`]`

			`query = "new york mets at chicago cubs"`

			`best = process.extractOne(query, choices)`
			`self.assertEqual(best[0], choices[1])`

fix incorrect ref counting 2021-03-03 14:20:31 +00:00			`def testIssue81(self):`
			`# this mostly tests whether this segfaults due to incorrect ref counting`
			`choices = pd.Series(['test color brightness', 'test lemon', 'test lavender'], index=[67478, 67479, 67480])`
			`matches = process.extract("test", choices)`
			`assert matches == [('test color brightness', 90.0, 67478), ('test lemon', 90.0, 67479), ('test lavender', 90.0, 67480)]`

Release v1.0.0 (#68) - all normalized string_metrics can now be used as scorer for process.extract/extractOne - Implementation of the C++ Wrapper completely refactored to make it easier to add more scorers, processors and string matching algorithms in the future. - increased test coverage, that already helped to fix some bugs and help to prevent regressions in the future - improved docstrings of functions - Added bitparallel implementation of the Levenshtein distance for the weights (1,1,1) and (1,1,2). - Added specialized implementation of the Levenshtein distance for cases with a small maximum edit distance, that is even faster, than the bitparallel implementation. - Improved performance of `fuzz.partial_ratio` -> Since `fuzz.ratio` and `fuzz.partial_ratio` are used in most scorers, this improves the overall performance. - Improved performance of `process.extract` and `process.extractOne` - the `rapidfuzz.levenshtein` module is now deprecated and will be removed in v2.0.0 These functions are now placed in `rapidfuzz.string_metric`. `distance`, `normalized_distance`, `weighted_distance` and `weighted_normalized_distance` are combined into `levenshtein` and `normalized_levenshtein`. - added normalized version of the hamming distance in `string_metric.normalized_hamming` - process.extract_iter as a generator, that yields the similarity of all elements, that have a similarity >= score_cutoff - multiple bugs in extractOne when used with a scorer, thats not from RapidFuzz - fixed bug in `token_ratio` - fixed bug in result normalisation causing zero division 2021-02-12 15:37:44 +00:00
			`def custom_scorer(s1, s2, processor=None, score_cutoff=0):`
			`return fuzz.ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)`

			`@pytest.mark.parametrize("processor", [False, None, lambda s: s])`
			`@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])`
			`def test_extractOne_case_sensitive(processor, scorer):`
			`assert process.extractOne("new york mets", ["new", "new YORK mets"], processor=processor, scorer=scorer)[1] != 100`

			`@pytest.mark.parametrize("scorer", [fuzz.ratio, custom_scorer])`
			`def test_extractOne_use_first_match(scorer):`
			`assert process.extractOne("new york mets", ["new york mets", "new york mets"], scorer=scorer)[2] == 0`

add unit tests 2020-05-24 07:57:08 +00:00			`if __name__ == '__main__':`
			`unittest.main()`