fix incorrect ref counting

2021-03-03 15:20:31 +01:00 · 2021-03-03 15:20:31 +01:00 · c6eebb70a5
parent 74e41ad716
commit c6eebb70a5
8 changed files with 1474 additions and 1455 deletions
--- a/.github/workflows/pythonbuild.yml
+++ b/.github/workflows/pythonbuild.yml
@ -43,7 +43,7 @@ jobs:
      - name: Run Unit Tests
        run: |
          pip install .
-          pip install pytest hypothesis
+          pip install pytest hypothesis pandas
          pytest


--- a/2
+++ b/2
@ -1 +1 @@
-1.1.1
+1.1.2
--- a/setup.py
+++ b/setup.py
@ -38,8 +38,8 @@ class BuildExt(build_ext):
        elif ct == 'msvc':
            opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
        for ext in self.extensions:
-            ext.extra_compile_args = opts
-            ext.extra_link_args = link_opts
+            ext.extra_compile_args += opts
+            ext.extra_link_args += link_opts
        build_ext.build_extensions(self)

 setup(
--- a/src/cpp_process.cpp
+++ b/src/cpp_process.cpp
--- a/src/cpp_process.pyx
+++ b/src/cpp_process.pyx
@ -11,6 +11,7 @@ from cpython.list cimport PyList_New
 from cpython.list cimport PyList_SET_ITEM
 from cpython.object cimport PyObject
 from cpython.ref cimport Py_INCREF
+from cpython.ref cimport Py_DECREF

 import heapq

@ -358,6 +359,10 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
            score = context.scorer(context.context, choice, score_cutoff)

            if score >= score_cutoff:
+                # the choice and especially the key object might be created on the fly,
+                # e.g. by a pandas.DataFrame, so hold a reference to keep Python from deallocating them
+                Py_INCREF(choice)
+                Py_INCREF(choice_key)
                results.push_back(DictMatchElem(score, i, <PyObject*>choice, <PyObject*>choice_key))
            index += 1

@ -379,10 +384,15 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
        # https://stackoverflow.com/questions/43553763/cythonize-list-of-all-splits-of-a-string/43557675#43557675
        PyList_SET_ITEM(result_list, i,
            <object>Py_BuildValue("OdO",
-                <PyObject*>choices[<object>results[i].key],
+                <PyObject*>results[i].choice,
                results[i].score,
                <PyObject*>results[i].key))

+    # release the references that were acquired while collecting the results
+    for i in range(results.size()):
+        Py_DECREF(<object>results[i].choice)
+        Py_DECREF(<object>results[i].key)
+
    return result_list


@ -393,7 +403,7 @@ cdef inline extract_list(scorer_context context, choices, processor, size_t limi
    # todo possibly a smaller vector would be good to reduce memory usage
    cdef vector[ListMatchElem] results
    results.reserve(<size_t>len(choices))
-    cdef object result_list
+    cdef list result_list

    if processor is not None:
        for choice in choices:
@ -751,4 +761,3 @@ def extract_iter(query, choices, scorer=fuzz.WRatio, processor=utils.default_pro
                    if py_score >= score_cutoff:
                        yield(choice, py_score, index)
                    index += 1
-        
--- a/src/rapidfuzz-cpp
+++ b/src/rapidfuzz-cpp
@ -1 +1 @@
-Subproject commit ea6f17dd4d3af1f15f46ff608da7cfa28625ed5a
+Subproject commit 91f20cd9930e620c7c250381bcca640570480dbd
--- a/src/rapidfuzz/init.py
+++ b/src/rapidfuzz/init.py
@ -3,6 +3,6 @@ rapid string matching library
 """
 __author__ = "Max Bachmann"
 __license__ = "MIT"
-__version__ = "1.1.1"
+__version__ = "1.1.2"

 from rapidfuzz import process, fuzz, utils, levenshtein, string_metric
--- a/tests/test_process.py
+++ b/tests/test_process.py
@ -5,6 +5,7 @@ import unittest
 import pytest

 from rapidfuzz import process, fuzz, utils
+import pandas as pd

 class ProcessTest(unittest.TestCase):
    def setUp(self):
@ -187,6 +188,12 @@ class ProcessTest(unittest.TestCase):
        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])

+    def testIssue81(self):
+        # mostly checks that process.extract does not segfault due to incorrect ref counting
+        choices = pd.Series(['test color brightness', 'test lemon', 'test lavender'], index=[67478, 67479, 67480])
+        matches = process.extract("test", choices)
+        assert matches == [('test color brightness', 90.0, 67478), ('test lemon', 90.0, 67479), ('test lavender', 90.0, 67480)]
+

 def custom_scorer(s1, s2, processor=None, score_cutoff=0):
    return fuzz.ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)
 @ -1 +1 @@
 .1.1
 .1.2