fix incorrect ref counting

This commit is contained in:
Max Bachmann 2021-03-03 15:20:31 +01:00
parent 74e41ad716
commit c6eebb70a5
8 changed files with 1474 additions and 1455 deletions

View File

@ -43,7 +43,7 @@ jobs:
- name: Run Unit Tests
run: |
pip install .
pip install pytest hypothesis
pip install pytest hypothesis pandas
pytest

View File

@ -1 +1 @@
1.1.1
1.1.2

View File

@ -38,8 +38,8 @@ class BuildExt(build_ext):
elif ct == 'msvc':
opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
for ext in self.extensions:
ext.extra_compile_args = opts
ext.extra_link_args = link_opts
ext.extra_compile_args += opts
ext.extra_link_args += link_opts
build_ext.build_extensions(self)
setup(

2895
src/cpp_process.cpp vendored

File diff suppressed because it is too large Load Diff

View File

@ -11,6 +11,7 @@ from cpython.list cimport PyList_New
from cpython.list cimport PyList_SET_ITEM
from cpython.object cimport PyObject
from cpython.ref cimport Py_INCREF
from cpython.ref cimport Py_DECREF
import heapq
@ -358,6 +359,10 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
score = context.scorer(context.context, choice, score_cutoff)
if score >= score_cutoff:
# especially the key object might be created on the fly by e.g. pandas.DataFrame
# so we need to ensure Python does not deallocate it
Py_INCREF(choice)
Py_INCREF(choice_key)
results.push_back(DictMatchElem(score, i, <PyObject*>choice, <PyObject*>choice_key))
index += 1
@ -379,10 +384,15 @@ cdef inline extract_dict(scorer_context context, choices, processor, size_t limi
# https://stackoverflow.com/questions/43553763/cythonize-list-of-all-splits-of-a-string/43557675#43557675
PyList_SET_ITEM(result_list, i,
<object>Py_BuildValue("OdO",
<PyObject*>choices[<object>results[i].key],
<PyObject*>results[i].choice,
results[i].score,
<PyObject*>results[i].key))
# release the references held on every stored choice and key
for i in range(results.size()):
Py_DECREF(<object>results[i].choice)
Py_DECREF(<object>results[i].key)
return result_list
@ -393,7 +403,7 @@ cdef inline extract_list(scorer_context context, choices, processor, size_t limi
# todo possibly a smaller vector would be good to reduce memory usage
cdef vector[ListMatchElem] results
results.reserve(<size_t>len(choices))
cdef object result_list
cdef list result_list
if processor is not None:
for choice in choices:
@ -751,4 +761,3 @@ def extract_iter(query, choices, scorer=fuzz.WRatio, processor=utils.default_pro
if py_score >= score_cutoff:
yield(choice, py_score, index)
index += 1

@ -1 +1 @@
Subproject commit ea6f17dd4d3af1f15f46ff608da7cfa28625ed5a
Subproject commit 91f20cd9930e620c7c250381bcca640570480dbd

View File

@ -3,6 +3,6 @@ rapid string matching library
"""
__author__ = "Max Bachmann"
__license__ = "MIT"
__version__ = "1.1.1"
__version__ = "1.1.2"
from rapidfuzz import process, fuzz, utils, levenshtein, string_metric

View File

@ -5,6 +5,7 @@ import unittest
import pytest
from rapidfuzz import process, fuzz, utils
import pandas as pd
class ProcessTest(unittest.TestCase):
def setUp(self):
@ -187,6 +188,12 @@ class ProcessTest(unittest.TestCase):
best = process.extractOne(query, choices)
self.assertEqual(best[0], choices[1])
def testIssue81(self):
# this mostly tests whether this segfaults due to incorrect ref counting
choices = pd.Series(['test color brightness', 'test lemon', 'test lavender'], index=[67478, 67479, 67480])
matches = process.extract("test", choices)
assert matches == [('test color brightness', 90.0, 67478), ('test lemon', 90.0, 67479), ('test lavender', 90.0, 67480)]
def custom_scorer(s1, s2, processor=None, score_cutoff=0):
return fuzz.ratio(s1, s2, processor=processor, score_cutoff=score_cutoff)