implement process.extractOne in C++ (#53)

* start to simplify complexion * start implementation * add extractOne to C++ * fix a couple of bugs in the implementation * start addressing performance issues
2020-11-15 20:18:46 +01:00 · 2020-11-15 20:18:46 +01:00 · 426fbb24e9
parent eee513f2c5
commit 426fbb24e9
15 changed files with 825 additions and 237 deletions
--- a/.github/workflows/pythonbuild.yml
+++ b/.github/workflows/pythonbuild.yml
@ -9,11 +9,11 @@ on:

 jobs:
  test_python:
-    name: run linting, tests and benchmarks for the python module
-    runs-on: ubuntu-latest
+    name: linting and tests on Python ${{ matrix.python-version }}
+    runs-on: ubuntu-18.04
    strategy:
      matrix:
-        python-version: [2.7, 3.5, 3.6, 3.7, 3.8]
+        python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9]

    steps:
      - uses: actions/checkout@v2
@ -41,7 +41,7 @@ jobs:
      - name: Run Unit Tests
        run: |
          pip install .
-          pip install pytest
+          pip install pytest hypothesis
          pytest


--- a/.gitignore
+++ b/.gitignore
@ -15,3 +15,7 @@ site/

 # benchmark results
 bench_results/
+
+# Hypothesis results
+.hypothesis/
+
--- a/2
+++ b/2
@ -1 +1 @@
-0.12.5
+0.13.0
--- a/setup.py
+++ b/setup.py
@ -17,7 +17,7 @@ class BuildExt(build_ext):
    """A custom build extension for adding compiler-specific options."""
    c_opts = {
        'msvc': ['/EHsc', '/O2', '/std:c++14'],
-        'unix': ['-O3', '-std=c++14'],
+        'unix': ['-O3', '-std=c++14', '-Wextra', '-Wall'],
    }
    l_opts = {
        'msvc': [],
--- a/src/py2_utils.hpp
+++ b/src/py2_utils.hpp
@ -21,10 +21,14 @@ bool valid_str(PyObject* str, const char* name)
    Py_InitModule3(#name, methods, doc);                                                           \
  }

+using python_string =
+    mpark::variant<std::basic_string<uint8_t>, std::basic_string<Py_UNICODE>,
+    rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;
+
 using python_string_view =
    mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<Py_UNICODE>>;

-python_string_view decode_python_string(PyObject* py_str)
+python_string decode_python_string(PyObject* py_str)
 {
  if (PyObject_TypeCheck(py_str, &PyString_Type)) {
    Py_ssize_t len = PyString_GET_SIZE(py_str);
@ -38,12 +42,27 @@ python_string_view decode_python_string(PyObject* py_str)
  }
 }

-PyObject* encode_python_string(std::basic_string<uint8_t> str)
+python_string_view decode_python_string_view(PyObject* py_str)
+{
+  if (PyObject_TypeCheck(py_str, &PyString_Type)) {
+    Py_ssize_t len = PyString_GET_SIZE(py_str);
+    uint8_t* str = reinterpret_cast<uint8_t*>(PyString_AS_STRING(py_str));
+    return rapidfuzz::basic_string_view<uint8_t>(str, len);
+  }
+  else {
+    Py_ssize_t len = PyUnicode_GET_SIZE(py_str);
+    Py_UNICODE* str = PyUnicode_AS_UNICODE(py_str);
+    return rapidfuzz::basic_string_view<Py_UNICODE>(str, len);
+  }
+}
+
+
+PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
 {
  return PyString_FromStringAndSize(reinterpret_cast<const char*>(str.data()), str.size());
 }

-PyObject* encode_python_string(std::basic_string<Py_UNICODE> str)
+PyObject* encode_python_string(rapidfuzz::basic_string_view<Py_UNICODE> str)
 {
  return PyUnicode_FromUnicode(str.data(), str.size());
 }
--- a/src/py3_utils.hpp
+++ b/src/py3_utils.hpp
@ -6,12 +6,6 @@
 #include "details/types.hpp"
 #include <variant/variant.hpp>

-// PEP 623 deprecates legacy strings and therefor
-// deprecates e.g. PyUnicode_READY in Python 3.10
-#if PY_VERSION_HEX < 0x030A0000
-#define PY_BELOW_3_10
-#endif
-
 bool valid_str(PyObject* str, const char* name)
 {
  if (!PyUnicode_Check(str)) {
@ -19,7 +13,9 @@ bool valid_str(PyObject* str, const char* name)
    return false;
  }

-#ifdef PY_BELOW_3_10
+  // PEP 623 deprecates legacy strings and therefore
+  // deprecates e.g. PyUnicode_READY in Python 3.10
+#if PY_VERSION_HEX < PYTHON_VERSION(3,10,0)
  if (PyUnicode_READY(str)) {
    return false;
  }
@ -36,11 +32,16 @@ bool valid_str(PyObject* str, const char* name)
    return PyModule_Create(&moduledef);                                                            \
  }

+using python_string =
+    mpark::variant<std::basic_string<uint8_t>, std::basic_string<uint16_t>, std::basic_string<uint32_t>,
+    rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
+                   rapidfuzz::basic_string_view<uint32_t>>;
+
 using python_string_view =
    mpark::variant<rapidfuzz::basic_string_view<uint8_t>, rapidfuzz::basic_string_view<uint16_t>,
                   rapidfuzz::basic_string_view<uint32_t>>;

-python_string_view decode_python_string(PyObject* py_str)
+python_string decode_python_string(PyObject* py_str)
 {
  Py_ssize_t len = PyUnicode_GET_LENGTH(py_str);
  void* str = PyUnicode_DATA(py_str);
@ -55,17 +56,32 @@ python_string_view decode_python_string(PyObject* py_str)
  }
 }

-PyObject* encode_python_string(std::basic_string<uint8_t> str)
+python_string_view decode_python_string_view(PyObject* py_str)
+{
+  Py_ssize_t len = PyUnicode_GET_LENGTH(py_str);
+  void* str = PyUnicode_DATA(py_str);
+
+  switch (PyUnicode_KIND(py_str)) {
+  case PyUnicode_1BYTE_KIND:
+    return rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(str), len);
+  case PyUnicode_2BYTE_KIND:
+    return rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(str), len);
+  default:
+    return rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(str), len);
+  }
+}
+
+PyObject* encode_python_string(rapidfuzz::basic_string_view<uint8_t> str)
 {
  return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, str.data(), str.size());
 }

-PyObject* encode_python_string(std::basic_string<uint16_t> str)
+PyObject* encode_python_string(rapidfuzz::basic_string_view<uint16_t> str)
 {
  return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, str.data(), str.size());
 }

-PyObject* encode_python_string(std::basic_string<uint32_t> str)
+PyObject* encode_python_string(rapidfuzz::basic_string_view<uint32_t> str)
 {
  return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str.data(), str.size());
 }
--- a/src/py_abstraction.cpp
+++ b/src/py_abstraction.cpp
@ -22,7 +22,7 @@ static inline bool non_default_process(PyObject* processor)
 {
  if (processor) {
    if (PyCFunction_Check(processor)) {
-        if (PyCFunction_GetFunction(processor) == (PyCFunction)(void (*)(void))default_process) {
+        if (PyCFunction_GetFunction(processor) == PY_FUNC_CAST(default_process)) {
          return false;
        }
    }
@ -31,8 +31,21 @@ static inline bool non_default_process(PyObject* processor)
  return PyCallable_Check(processor);
 }

+static inline void free_owner_list(const std::vector<PyObject*>& owner_list)
+{
+  for (const auto owned : owner_list) {
+    Py_DecRef(owned);
+  }
+}
+
+template<typename Sentence>
+static inline python_string default_process_string(Sentence&& str)
+{
+  return rutils::default_process(std::forward<Sentence>(str));
+}
+
 template <typename MatchingFunc>
-static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
+static inline PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* keywds)
 {
  PyObject* py_s1;
  PyObject* py_s2;
@ -50,10 +63,6 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
    return PyFloat_FromDouble(0);
  }

-  if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
-    return NULL;
-  }
-
  if (non_default_process(processor)) {
    PyObject* proc_s1 = PyObject_CallFunctionObjArgs(processor, py_s2, NULL);
    if (proc_s1 == NULL) {
@ -66,8 +75,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
      return NULL;
    }

-    auto s1_view = decode_python_string(proc_s1);
-    auto s2_view = decode_python_string(proc_s2);
+    if (!valid_str(proc_s1, "s1") || !valid_str(proc_s2, "s2")) {
+      return NULL;
+    }
+
+    auto s1_view = decode_python_string_view(proc_s1);
+    auto s2_view = decode_python_string_view(proc_s2);

    double result = mpark::visit(
        [score_cutoff](auto&& val1, auto&& val2) {
@ -81,8 +94,12 @@ static PyObject* fuzz_call(bool processor_default, PyObject* args, PyObject* key
    return PyFloat_FromDouble(result);
  }

-  auto s1_view = decode_python_string(py_s1);
-  auto s2_view = decode_python_string(py_s2);
+  if (!valid_str(py_s1, "s1") || !valid_str(py_s2, "s2")) {
+    return NULL;
+  }
+
+  auto s1_view = decode_python_string_view(py_s1);
+  auto s2_view = decode_python_string_view(py_s2);

  double result;
  if (use_preprocessing(processor, processor_default)) {
@ -118,7 +135,24 @@ struct name##_func {                                                        \
 static PyObject* name(PyObject* /*self*/, PyObject* args, PyObject* keywds) \
 {                                                                           \
  return fuzz_call<name##_func>(process_default, args, keywds);             \
-}
+}                                                                           
+
+struct CachedFuzz {
+  virtual void str1_set(python_string str) {
+    m_str1 = std::move(str);
+  }
+
+  virtual void str2_set(python_string str) {
+    m_str2 = std::move(str);
+  }
+
+  virtual double call(double score_cutoff) = 0;
+
+protected:
+  python_string m_str1;
+  python_string m_str2;
+};
+

 FUZZ_FUNC(
  ratio, false,
@ -140,6 +174,17 @@ FUZZ_FUNC(
  "    96.55171966552734"
 )

+struct CachedRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
+
 FUZZ_FUNC(
  partial_ratio, false,
  "partial_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -160,6 +205,15 @@ FUZZ_FUNC(
  "    100"
 )

+struct CachedPartialRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::partial_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};

 FUZZ_FUNC(
  token_sort_ratio, true,
@ -182,6 +236,26 @@ FUZZ_FUNC(
  "    100.0"
 )

+struct CachedTokenSortRatio : public CachedFuzz {
+  void str1_set(python_string str) override {
+    m_str1 = mpark::visit(
+      [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
+  }
+
+  virtual void str2_set(python_string str) override {
+    m_str2 = mpark::visit(
+      [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
+  }
+
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  partial_token_sort_ratio, true,
  "partial_token_sort_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -200,6 +274,26 @@ FUZZ_FUNC(
  "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

+struct CachedPartialTokenSortRatio : public CachedFuzz {
+  void str1_set(python_string str) override {
+    m_str1 = mpark::visit(
+      [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
+  }
+
+  virtual void str2_set(python_string str) override {
+    m_str2 = mpark::visit(
+      [](auto&& val) -> python_string {return rutils::sorted_split(val).join();}, str);
+  }
+
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::partial_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  token_set_ratio, true,
  "token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -224,6 +318,16 @@ FUZZ_FUNC(
  "    100.0"
 )

+struct CachedTokenSetRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::token_set_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  partial_token_set_ratio, true,
  "partial_token_set_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -243,6 +347,16 @@ FUZZ_FUNC(
  "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

+struct CachedPartialTokenSetRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::partial_token_set_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  token_ratio, true,
  "token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -262,6 +376,16 @@ FUZZ_FUNC(
    "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

+struct CachedTokenRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::token_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  partial_token_ratio, true,
  "partial_token_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -282,6 +406,16 @@ FUZZ_FUNC(
  "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

+struct CachedPartialTokenRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::partial_token_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  WRatio, true,
  "WRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -300,6 +434,16 @@ FUZZ_FUNC(
  "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

+struct CachedWRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::WRatio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  QRatio, true,
  "QRatio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -321,6 +465,16 @@ FUZZ_FUNC(
  "    96.55171966552734"
 )

+struct CachedQRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::QRatio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};
+
 FUZZ_FUNC(
  quick_lev_ratio, true,
  "quick_lev_ratio($module, s1, s2, processor = False, score_cutoff = 0)\n"
@ -343,7 +497,15 @@ FUZZ_FUNC(
  "    float: ratio between s1 and s2 as a float between 0 and 100"
 )

-
+struct CachedQuickLevRatio : public CachedFuzz {
+  double call(double score_cutoff) override {
+    return mpark::visit(
+      [score_cutoff](auto&& val1, auto&& val2) {
+          return rfuzz::quick_lev_ratio(val1, val2, score_cutoff);
+        },
+        m_str1, m_str2);
+  }
+};

 constexpr const char* default_process_docstring = R"()";

@ -360,13 +522,391 @@ static PyObject* default_process(PyObject* /*self*/, PyObject* args, PyObject* k
    return NULL;
  }

-  auto sentence_view = decode_python_string(py_sentence);
-  PyObject* processed = mpark::visit(
-        [](auto&& val1) {
-          return encode_python_string(rutils::default_process(val1));},
-        sentence_view);
+  /* this is pretty verbose. However it is faster than std::variant + std::visit */
+#ifdef PYTHON_2
+  if (PyObject_TypeCheck(py_sentence, &PyString_Type)) {
+    Py_ssize_t len = PyString_GET_SIZE(py_sentence);
+    char* str = PyString_AS_STRING(py_sentence);
+
+    auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<char>(str, len));
+    return PyString_FromStringAndSize(proc_str.data(), proc_str.size());
+  }
+  else {
+    Py_ssize_t len = PyUnicode_GET_SIZE(py_sentence);
+    const Py_UNICODE* str = PyUnicode_AS_UNICODE(py_sentence);
+
+    auto proc_str = rutils::default_process(rapidfuzz::basic_string_view<Py_UNICODE>(str, len));
+    return PyUnicode_FromUnicode(proc_str.data(), proc_str.size());
+  }
+#else /* Python 3 */
+
+  Py_ssize_t len = PyUnicode_GET_LENGTH(py_sentence);
+  void* str = PyUnicode_DATA(py_sentence);
+
+  switch (PyUnicode_KIND(py_sentence)) {
+  case PyUnicode_1BYTE_KIND:
+  {
+    auto proc_str = rutils::default_process(
+        rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(str), len));
+    return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, proc_str.data(), proc_str.size());
+  }
+  case PyUnicode_2BYTE_KIND:
+  {
+    auto proc_str = rutils::default_process(
+        rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(str), len));
+    return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, proc_str.data(), proc_str.size());
+  }
+  default:
+  {
+    auto proc_str = rutils::default_process(
+        rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(str), len));
+    return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, proc_str.data(), proc_str.size());
+  }
+  }
+#endif
+}
+
+static inline bool process_string(
+  PyObject* py_str, const char* name,
+  PyObject* processor, bool processor_default,
+  python_string& proc_str, std::vector<PyObject*>& owner_list)
+{
+  if (non_default_process(processor)) {
+    PyObject* proc_py_str = PyObject_CallFunctionObjArgs(processor, py_str, NULL);
+    if ((proc_py_str == NULL) || (!valid_str(proc_py_str, name))) {
+      return false;
+    }
+
+    owner_list.push_back(proc_py_str);
+    proc_str = decode_python_string(proc_py_str);
+    return true;
+  }
  
-  return processed;
+  if (!valid_str(py_str, name)) {
+    return false;
+  }
+  
+  if (use_preprocessing(processor, processor_default)) {
+    proc_str = mpark::visit(
+        [](auto&& val1) { return default_process_string(val1);},
+        decode_python_string(py_str));
+  } else {
+    proc_str = decode_python_string(py_str);
+  }
+
+  return true;
+}
+
+
+
+std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
+{
+  if (scorer) {
+    if (PyCFunction_Check(scorer)) {
+        auto scorer_func = PyCFunction_GetFunction(scorer);
+        if (scorer_func == PY_FUNC_CAST(ratio))
+        {
+          return std::make_unique<CachedRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(partial_ratio)) {
+          return std::make_unique<CachedPartialRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(token_sort_ratio)) {
+          return std::make_unique<CachedTokenSortRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(token_set_ratio)) {
+          return std::make_unique<CachedTokenSetRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(partial_token_sort_ratio)) {
+          return std::make_unique<CachedPartialTokenSortRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(partial_token_set_ratio)) {
+          return std::make_unique<CachedPartialTokenSetRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(token_ratio)) {
+          return std::make_unique<CachedTokenRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(partial_token_ratio)) {
+          return std::make_unique<CachedPartialTokenRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(WRatio)) {
+          return std::make_unique<CachedWRatio>();
+        } else if (scorer_func == PY_FUNC_CAST(QRatio)) {
+          return std::make_unique<CachedQRatio>();
+        }
+    }
+    /* call python function */
+    return nullptr;
+  /* default is fuzz.WRatio */
+  } else {
+    return std::make_unique<CachedWRatio>();
+  }
+}
+
+
+static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
+    PyObject* scorer, PyObject* processor, double score_cutoff)
+{
+  bool match_found = false;
+  PyObject* result_choice = NULL;
+  PyObject* choice_key = NULL;
+  std::vector<PyObject*> outer_owner_list;
+  
+  bool is_dict = false;
+
+  PyObject* py_score_cutoff = PyFloat_FromDouble(score_cutoff);
+  if (!py_score_cutoff) {
+    return NULL;
+  }
+
+  python_string query;
+  if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
+    Py_DecRef(py_score_cutoff);
+    return NULL;
+  }
+
+  py_query = mpark::visit(
+        [](auto&& val) {return encode_python_string(val);},
+        query);
+
+  if (!py_query) {
+    Py_DecRef(py_score_cutoff);
+    free_owner_list(outer_owner_list);
+    return NULL;
+  }
+  outer_owner_list.push_back(py_query);
+
+  /* dict like container */
+  if (PyObject_HasAttrString(py_choices, "items")) {
+    is_dict = true;
+    py_choices = PyObject_CallMethod(py_choices, "items", NULL);
+    if (!py_choices) {
+      free_owner_list(outer_owner_list);
+      return NULL;
+    }
+    outer_owner_list.push_back(py_choices);
+  }
+
+  PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
+  if (!choices) {
+    Py_DecRef(py_score_cutoff);
+    free_owner_list(outer_owner_list);
+    return NULL;
+  }
+  outer_owner_list.push_back(choices);
+
+  std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
+
+
+  for (std::size_t i = 0; i < choice_count; ++i) {
+    PyObject* py_choice = NULL;
+    PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
+
+    if (is_dict) {
+      if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice))
+      {
+        Py_DecRef(py_score_cutoff);
+        free_owner_list(outer_owner_list);
+        return NULL;
+      }
+    }
+
+    if (py_match_choice == Py_None) {
+      continue;
+    }
+
+    std::vector<PyObject*> inner_owner_list;
+    python_string choice;
+
+    if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
+      Py_DecRef(py_score_cutoff);
+      free_owner_list(outer_owner_list);
+      return NULL;
+    }
+
+    PyObject* py_proc_choice = mpark::visit(
+        [](auto&& val) {return encode_python_string(val);},
+        choice);
+
+    if (!py_proc_choice) {
+      Py_DecRef(py_score_cutoff);
+      free_owner_list(outer_owner_list);
+      return NULL;
+    }
+    inner_owner_list.push_back(py_proc_choice);
+
+    PyObject* score = PyObject_CallFunction(scorer, "OOO",
+      py_query, py_proc_choice, py_score_cutoff);
+
+    if (!score) {
+      Py_DecRef(py_score_cutoff);
+      free_owner_list(outer_owner_list);
+      free_owner_list(inner_owner_list);
+      return NULL;
+    }
+
+    int comp = PyObject_RichCompareBool(score, py_score_cutoff, Py_GE);
+    if (comp == 1) {
+      Py_DecRef(py_score_cutoff);
+      py_score_cutoff = score;
+      match_found = true;
+      result_choice = py_match_choice;
+      choice_key = py_choice;
+    } else if (comp == 0) {
+      Py_DecRef(score);
+    } else if (comp == -1) {
+      Py_DecRef(py_score_cutoff);
+      Py_DecRef(score);
+      free_owner_list(outer_owner_list);
+      free_owner_list(inner_owner_list);
+      return NULL;
+    }
+    free_owner_list(inner_owner_list);
+  }
+
+  free_owner_list(outer_owner_list);
+        
+  if (!match_found) {
+    Py_DecRef(py_score_cutoff);
+    Py_RETURN_NONE;
+  }
+
+  if (score_cutoff > 100) {
+    score_cutoff = 100;
+  }
+  
+  PyObject* result = is_dict
+    ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
+    : Py_BuildValue("(OO)", result_choice, py_score_cutoff);
+
+  Py_DecRef(py_score_cutoff);
+  return result;
+}
+
+
+constexpr const char* extractOne_docstring = 
+  "extractOne($module, query, choices, scorer = 'fuzz.WRatio', processor = 'utils.default_process', score_cutoff = 0)\n"
+  "--\n\n"  
+  "Find the best match in a list of choices\n\n"
+  "Args:\n"
+  "    query (str): string we want to find\n"
+  "    choices (Iterable): list of all strings the query should be compared with or dict with a mapping\n"
+  "        {<result>: <string to compare>}\n"
+  "    scorer (Callable): optional callable that is used to calculate the matching score between\n"
+  "        the query and each choice. WRatio is used by default\n"
+  "    processor (Callable): optional callable that reformats the strings. utils.default_process\n"
+  "        is used by default, which lowercases the strings and trims whitespace\n"
+  "    score_cutoff (float): Optional argument for a score threshold. Matches with\n"
+  "        a lower score than this number will not be returned. Defaults to 0\n\n"
+  "Returns:\n"
+  "    Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n"
+  "        no match with a score >= score_cutoff\n"
+  "    Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n"
+  "        in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
+  "        be in the form`(<choice>, <ratio>)` when `choices` is a list of strings\n"
+  "        or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";
+
+static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
+{
+  bool match_found = false;
+  PyObject* result_choice = NULL;
+  PyObject* choice_key = NULL;
+  double result_score;
+  std::vector<PyObject*> outer_owner_list;
+  python_string query;
+  bool is_dict = false;
+
+  PyObject* py_query;
+  PyObject* py_choices;
+  PyObject* processor = NULL;
+  PyObject* py_scorer = NULL;
+  double score_cutoff = 0;
+  static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};
+
+  if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
+                                   &py_choices, &py_scorer, &processor, &score_cutoff))
+  {
+    return NULL;
+  }
+
+  if (py_query == Py_None) {
+    return PyFloat_FromDouble(0);
+  }
+
+  auto scorer = get_matching_instance(py_scorer);
+  if (!scorer) {
+    // todo this is mostly code duplication
+    return py_extractOne(py_query, py_choices, py_scorer, processor, score_cutoff);
+  }
+
+  if (!process_string(py_query, "query", processor, true, query, outer_owner_list)) {
+    return NULL;
+  }
+
+  scorer->str1_set(query);
+  PyObject* py_items;
+
+  /* dict like container */
+  if (PyObject_HasAttrString(py_choices, "items")) {
+    is_dict = true;
+    py_choices = PyObject_CallMethod(py_choices, "items", NULL);
+    if (!py_choices) {
+      free_owner_list(outer_owner_list);
+      return NULL;
+    }
+    outer_owner_list.push_back(py_choices);
+  }
+
+  PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
+  if (!choices) {
+    free_owner_list(outer_owner_list);
+    return NULL;
+  }
+  outer_owner_list.push_back(choices);
+
+  std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
+
+  for (std::size_t i = 0; i < choice_count; ++i) {
+    PyObject* py_choice = NULL;
+    PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
+
+    if (is_dict) {
+      if (!PyArg_ParseTuple(py_match_choice, "OO", &py_choice, &py_match_choice))
+      {
+        free_owner_list(outer_owner_list);
+        return NULL;
+      }
+    }
+
+    if (py_match_choice == Py_None) {
+      continue;
+    }
+
+    std::vector<PyObject*> inner_owner_list;
+    python_string choice;
+
+    if (!process_string(py_match_choice, "choice", processor, true, choice, inner_owner_list)) {
+      free_owner_list(outer_owner_list);
+      return NULL;
+    }
+
+    scorer->str2_set(choice);
+    double score = scorer->call(score_cutoff);
+
+    if (score >= score_cutoff) {
+      // increase the value by a small step so it might be able to exit early
+      score_cutoff = score + (float)0.00001;
+      result_score = score;
+      match_found = true;
+      result_choice = py_match_choice;
+      choice_key = py_choice;
+    } 
+    free_owner_list(inner_owner_list);
+  }
+
+  free_owner_list(outer_owner_list);
+        
+  if (!match_found) {
+    Py_RETURN_NONE;
+  }
+
+  if (is_dict) {
+    return Py_BuildValue("(OdO)", result_choice, result_score, choice_key);
+  } else {
+    return Py_BuildValue("(Od)", result_choice, result_score);
+  }
 }

 static PyMethodDef methods[] = {
@ -386,6 +926,7 @@ static PyMethodDef methods[] = {
    PY_METHOD(QRatio),
    PY_METHOD(quick_lev_ratio),
 /* process */
+    PY_METHOD(extractOne),
 /* sentinel */
    {NULL, NULL, 0, NULL}
 };
--- a/src/py_process.cpp
+++ b/src/py_process.cpp
@ -1,106 +0,0 @@
-#include "fuzz.hpp"
-#include "py_utils.hpp"
-#include "utils.hpp"
-#include <string>
-
-namespace rfuzz = rapidfuzz::fuzz;
-namespace utils = rapidfuzz::utils;
-
-PyObject* extractOne(PyObject* self, PyObject* args, PyObject* keywds)
-{
-  PyObject* py_query;
-  PyObject* py_choices;
-  PyObject* processor = NULL;
-  PyObject* scorer = NULL;
-  double score_cutoff = 0;
-  static const char* kwlist[] = {"query", "choices", "scorer", "processor", "score_cutoff", NULL};
-
-  if (!PyArg_ParseTupleAndKeywords(args, keywds, "OO|OOd", const_cast<char**>(kwlist), &py_query,
-                                   &py_choices, &scorer, &processor, &score_cutoff))
-  {
-    return NULL;
-  }
-
-  if (py_query == Py_None) {
-    return PyFloat_FromDouble(0);
-  }
-
-  if (PyObject_HasAttrString(py_choices, "items")) {
-  }
-  else {
-  }
-
-  if (PySequence_Check(processor)) {
-  }
-
-  if (!valid_str(py_query, "query")) {
-    return NULL;
-  }
-
-  // if is list
-
-  PyObject* choices = PySequence_Fast(py_choices, "Choices must be a sequence of strings");
-  if (!choices) {
-    return NULL;
-  }
-
-  std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
-
-  bool match_found;
-  // PyObject*
-
-  // processing missing
-  auto query_view = decode_python_string(py_query);
-
-  for (std::size_t i = 0; i < choice_count; ++i) {
-    PyObject* py_choice = PySequence_Fast_GET_ITEM(choices, i);
-
-    if (py_choice == Py_None) {
-      continue;
-    }
-
-    if (!valid_str(py_choice, "choice")) {
-      Py_DECREF(choices);
-      return NULL;
-    }
-
-    auto choice_view = decode_python_string(py_choice);
-
-    double score = mpark::visit(
-        [score_cutoff](auto&& val1, auto&& val2) {
-          return rfuzz::WRatio(val1, val2, score_cutoff);
-        },
-        query_view, choice_view);
-    /*
-        float score;
-        if (preprocess) {
-          score = fuzz::WRatio(
-            cleaned_query,
-            utils::default_process(choice),
-            score_cutoff);
-        } else {
-          score = fuzz::WRatio(
-            cleaned_query,
-            std::wstring_view(choice, wcslen(choice)),
-            score_cutoff);
-        }*/
-
-    if (score >= score_cutoff) {
-      // increase the value by a small step so it might be able to exit early
-      score_cutoff = score + (float)0.00001;
-      match_found = true;
-      result_choice = choice;
-    }
-  }
-
-  Py_DECREF(choices);
-
-  if (!match_found) {
-    Py_RETURN_NONE;
-  }
-
-  if (score_cutoff > 100) {
-    score_cutoff = 100;
-  }
-  return Py_BuildValue("(ud)", result_choice, score_cutoff);
-}
--- a/src/py_utils.hpp
+++ b/src/py_utils.hpp
@ -1,21 +1,26 @@
 /* SPDX-License-Identifier: MIT */
 /* Copyright © 2020 Max Bachmann */

+#pragma once
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <vector>
+#include "utils.hpp"
+
+#define PY_FUNC_CAST(func) ((PyCFunction)(void (*)(void))func)
+
+#define PYTHON_VERSION(major, minor, micro) ((major << 24) | (minor << 16) | (micro <<  8))

 /* The cast of the function is necessary since PyCFunction values
 * only take two PyObject* parameters, and these functions take three.
 */
-#define PY_METHOD(x)                                                                               \
-  {                                                                                                \
-#x, (PyCFunction)(void (*)(void))x, METH_VARARGS | METH_KEYWORDS, x##_docstring                \
-  }
+#define PY_METHOD(x) \
+    { #x, PY_FUNC_CAST(x), METH_VARARGS | METH_KEYWORDS, x##_docstring }

-#if PY_MAJOR_VERSION == 2
+#if PY_VERSION_HEX < PYTHON_VERSION(3,0,0)
 #define PYTHON_2
 #include "py2_utils.hpp"
 #else
 #define PYTHON_3
 #include "py3_utils.hpp"
-#endif
+#endif
--- a/src/rapidfuzz-cpp
+++ b/src/rapidfuzz-cpp
@ -1 +1 @@
-Subproject commit aa743d18e39a1b19f83fb745e580ab311487b727
+Subproject commit 0cbbee61bd9a2401e45c96a3d3d6ab640317ccce
--- a/src/rapidfuzz/init.py
+++ b/src/rapidfuzz/init.py
@ -3,6 +3,6 @@ rapid string matching library
 """
 __author__ = "Max Bachmann"
 __license__ = "MIT"
-__version__ = "0.12.5"
+__version__ = "0.13.0"

-from rapidfuzz import process, fuzz, levenshtein, utils
+from rapidfuzz import process, fuzz, utils# levenshtein
--- a/src/rapidfuzz/process.py
+++ b/src/rapidfuzz/process.py
@ -3,6 +3,7 @@
 # Copyright © 2011 Adam Cohen

 from rapidfuzz import fuzz, utils
+from rapidfuzz.cpp_impl import extractOne
 import heapq
 import numbers

@ -117,86 +118,3 @@ def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.defau

 def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
    return extract(query, choices, scorer, processor, limit, score_cutoff)
-
-
-def extractOne(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
-    """
-    Find the best match in a list of choices
-
-    Args: 
-        query (str): string we want to find
-        choices (Iterable): list of all strings the query should be compared with or dict with a mapping
-            {<result>: <string to compare>}
-        scorer (Callable): optional callable that is used to calculate the matching score between
-            the query and each choice. WRatio is used by default
-        processor (Callable): optional callable that reformats the strings. utils.default_process
-            is used by default, which lowercases the strings and trims whitespace
-        score_cutoff (float): Optional argument for a score threshold. Matches with
-            a lower score than this number will not be returned. Defaults to 0
-
-    Returns: 
-        Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is
-            no match with a score >= score_cutoff
-        Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match
-            in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will
-            be in the form`(<choice>, <ratio>)` when `choices` is a list of strings
-            or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
-    """
-    if query is None:
-        return None
-
-    a = processor(query) if processor else query
-
-    result_score = None
-    result_choice = ""
-
-    if hasattr(choices, "items"):
-        choice_key = ""
-        for choice, match_choice in choices.items():
-            if match_choice is None:
-                continue
-            b = processor(match_choice) if processor else match_choice
-
-            score = scorer(
-                a, b,
-                processor=None,
-                score_cutoff=score_cutoff)
-
-            if score >= score_cutoff:
-                # very small increment for the score_cutoff, so when multiple
-                # elements have the same score the first one is used
-                # only done when the score is a number
-                if isinstance(score, numbers.Number):
-                    score_cutoff = score + 0.00001
-                    if score_cutoff > 100:
-                        return (match_choice, score, choice)
-                else:
-                    score_cutoff = score
-
-                result_score = score
-                result_choice = match_choice
-                choice_key = choice
-        return (result_choice, result_score, choice_key) if not result_score is None else None
-    
-    for choice in choices:
-        if choice is None:
-            continue
-        b = processor(choice) if processor else choice
-
-        score = scorer(
-            a, b,
-            processor=None,
-            score_cutoff=score_cutoff)
-
-        if score >= score_cutoff:
-            if isinstance(score, numbers.Number):
-                score_cutoff = score + 0.00001
-                if score_cutoff > 100:
-                    return (choice, score)
-            else:
-                score_cutoff = score
-
-            result_score = score
-            result_choice = choice
-
-    return (result_choice, result_score) if not result_score is None else None
--- a/tests/test_fuzz.py
+++ b/tests/test_fuzz.py
@ -5,6 +5,19 @@ import unittest

 from rapidfuzz import process, fuzz, utils

+scorers = [
+    fuzz.ratio,
+    fuzz.partial_ratio,
+    fuzz.token_sort_ratio,
+    fuzz.token_set_ratio,
+    fuzz.token_ratio,
+    fuzz.partial_token_sort_ratio,
+    fuzz.partial_token_set_ratio,
+    fuzz.partial_token_ratio,
+    fuzz.WRatio,
+    fuzz.QRatio
+]
+
 class RatioTest(unittest.TestCase):
    def setUp(self):
        self.s1 = "new york mets"
@ -87,5 +100,27 @@ class RatioTest(unittest.TestCase):
        score = fuzz.QRatio(s1, s2)
        self.assertEqual(0, score)

+    def testWithProcessor(self):
+        """
+        Any scorer should accept any type as s1 and s2, as long as it is a string
+        after preprocessing.
+        """
+        s1 = ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"]
+        s2 = ["chicago cubs vs new york mets", "CitiFields", "2012-05-11", "9pm"]
+
+        for scorer in scorers:
+            score = scorer(s1, s2, processor=lambda event: event[0])
+            self.assertEqual(score, 100)
+
+    def testHelp(self):
+        """
+        test that all help texts can be printed without throwing an exception,
+        since they are implemented in C++ as well
+        """
+
+        for scorer in scorers:
+            help(scorer)
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/tests/test_hypothesis.py
+++ b/tests/test_hypothesis.py
@ -0,0 +1,138 @@
+from itertools import product
+from functools import partial
+from string import ascii_letters, digits, punctuation
+
+from hypothesis import given, assume, settings
+import hypothesis.strategies as st
+import pytest
+
+from rapidfuzz import fuzz, process, utils
+import random
+
+
+HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
+
+SCORERS = [
+    fuzz.ratio,
+    fuzz.partial_ratio,
+    fuzz.token_set_ratio,
+    fuzz.token_sort_ratio,
+    fuzz.token_ratio,
+    fuzz.partial_token_set_ratio,
+    fuzz.partial_token_sort_ratio,
+    fuzz.partial_token_ratio,
+    fuzz.WRatio,
+    fuzz.QRatio
+]
+
+FULL_SCORERS = [
+    fuzz.ratio,
+    fuzz.WRatio,
+    fuzz.QRatio
+]
+
+PROCESSORS = [
+    lambda x: x,
+    utils.default_process
+]
+
+@given(sentence=st.text())
+@settings(max_examples=200)
+def test_multiple_processor_runs(sentence):
+    """
+    Test that running a preprocessor on a sentence
+    a second time does not change the result
+    """
+    assert utils.default_process(sentence) \
+        == utils.default_process(utils.default_process(sentence))
+
+'''
+
+def full_scorers_processors():
+    """
+    Generate a list of (scorer, processor) pairs for testing for scorers that use the full string only
+    :return: [(scorer, processor), ...]
+    """
+    scorers = [fuzz.ratio]
+    processors = [lambda x: x,
+                  partial(utils.full_process, force_ascii=False),
+                  partial(utils.full_process, force_ascii=True)]
+    splist = list(product(scorers, processors))
+    splist.extend(
+        [(fuzz.WRatio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.QRatio, partial(utils.full_process, force_ascii=True)),
+         (fuzz.UWRatio, partial(utils.full_process, force_ascii=False)),
+         (fuzz.UQRatio, partial(utils.full_process, force_ascii=False))]
+    )
+
+    return splist
+
+
+@pytest.mark.parametrize('scorer,processor',
+                         scorers_processors())
+@given(data=st.data())
+@settings(max_examples=20, deadline=5000)
+def test_identical_strings_extracted(scorer, processor, data):
+    """
+    Test that identical strings will always return a perfect match.
+    :param scorer:
+    :param processor:
+    :param data:
+    :return:
+    """
+    # Draw a list of random strings
+    strings = data.draw(
+        st.lists(
+            st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET),
+            min_size=1,
+            max_size=10
+        )
+    )
+    # Draw a random integer for the index in that list
+    choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
+
+    # Extract our choice from the list
+    choice = strings[choiceidx]
+
+    # Check process doesn't make our choice the empty string
+    assume(processor(choice) != '')
+
+    # Extract all perfect matches
+    result = process.extractBests(choice,
+                                  strings,
+                                  scorer=scorer,
+                                  processor=processor,
+                                  score_cutoff=100,
+                                  limit=None)
+
+    # Check we get a result
+    assert result != []
+
+    # Check the original is in the list
+    assert (choice, 100) in result
+'''
+
+@pytest.mark.parametrize('scorer,processor', list(product(FULL_SCORERS, PROCESSORS)))
+@given(choices=st.lists(st.text(), min_size=1))
+@settings(max_examples=20, deadline=5000)
+def test_only_identical_strings_extracted(scorer, processor, choices):
+    """
+    Test that only identical (post processing) strings score 100 on the test.
+    If two strings are not identical then using full comparison methods they should
+    not be a perfect (100) match.
+    :param scorer:
+    :param processor:
+    :param choices:
+    :return:
+    """
+    query = random.choice(choices)
+    assume(processor(query) != '')
+
+    matches = process.extract(query, choices,
+        scorer=scorer, processor=processor,
+        score_cutoff=100, limit=None)
+
+    assert matches != []
+
+    for match in matches:
+        assert processor(query) == processor(match[0])
--- a/tests/test_process.py
+++ b/tests/test_process.py
@ -14,6 +14,20 @@ class ProcessTest(unittest.TestCase):
            "braves vs mets",
        ]

+    def testExtractOneExceptions(self):
+        self.assertRaises(TypeError, process.extractOne)
+        self.assertRaises(TypeError, process.extractOne, 1)
+        self.assertRaises(TypeError, process.extractOne, 1, [])
+        self.assertRaises(TypeError, process.extractOne, '', [1])
+        self.assertRaises(TypeError, process.extractOne, '', {1:1})
+
+    def testExtractExceptions(self):
+        self.assertRaises(TypeError, process.extract)
+        self.assertRaises(TypeError, process.extract, 1)
+        self.assertRaises(TypeError, process.extract, 1, [])
+        self.assertRaises(TypeError, process.extract, '', [1])
+        self.assertRaises(TypeError, process.extract, '', {1:1})
+
    def testGetBestChoice1(self):
        query = "new york mets at atlanta braves"
        best = process.extractOne(query, self.baseball_strings)
@ -35,12 +49,16 @@ class ProcessTest(unittest.TestCase):
        self.assertEqual(best[0], self.baseball_strings[0])

    def testWithProcessor(self):
+        """
+        extractOne should accept any type as long as it is a string
+        after preprocessing
+        """
        events = [
            ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"],
            ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"],
            ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"],
        ]
-        query = "new york mets vs chicago cubs"
+        query = events[0]

        best = process.extractOne(query, events, processor=lambda event: event[0])
        self.assertEqual(best[0], events[0])
 @ -1 +1 @@
 .12.5
 .13.0