properly link to subprojects

This commit is contained in:
Max Bachmann 2022-01-17 16:53:11 +01:00
parent 143b394566
commit dd26483b5f
10 changed files with 6494 additions and 74 deletions

View File

@ -11,11 +11,16 @@ project(rapidfuzz LANGUAGES C CXX)
find_package(NumPy REQUIRED)
find_package(PythonExtensions REQUIRED)
find_package(Threads REQUIRED)
find_package(Python COMPONENTS Interpreter Development)
find_package(Cython REQUIRED)
set(RAPIDFUZZ_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(RF_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(TF_BUILD_CUDA OFF CACHE BOOL "Enables build of CUDA code")
set(TF_BUILD_TESTS OFF CACHE BOOL "Enables build of tests")
set(TF_BUILD_EXAMPLES OFF CACHE BOOL "Enables build of examples")
add_subdirectory(extern/taskflow)
add_subdirectory(extern/rapidfuzz-cpp)
add_subdirectory(src/cython)
add_subdirectory(src/cython/distance)

View File

@ -8,6 +8,7 @@ include src/rapidfuzz/py.typed
recursive-include src/cython CMakeLists.txt
recursive-include src/cython *.hpp
recursive-include src/cython *.cpp
recursive-include src/cython *.pyx
recursive-include src/cython *.pxd

@ -1 +1 @@
Subproject commit 66c100ca849974efcd1e58dcefd197e1c95db20e
Subproject commit cdedc00c0ab3d6b9c8a258dc208448d2eb3f0009

View File

@ -32,5 +32,5 @@ setup(
include_package_data=True,
python_requires=">=3.6",
cmake_args=[f'-DRAPIDFUZZ_CAPI_PATH:STRING={rapidfuzz_capi.get_include()}']
cmake_args=[f'-DRF_CAPI_PATH:STRING={rapidfuzz_capi.get_include()}']
)

View File

@ -1,37 +1,27 @@
# should use target_include_directories once this is supported by scikit-build
include_directories(${RAPIDFUZZ_BASE_DIR}/src/cython)
include_directories(${RF_BASE_DIR}/src/cython)
add_cython_target(cpp_utils CXX)
add_library(cpp_utils MODULE
${cpp_utils}
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/rapidfuzz/details/unicode.cpp)
add_library(cpp_utils MODULE ${cpp_utils} ${RF_BASE_DIR}/src/cython/utils.cpp)
target_compile_features(cpp_utils PUBLIC cxx_std_14)
target_include_directories(cpp_utils PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython)
target_include_directories(cpp_utils PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython)
target_link_libraries(cpp_utils rapidfuzz::rapidfuzz)
python_extension_module(cpp_utils)
install(TARGETS cpp_utils LIBRARY DESTINATION src/rapidfuzz)
add_cython_target(cpp_fuzz CXX)
add_library(cpp_fuzz MODULE
${cpp_fuzz}
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/rapidfuzz/details/unicode.cpp)
add_library(cpp_fuzz MODULE ${cpp_fuzz})
target_compile_features(cpp_fuzz PUBLIC cxx_std_14)
target_include_directories(cpp_fuzz PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython)
target_include_directories(cpp_fuzz PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython)
target_link_libraries(cpp_fuzz rapidfuzz::rapidfuzz)
python_extension_module(cpp_fuzz)
install(TARGETS cpp_fuzz LIBRARY DESTINATION src/rapidfuzz)
add_cython_target(cpp_string_metric CXX)
add_library(cpp_string_metric MODULE ${cpp_string_metric})
target_compile_features(cpp_string_metric PUBLIC cxx_std_14)
target_include_directories(cpp_string_metric PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython)
target_include_directories(cpp_string_metric PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython)
target_link_libraries(cpp_string_metric rapidfuzz::rapidfuzz)
python_extension_module(cpp_string_metric)
install(TARGETS cpp_string_metric LIBRARY DESTINATION src/rapidfuzz)
@ -39,10 +29,9 @@ add_cython_target(cpp_process CXX)
add_library(cpp_process MODULE ${cpp_process})
target_compile_features(cpp_process PUBLIC cxx_std_14)
target_include_directories(cpp_process PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_BASE_DIR}/extern/optional-lite/include
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython)
${RF_BASE_DIR}/extern/optional-lite/include
${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython)
target_link_libraries(cpp_process rapidfuzz::rapidfuzz)
python_extension_module(cpp_process)
install(TARGETS cpp_process LIBRARY DESTINATION src/rapidfuzz)
@ -50,14 +39,8 @@ add_cython_target(cpp_process_cdist CXX)
add_library(cpp_process_cdist MODULE ${cpp_process_cdist})
target_compile_features(cpp_process_cdist PUBLIC cxx_std_14)
target_include_directories(cpp_process_cdist PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_BASE_DIR}/extern/optional-lite/include
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/extern/taskflow/
${NumPy_INCLUDE_DIR})
# this fails with All uses of target_link_libraries with a target must be either all-keyword or all-plain.
# target_link_libraries(cpp_process_cdist PRIVATE Threads::Threads)
target_link_libraries(cpp_process_cdist Threads::Threads)
${RF_BASE_DIR}/extern/optional-lite/include
${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${NumPy_INCLUDE_DIR})
target_link_libraries(cpp_process_cdist Taskflow rapidfuzz::rapidfuzz)
python_extension_module(cpp_process_cdist)
install(TARGETS cpp_process_cdist LIBRARY DESTINATION src/rapidfuzz)

View File

@ -1,8 +1,6 @@
#pragma once
#include "cpp_common.hpp"
#include <rapidfuzz/utils.hpp>
namespace utils = rapidfuzz::utils;
#include "utils.hpp"
PyObject* default_process_impl(PyObject* sentence) {
RF_String c_sentence = convert_string(sentence);
@ -10,19 +8,19 @@ PyObject* default_process_impl(PyObject* sentence) {
switch (c_sentence.kind) {
case RF_UINT8:
{
auto proc_str = utils::default_process(
auto proc_str = default_process(
rapidfuzz::basic_string_view<uint8_t>(static_cast<uint8_t*>(c_sentence.data), c_sentence.length));
return PyUnicode_FromKindAndData(PyUnicode_1BYTE_KIND, proc_str.data(), (Py_ssize_t)proc_str.size());
}
case RF_UINT16:
{
auto proc_str = utils::default_process(
auto proc_str = default_process(
rapidfuzz::basic_string_view<uint16_t>(static_cast<uint16_t*>(c_sentence.data), c_sentence.length));
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, proc_str.data(), (Py_ssize_t)proc_str.size());
}
case RF_UINT32:
{
auto proc_str = utils::default_process(
auto proc_str = default_process(
rapidfuzz::basic_string_view<uint32_t>(static_cast<uint32_t*>(c_sentence.data), c_sentence.length));
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, proc_str.data(), (Py_ssize_t)proc_str.size());
}
@ -50,7 +48,7 @@ RF_String default_process_func_impl(RF_String sentence) {
sentence.dtor = default_string_deinit;
sentence.data = str;
sentence.kind = sentence.kind;
sentence.length = utils::default_process(str, sentence.length);
sentence.length = default_process(str, sentence.length);
return sentence;
}

View File

@ -1,68 +1,50 @@
# should use target_include_directories once this is supported by scikit-build
include_directories(${RAPIDFUZZ_BASE_DIR}/src/cython)
include_directories(${RF_BASE_DIR}/src/cython)
add_cython_target(_initialize CXX)
add_library(_initialize MODULE ${_initialize})
target_compile_features(_initialize PUBLIC cxx_std_14)
target_include_directories(_initialize PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(_initialize PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(_initialize rapidfuzz::rapidfuzz)
python_extension_module(_initialize)
install(TARGETS _initialize LIBRARY DESTINATION src/rapidfuzz/distance)
add_cython_target(Hamming CXX)
add_library(Hamming MODULE ${Hamming})
target_compile_features(Hamming PUBLIC cxx_std_14)
target_include_directories(Hamming PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(Hamming PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(Hamming rapidfuzz::rapidfuzz)
python_extension_module(Hamming)
install(TARGETS Hamming LIBRARY DESTINATION src/rapidfuzz/distance)
add_cython_target(Levenshtein CXX)
add_library(Levenshtein MODULE ${Levenshtein})
target_compile_features(Levenshtein PUBLIC cxx_std_14)
target_include_directories(Levenshtein PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(Levenshtein PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(Levenshtein rapidfuzz::rapidfuzz)
python_extension_module(Levenshtein)
install(TARGETS Levenshtein LIBRARY DESTINATION src/rapidfuzz/distance)
add_cython_target(Indel CXX)
add_library(Indel MODULE ${Indel})
target_compile_features(Indel PUBLIC cxx_std_14)
target_include_directories(Indel PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(Indel PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(Indel rapidfuzz::rapidfuzz)
python_extension_module(Indel)
install(TARGETS Indel LIBRARY DESTINATION src/rapidfuzz/distance)
add_cython_target(Jaro CXX)
add_library(Jaro MODULE ${Jaro})
target_compile_features(Jaro PUBLIC cxx_std_14)
target_include_directories(Jaro PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(Jaro PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(Jaro rapidfuzz::rapidfuzz)
python_extension_module(Jaro)
install(TARGETS Jaro LIBRARY DESTINATION src/rapidfuzz/distance)
add_cython_target(JaroWinkler CXX)
add_library(JaroWinkler MODULE ${JaroWinkler})
target_compile_features(JaroWinkler PUBLIC cxx_std_14)
target_include_directories(JaroWinkler PRIVATE
${RAPIDFUZZ_BASE_DIR}/extern/rapidfuzz-cpp/
${RAPIDFUZZ_CAPI_PATH}
${RAPIDFUZZ_BASE_DIR}/src/cython
${RAPIDFUZZ_BASE_DIR}/src/cython/distance)
target_include_directories(JaroWinkler PRIVATE ${RF_CAPI_PATH} ${RF_BASE_DIR}/src/cython ${RF_BASE_DIR}/src/cython/distance)
target_link_libraries(JaroWinkler rapidfuzz::rapidfuzz)
python_extension_module(JaroWinkler)
install(TARGETS JaroWinkler LIBRARY DESTINATION src/rapidfuzz/distance)

6268
src/cython/unicodetype_db.h Normal file

File diff suppressed because it is too large Load Diff

85
src/cython/utils.cpp Normal file
View File

@ -0,0 +1,85 @@
#include "unicode.hpp"
namespace rapidfuzz {
namespace Unicode {

/*
 * Bit flags stored in _PyUnicode_TypeRecord::flags.
 * Only ALPHA, DECIMAL, DIGIT, NUMERIC and EXTENDED_CASE are consulted in this
 * file; the remaining masks are kept so the complete set lives in one place.
 */
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
#define LOWER_MASK 0x08
#define LINEBREAK_MASK 0x10
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
#define NUMERIC_MASK 0x800
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK 0x2000
#define EXTENDED_CASE_MASK 0x4000

/* Returns true when at least one of the alpha, decimal, digit or numeric bits is set. */
constexpr static bool is_alnum(const unsigned short flags) {
    return (flags & (ALPHA_MASK | DECIMAL_MASK | DIGIT_MASK | NUMERIC_MASK)) != 0;
}

typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    const unsigned char decimal;
    const unsigned char digit;
    const unsigned short flags;
} _PyUnicode_TypeRecord;

/* NOTE(review): presumably provides SHIFT, index1, index2, _PyUnicode_TypeRecords and
 * _PyUnicode_ExtendedCase used below; the file is not visible here. It has to be
 * included after the record type above and inside this namespace. */
#include "unicodetype_db.h"

/*
 * Looks up the type record of a code point through the two-level index tables.
 * Code points at or above 0x110000 resolve to record 0.
 */
static inline const _PyUnicode_TypeRecord * gettyperecord(uint32_t code)
{
    unsigned int index;

    if (code >= 0x110000)
        index = 0;
    else
    {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_TypeRecords[index];
}

/*
 * Maps one code point for default processing:
 *  - code points that are not alphanumeric become a space
 *  - alphanumeric code points are lowercased
 *
 * TODO: capital sigma is not handled, see the Python implementation.
 */
uint32_t UnicodeDefaultProcess(uint32_t ch)
{
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    /* non-alphanumeric characters are replaced with whitespace */
    if (!is_alnum(ctype->flags)) {
        return ' ';
    }

    if (ctype->flags & EXTENDED_CASE_MASK) {
        /* The low 16 bits of `lower` are an offset into _PyUnicode_ExtendedCase.
         * The full lowercase mapping would be the `ctype->lower >> 24` entries
         * starting at that offset. For now extended cases are ignored and only
         * the first entry is returned; the only existing one is U+0130 anyway. */
        int index = ctype->lower & 0xFFFF;
        return _PyUnicode_ExtendedCase[index];
    }

    /* otherwise `lower` is a (possibly negative) delta to the code point */
    return ch + static_cast<uint32_t>(ctype->lower);
}

} // namespace Unicode
} // namespace rapidfuzz

/*
 * utils.hpp declares and calls UnicodeDefaultProcess in the global namespace.
 * Without a definition of that exact symbol every module using default_process
 * ends up with an unresolved reference, so forward the global name to the
 * namespaced implementation above.
 */
uint32_t UnicodeDefaultProcess(uint32_t ch)
{
    return rapidfuzz::Unicode::UnicodeDefaultProcess(ch);
}

98
src/cython/utils.hpp Normal file
View File

@ -0,0 +1,98 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020-present Max Bachmann */
#pragma once
#include <cstddef>
#include <cmath>
#include <algorithm>
#include <array>
#include <cctype>
#include <cwctype>
#include <limits>
#include <rapidfuzz/details/common.hpp>
/*
 * Per-character mapping used by default_process for values of 256 and above.
 * Only declared here: every binary that uses default_process has to link a
 * definition of this global-namespace function.
 */
uint32_t UnicodeDefaultProcess(uint32_t ch);

/**
 * @brief Normalizes a character buffer in place.
 *
 * Every character is mapped as follows:
 *  - values in [0, 256) go through a lookup table that lowercases
 *    alphanumeric characters and turns everything else into a space
 *  - larger values that still fit into uint32_t are passed through
 *    UnicodeDefaultProcess
 *  - negative values and values above the uint32_t range are left unchanged
 *
 * Afterwards spaces are stripped from both ends and the remaining characters
 * are moved to the beginning of the buffer.
 *
 * @tparam CharT char type of the buffer
 *
 * @param str buffer to process, modified in place
 * @param len number of characters in str
 *
 * @return number of characters that remain at the start of str
 */
template <typename CharT>
size_t default_process(CharT* str, size_t len)
{
    /* lookup table converting
     * - non alphanumeric characters to whitespace (32)
     * - alphanumeric characters to lowercase
     *
     * generated using
     * `[ord(chr(x).lower()) if chr(x).isalnum() else 0x20 for x in range(256)]`
     * in Python3.9
     */
    static const int extended_ascii_mapping[256] = {
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  32,  32,  32,  32,  32,  32,  32,  97,  98,  99,  100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
        122, 32,  32,  32,  32,  32,  32,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,
        32,  32,  32,  32,  32,  32,  32,  32,  170, 32,  32,  32,  32,  32,  32,  32,  178, 179,
        32,  181, 32,  32,  32,  185, 186, 32,  188, 189, 190, 32,  224, 225, 226, 227, 228, 229,
        230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 32,
        248, 249, 250, 251, 252, 253, 254, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
        234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 32,  248, 249, 250, 251,
        252, 253, 254, 255};

    for (size_t pos = 0; pos < len; ++pos) {
        const CharT ch = str[pos];

        /* the range checks that cannot trigger for a given char type are
         * removed at compile time by any decent compiler */
        if (ch < 0 || ch > std::numeric_limits<uint32_t>::max()) {
            continue; /* outside the supported range: keep the character as is */
        }

        if (ch < 256) {
            str[pos] = static_cast<CharT>(extended_ascii_mapping[ch]);
        }
        else {
            str[pos] = static_cast<CharT>(UnicodeDefaultProcess(static_cast<uint32_t>(ch)));
        }
    }

    /* drop trailing spaces */
    size_t last = len;
    while (last > 0 && str[last - 1] == ' ') {
        --last;
    }

    /* skip leading spaces */
    size_t first = 0;
    while (first < last && str[first] == ' ') {
        ++first;
    }

    /* shift the kept characters to the front of the buffer */
    if (first != 0) {
        std::copy(str + first, str + last, str);
    }

    return last - first;
}
/**
 * @brief Copying variant of default_process.
 *
 * Copies the view into an owning string, runs the in-place default_process
 * on that copy and shrinks the copy to the length it reports.
 *
 * @tparam CharT char type of the string
 *
 * @param s string to process, left untouched
 *
 * @return the processed copy of s
 */
template <typename CharT>
std::basic_string<CharT> default_process(rapidfuzz::basic_string_view<CharT> s)
{
    std::basic_string<CharT> processed(s);

    const size_t kept = default_process(&processed[0], processed.size());
    processed.resize(kept);

    return processed;
}