From 3f61f5eb546c90aa483afbab8d91508f3d887acb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 23 Nov 2020 10:26:47 +0100 Subject: [PATCH] Use int8_t instead of char in Matcher (#6413) * Use signed char instead of char in Matcher Remove unused char* utf8_t typedef * Use int8_t instead of signed char --- spacy/matcher/matcher.pyx | 20 ++++++++++---------- spacy/typedefs.pxd | 1 - spacy/vocab.pxd | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index bc615b07c..644f7704b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -3,7 +3,7 @@ from __future__ import unicode_literals from libcpp.vector cimport vector -from libc.stdint cimport int32_t +from libc.stdint cimport int32_t, int8_t from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 @@ -279,7 +279,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e # avoid any processing or mem alloc if the document is empty return output if len(predicates) > 0: - predicate_cache = mem.alloc(length * len(predicates), sizeof(char)) + predicate_cache = mem.alloc(length * len(predicates), sizeof(int8_t)) if extensions is not None and len(extensions) >= 1: nr_extra_attr = max(extensions.values()) + 1 extra_attr_values = mem.alloc(length * nr_extra_attr, sizeof(attr_t)) @@ -320,7 +320,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, - char* cached_py_predicates, + int8_t* cached_py_predicates, Token token, const attr_t* extra_attrs, py_predicates) except *: cdef int q = 0 cdef vector[PatternStateC] new_states @@ -392,7 +392,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match states.push_back(new_states[i]) -cdef int update_predicate_cache(char* cache, +cdef int update_predicate_cache(int8_t* cache, const TokenPatternC* pattern, Token token, predicates) except -1: # If the state references any extra predicates, check whether they match. # These are cached, so that we don't call these potentially expensive @@ -430,7 +430,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, - const char* predicate_matches) nogil: + const int8_t* predicate_matches) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] @@ -488,7 +488,7 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ - cdef char is_match + cdef int8_t is_match is_match = get_is_match(state, token, extra_attrs, predicate_matches) quantifier = get_quantifier(state) is_final = get_is_final(state) @@ -540,9 +540,9 @@ cdef action_t get_action(PatternStateC state, return RETRY -cdef char get_is_match(PatternStateC state, +cdef int8_t get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs, - const char* predicate_matches) nogil: + const int8_t* predicate_matches) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 @@ -557,7 +557,7 @@ cdef char get_is_match(PatternStateC state, return True -cdef char get_is_final(PatternStateC state) nogil: +cdef int8_t get_is_final(PatternStateC state) nogil: if state.pattern[1].quantifier == FINAL_ID: id_attr = state.pattern[1].attrs[0] if id_attr.attr != ID: @@ -568,7 +568,7 @@ cdef char get_is_final(PatternStateC state) nogil: return 0 -cdef char get_quantifier(PatternStateC state) nogil: +cdef int8_t get_quantifier(PatternStateC state) nogil: return state.pattern.quantifier diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index bd5b38958..b04781107 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -3,7 +3,6 @@ from libc.stdint cimport uint8_t ctypedef uint64_t hash_t -ctypedef char* utf8_t ctypedef uint64_t attr_t ctypedef uint64_t flags_t ctypedef uint16_t len_t diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 73754eb02..f394c6d4e 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -5,7 +5,7 @@ from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 from .structs cimport LexemeC, TokenC -from .typedefs cimport utf8_t, attr_t, hash_t +from .typedefs cimport attr_t, hash_t from .strings cimport StringStore from .morphology cimport Morphology