Move more of special case retokenize to cdef nogil

Move as much of the special case retokenization to nogil as possible.
Adriane Boyd 2019-09-27 09:26:20 +02:00
parent 72c2f98dc9
commit 0b7e52c797
2 changed files with 57 additions and 38 deletions
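The change follows a common Cython pattern: do the Python-level work (dict lookups, string hashing) while the GIL is held, pack the results into plain C data, and hand that data off to a cdef nogil helper that touches no Python objects. A minimal sketch of the pattern, with illustrative names that are not from spaCy:

from cymem.cymem cimport Pool

cdef int _sum_lengths(const int* lengths, int n) nogil:
    # Pure C arithmetic: no Python objects, so the GIL is not needed.
    cdef int i, total = 0
    for i in range(n):
        total += lengths[i]
    return total

def total_length(words):
    # With the GIL held: copy Python data into a C array the helper can consume.
    cdef Pool mem = Pool()
    cdef int n = len(words)
    cdef int* lengths = <int*>mem.alloc(n, sizeof(int))
    for i, word in enumerate(words):
        lengths[i] = len(word)
    cdef int total
    with nogil:
        total = _sum_lengths(lengths, n)
    return total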

spacy/tokenizer.pxd

@@ -3,6 +3,8 @@ from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
+from libcpp.queue cimport queue
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
@@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 
 
+cdef struct SpecialSpanStruct:
+    int start
+    int end
+    int span_length_diff
+    _Cached* cached
+
+
 cdef class Tokenizer:
     cdef Pool mem
     cdef PreshMap _cache
@@ -30,6 +39,8 @@ cdef class Tokenizer:
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc)
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
+                                       queue[SpecialSpanStruct] span_queue) nogil
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _try_specials(self, hash_t key, Doc tokens,
                            int* has_special) except -1
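
The new SpecialSpanStruct exists so that per-span bookkeeping can cross the nogil boundary: a struct of plain ints plus a _Cached* pointer holds no Python objects, so instances can travel through a C++ std::queue and be consumed without the GIL. A minimal sketch of that mechanism, assuming a C++ build; the struct and function names here are illustrative stand-ins, not spaCy's:

# distutils: language = c++
from libcpp.queue cimport queue

cdef struct SpanInfo:    # stand-in for SpecialSpanStruct
    int start
    int end

cdef int _total_covered(queue[SpanInfo]& spans) nogil:
    # empty(), front() and pop() are plain C++ calls, legal without the GIL.
    cdef int covered = 0
    cdef SpanInfo s
    while not spans.empty():
        s = spans.front()
        spans.pop()
        covered += s.end - s.start
    return covered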

spacy/tokenizer.pyx

@@ -240,9 +240,6 @@ cdef class Tokenizer:
         cdef int offset = 0
         cdef int span_length_diff = 0
         cdef bint modify_in_place = True
-        cdef int idx_offset = 0
-        cdef int orig_final_spacy
-        cdef int orig_idx
         cdef Pool mem = Pool()
         cdef vector[MatchStruct] c_matches
         self._special_matcher.find_matches(doc, &c_matches)
@@ -254,52 +251,34 @@ cdef class Tokenizer:
         spans = [doc[match.start:match.end] for match in c_filtered]
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
-        span_data = {}
+        cdef SpecialSpanStruct sd
+        cdef queue[SpecialSpanStruct] span_queue
         for span in spans:
             rule = self._rules.get(span.text, None)
             span_length_diff = 0
+            # Check for rule to differentiate cases like "' '" vs. "''"
             if rule:
                 span_length_diff = len(rule) - (span.end - span.start)
             if span_length_diff > 0:
                 modify_in_place = False
             curr_length += span_length_diff
             if curr_length > max_length:
                 max_length = curr_length
-            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
-        # Modify tokenization according to filtered special cases
+            cached = <_Cached*>self._specials.get(hash_string(span.text))
+            if cached != NULL:
+                sd.start = span.start
+                sd.end = span.end
+                sd.span_length_diff = span_length_diff
+                sd.cached = cached
+                span_queue.push(sd)
         # If modifications never increase doc length, can modify in place
         if modify_in_place:
             tokens = doc.c
         # Otherwise create a separate array to store modified tokens
         else:
             tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
-        i = 0
-        while i < doc.length:
-            if not i in span_data:
-                tokens[i + offset] = doc.c[i]
-                i += 1
-            else:
-                span = span_data[i]
-                cached = <_Cached*>self._specials.get(hash_string(span[0]))
-                if cached == NULL:
-                    # Copy original tokens if no rule found
-                    for j in range(span[2] - span[1]):
-                        tokens[i + offset + j] = doc.c[i + j]
-                    i += span[2] - span[1]
-                else:
-                    # Copy special case tokens into doc and adjust token and
-                    # character offsets
-                    idx_offset = 0
-                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-                    orig_idx = doc.c[i].idx
-                    for j in range(cached.length):
-                        tokens[i + offset + j] = cached.data.tokens[j]
-                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                                1 if cached.data.tokens[j].spacy else 0
-                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
-                    i += span[2] - span[1]
-                offset += span[3]
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_cases(doc, tokens, span_queue)
         # Allocate more memory for doc if needed
         while doc.length < doc.length + offset:
             doc._realloc(doc.length * 2)
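
A concrete reading of the span_length_diff bookkeeping above, using a hypothetical rule: a special case that rewrites one matched token as two sub-tokens lengthens the document, which is exactly what forces the copy into separately allocated memory.

rule = ["do", "nt"]            # hypothetical special case for "dont"
span_start, span_end = 5, 6    # the match covers a single original token
span_length_diff = len(rule) - (span_end - span_start)    # 2 - 1 == 1
# A positive diff means the doc grows, so modify_in_place is cleared and
# tokens are written into a freshly allocated array instead of doc.c.
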
@@ -312,6 +291,35 @@ cdef class Tokenizer:
             doc.length = doc.length + offset
         return True
 
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef SpecialSpanStruct sd
+        while i < doc.length:
+            sd = span_queue.front()
+            if span_queue.empty() or i < sd.start:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            elif i == sd.start:
+                span_queue.pop()
+                # Copy special case tokens into doc and adjust token and
+                # character offsets
+                idx_offset = 0
+                orig_final_spacy = doc.c[sd.end + offset - 1].spacy
+                orig_idx = doc.c[i].idx
+                for j in range(sd.cached.length):
+                    tokens[i + offset + j] = sd.cached.data.tokens[j]
+                    tokens[i + offset + j].idx = orig_idx + idx_offset
+                    idx_offset += sd.cached.data.tokens[j].lex.length + \
+                            1 if sd.cached.data.tokens[j].spacy else 0
+                tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
+                i += sd.end - sd.start
+                offset += sd.span_length_diff
+        return offset
 
     cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
         cdef int seen_i
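
The character-offset arithmetic in the copy loop of _retokenize_special_cases, restated as a self-contained sketch of the intended bookkeeping (the token data is illustrative): each replacement token starts where the previous one ended, counting one extra character when the previous token carries a trailing space.

def recompute_idx(orig_idx, pieces):
    # pieces: (length, trailing_space) pairs for the replacement tokens
    idx_offset = 0
    idxs = []
    for length, spacy in pieces:
        idxs.append(orig_idx + idx_offset)
        idx_offset += length + (1 if spacy else 0)
    return idxs

# A token at character 10 rewritten as do|nt: idx values are [10, 12]
assert recompute_idx(10, [(2, False), (2, True)]) == [10, 12]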