Move more of special case retokenize to cdef nogil

Move as much of the special case retokenization to nogil as possible.
2019-09-27 09:26:20 +02:00 · 2019-09-27 09:26:20 +02:00 · 0b7e52c797
parent 72c2f98dc9
commit 0b7e52c797
2 changed files with 57 additions and 38 deletions
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -3,6 +3,8 @@ from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool

+from libcpp.queue cimport queue
+
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct


+cdef struct SpecialSpanStruct:
+    # One matched special-case span that is queued for retokenization.
+    # Index of the first token of the span in the original doc.
+    int start
+    # Index one past the last token of the span in the original doc
+    # (the span covers `end - start` tokens).
+    int end
+    # Length of the replacement rule minus the length of the original span;
+    # positive when the replacement makes the doc longer.
+    int span_length_diff
+    # Cached special-case entry whose tokens replace the span.
+    _Cached* cached
+
+
 cdef class Tokenizer:
    cdef Pool mem
    cdef PreshMap _cache
@ -30,6 +39,8 @@ cdef class Tokenizer:

    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc)
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
+                                       queue[SpecialSpanStruct] span_queue) nogil
    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
    cdef int _try_specials(self, hash_t key, Doc tokens,
                           int* has_special) except -1
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -240,9 +240,6 @@ cdef class Tokenizer:
        cdef int offset = 0
        cdef int span_length_diff = 0
        cdef bint modify_in_place = True
-        cdef int idx_offset = 0
-        cdef int orig_final_spacy
-        cdef int orig_idx
        cdef Pool mem = Pool()
        cdef vector[MatchStruct] c_matches
        self._special_matcher.find_matches(doc, &c_matches)
@ -254,10 +251,12 @@ cdef class Tokenizer:
        spans = [doc[match.start:match.end] for match in c_filtered]
        # Put span info in span.start-indexed dict and calculate maximum
        # intermediate document size
-        span_data = {}
+        cdef SpecialSpanStruct sd
+        cdef queue[SpecialSpanStruct] span_queue
        for span in spans:
            rule = self._rules.get(span.text, None)
            span_length_diff = 0
+            # Check for rule to differentiate cases like "' '" vs. "''"
            if rule:
                span_length_diff = len(rule) - (span.end - span.start)
                if span_length_diff > 0:
@ -265,41 +264,21 @@ cdef class Tokenizer:
                curr_length += span_length_diff
                if curr_length > max_length:
                    max_length = curr_length
-            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
-        # Modify tokenization according to filtered special cases
+                cached = <_Cached*>self._specials.get(hash_string(span.text))
+                if cached != NULL:
+                    sd.start = span.start
+                    sd.end = span.end
+                    sd.span_length_diff = span_length_diff
+                    sd.cached = cached
+                    span_queue.push(sd)
        # If modifications never increase doc length, can modify in place
        if modify_in_place:
            tokens = doc.c
        # Otherwise create a separate array to store modified tokens
        else:
            tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
-        i = 0
-        while i < doc.length:
-            if not i in span_data:
-                tokens[i + offset] = doc.c[i]
-                i += 1
-            else:
-                span = span_data[i]
-                cached = <_Cached*>self._specials.get(hash_string(span[0]))
-                if cached == NULL:
-                    # Copy original tokens if no rule found
-                    for j in range(span[2] - span[1]):
-                        tokens[i + offset + j] = doc.c[i + j]
-                    i += span[2] - span[1]
-                else:
-                    # Copy special case tokens into doc and adjust token and
-                    # character offsets
-                    idx_offset = 0
-                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-                    orig_idx = doc.c[i].idx
-                    for j in range(cached.length):
-                        tokens[i + offset + j] = cached.data.tokens[j]
-                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                                1 if cached.data.tokens[j].spacy else 0
-                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
-                    i += span[2] - span[1]
-                    offset += span[3]
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_cases(doc, tokens, span_queue)
        # Allocate more memory for doc if needed
        while doc.length < doc.length + offset:
            doc._realloc(doc.length * 2)
@ -312,6 +291,35 @@ cdef class Tokenizer:
        doc.length = doc.length + offset
        return True

+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
+        # Copy the tokens of `doc` into `tokens`, replacing every span listed
+        # in `span_queue` with the tokens of its cached special case.
+        #
+        # doc: the document being retokenized; only doc.c and doc.length are
+        #     read here.
+        # tokens: the output token array (may be doc.c itself when no
+        #     replacement makes the doc longer).
+        # span_queue: spans to replace, expected in ascending order of
+        #     `start` and without overlaps. Passed by value, so popping here
+        #     does not affect the caller's queue.
+        # RETURNS: the net change in the number of tokens, i.e. the sum of
+        #     `span_length_diff` over all spans that were applied.
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        cdef SpecialSpanStruct sd
+        while i < doc.length:
+            # Only look at the front of the queue once it is known to be
+            # non-empty: front() on an empty std::queue is undefined behaviour.
+            if span_queue.empty():
+                tokens[i + offset] = doc.c[i]
+                i += 1
+                continue
+            sd = span_queue.front()
+            if i < sd.start:
+                # Not inside a special-case span: copy the token unchanged
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            elif i > sd.start:
+                # The span starts before the current position (e.g. it
+                # overlaps a span that was already applied) and can never
+                # match. Discard it instead of looping without progress.
+                span_queue.pop()
+            else:
+                span_queue.pop()
+                # Copy special case tokens into doc and adjust token and
+                # character offsets
+                idx_offset = 0
+                # sd.end indexes the original doc, so the output offset must
+                # not be applied when reading the span's last token
+                orig_final_spacy = doc.c[sd.end - 1].spacy
+                orig_idx = doc.c[i].idx
+                for j in range(sd.cached.length):
+                    tokens[i + offset + j] = sd.cached.data.tokens[j]
+                    tokens[i + offset + j].idx = orig_idx + idx_offset
+                    # Always advance by the token length; add one more
+                    # character only when the token has trailing whitespace
+                    idx_offset += sd.cached.data.tokens[j].lex.length
+                    if sd.cached.data.tokens[j].spacy:
+                        idx_offset += 1
+                # The last replacement token inherits the trailing-space flag
+                # of the last original token in the span
+                tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
+                i += sd.end - sd.start
+                offset += sd.span_length_diff
+        return offset
+
    cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:

        cdef int seen_i