Move more of special case retokenize to cdef nogil

Move as much of the special case retokenization to nogil as possible.
This commit is contained in:
Adriane Boyd 2019-09-27 09:26:20 +02:00
parent 72c2f98dc9
commit 0b7e52c797
2 changed files with 57 additions and 38 deletions

View File

@ -3,6 +3,8 @@ from libcpp.vector cimport vector
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libcpp.queue cimport queue
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC from .structs cimport LexemeC, TokenC
from .strings cimport StringStore from .strings cimport StringStore
@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
# Plain C record describing one matched special-case span, so the
# retokenization loop can consume it without touching Python objects.
cdef struct SpecialSpanStruct:
# Index of the first token of the matched span (taken from span.start).
int start
# Index one past the last token of the matched span (taken from span.end).
int end
# Number of tokens in the special-case rule minus the number of tokens in
# the matched span; 0 when no rule was found for the span text.
int span_length_diff
# Cached special-case entry looked up by the hash of the span text; the
# caller only queues spans for which this pointer is not NULL.
_Cached* cached
cdef class Tokenizer: cdef class Tokenizer:
cdef Pool mem cdef Pool mem
cdef PreshMap _cache cdef PreshMap _cache
@ -30,6 +39,8 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) cdef int _apply_special_cases(self, Doc doc)
cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
queue[SpecialSpanStruct] span_queue) nogil
cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _try_cache(self, hash_t key, Doc tokens) except -1
cdef int _try_specials(self, hash_t key, Doc tokens, cdef int _try_specials(self, hash_t key, Doc tokens,
int* has_special) except -1 int* has_special) except -1

View File

@ -240,9 +240,6 @@ cdef class Tokenizer:
cdef int offset = 0 cdef int offset = 0
cdef int span_length_diff = 0 cdef int span_length_diff = 0
cdef bint modify_in_place = True cdef bint modify_in_place = True
cdef int idx_offset = 0
cdef int orig_final_spacy
cdef int orig_idx
cdef Pool mem = Pool() cdef Pool mem = Pool()
cdef vector[MatchStruct] c_matches cdef vector[MatchStruct] c_matches
self._special_matcher.find_matches(doc, &c_matches) self._special_matcher.find_matches(doc, &c_matches)
@ -254,10 +251,12 @@ cdef class Tokenizer:
spans = [doc[match.start:match.end] for match in c_filtered] spans = [doc[match.start:match.end] for match in c_filtered]
# Put span info in span.start-indexed dict and calculate maximum # Put span info in span.start-indexed dict and calculate maximum
# intermediate document size # intermediate document size
span_data = {} cdef SpecialSpanStruct sd
cdef queue[SpecialSpanStruct] span_queue
for span in spans: for span in spans:
rule = self._rules.get(span.text, None) rule = self._rules.get(span.text, None)
span_length_diff = 0 span_length_diff = 0
# Check for rule to differentiate cases like "' '" vs. "''"
if rule: if rule:
span_length_diff = len(rule) - (span.end - span.start) span_length_diff = len(rule) - (span.end - span.start)
if span_length_diff > 0: if span_length_diff > 0:
@ -265,41 +264,21 @@ cdef class Tokenizer:
curr_length += span_length_diff curr_length += span_length_diff
if curr_length > max_length: if curr_length > max_length:
max_length = curr_length max_length = curr_length
span_data[span.start] = (span.text, span.start, span.end, span_length_diff) cached = <_Cached*>self._specials.get(hash_string(span.text))
# Modify tokenization according to filtered special cases if cached != NULL:
sd.start = span.start
sd.end = span.end
sd.span_length_diff = span_length_diff
sd.cached = cached
span_queue.push(sd)
# If modifications never increase doc length, can modify in place # If modifications never increase doc length, can modify in place
if modify_in_place: if modify_in_place:
tokens = doc.c tokens = doc.c
# Otherwise create a separate array to store modified tokens # Otherwise create a separate array to store modified tokens
else: else:
tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC)) tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
i = 0 # Modify tokenization according to filtered special cases
while i < doc.length: offset = self._retokenize_special_cases(doc, tokens, span_queue)
if not i in span_data:
tokens[i + offset] = doc.c[i]
i += 1
else:
span = span_data[i]
cached = <_Cached*>self._specials.get(hash_string(span[0]))
if cached == NULL:
# Copy original tokens if no rule found
for j in range(span[2] - span[1]):
tokens[i + offset + j] = doc.c[i + j]
i += span[2] - span[1]
else:
# Copy special case tokens into doc and adjust token and
# character offsets
idx_offset = 0
orig_final_spacy = doc.c[span[2] + offset - 1].spacy
orig_idx = doc.c[i].idx
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
idx_offset += cached.data.tokens[j].lex.length + \
1 if cached.data.tokens[j].spacy else 0
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span[2] - span[1]
offset += span[3]
# Allocate more memory for doc if needed # Allocate more memory for doc if needed
while doc.length < doc.length + offset: while doc.length < doc.length + offset:
doc._realloc(doc.length * 2) doc._realloc(doc.length * 2)
@ -312,6 +291,35 @@ cdef class Tokenizer:
doc.length = doc.length + offset doc.length = doc.length + offset
return True return True
# Rewrite the tokenization of `doc` into `tokens`, replacing every queued
# special-case span with the tokens of its cached rule.
#
# doc:        document whose original tokens (doc.c) are read.
# tokens:     output array; the caller passes either doc.c itself or a
#             separately allocated buffer.
# span_queue: spans to replace, received by value so popping does not affect
#             the caller's queue. Assumed to be ordered by `start` and
#             non-overlapping -- TODO confirm against the span filtering
#             done by the caller.
#
# Returns the accumulated change in token count, i.e. the sum of
# `span_length_diff` over all spans that were applied.
cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
    cdef int i = 0
    cdef int j = 0
    cdef int offset = 0
    cdef int idx_offset = 0
    cdef int orig_final_spacy
    cdef int orig_idx
    cdef SpecialSpanStruct sd
    while i < doc.length:
        # Only inspect the front of the queue once it is known to be
        # non-empty: front() on an empty std::queue is undefined behaviour.
        if span_queue.empty():
            tokens[i + offset] = doc.c[i]
            i += 1
            continue
        sd = span_queue.front()
        if i < sd.start:
            # Not inside a special-case span: copy the token unchanged.
            tokens[i + offset] = doc.c[i]
            i += 1
        elif i > sd.start:
            # Stale span that starts before the current position; drop it
            # so the loop is guaranteed to make progress.
            span_queue.pop()
        else:
            span_queue.pop()
            # Copy special case tokens into doc and adjust token and
            # character offsets
            idx_offset = 0
            # doc.c is indexed by original token positions, so the output
            # `offset` must not be applied when reading from it.
            orig_final_spacy = doc.c[sd.end - 1].spacy
            orig_idx = doc.c[i].idx
            for j in range(sd.cached.length):
                tokens[i + offset + j] = sd.cached.data.tokens[j]
                tokens[i + offset + j].idx = orig_idx + idx_offset
                # Always advance by the token's length, plus one more
                # character when the token is followed by whitespace.
                idx_offset += sd.cached.data.tokens[j].lex.length
                if sd.cached.data.tokens[j].spacy:
                    idx_offset += 1
            # The last replacement token inherits the trailing-space flag of
            # the last original token in the span.
            tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
            i += sd.end - sd.start
            offset += sd.span_length_diff
    return offset
cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil: cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
cdef int seen_i cdef int seen_i