mirror of https://github.com/explosion/spaCy.git
Move more of special case retokenize to cdef nogil
Move as much of the special case retokenization to nogil as possible.
parent 72c2f98dc9
commit 0b7e52c797
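The idea behind the change: special-case retokenization only touches C-level data (a struct, raw token arrays, a C++ queue), so the loop can live in a `cdef ... nogil` function. As a minimal, self-contained sketch of that pattern — hypothetical names (`ToySpan`, `total_span_length`), not spaCy's API:

# distutils: language = c++
# cython: language_level=3
# Sketch: because only C data (a struct and a C++ queue) is touched,
# the function below can be declared nogil.
from libcpp.queue cimport queue

cdef struct ToySpan:
    int start
    int end

cdef int total_span_length(queue[ToySpan] q) nogil:
    # Drain the queue and sum span lengths; no Python objects involved,
    # so the GIL is not needed at any point.
    cdef int total = 0
    cdef ToySpan s
    while not q.empty():
        s = q.front()
        q.pop()
        total += s.end - s.start
    return total

SpecialSpanStruct in the diff below plays the same role: plain C fields plus a `_Cached*` pointer, so a whole queue of pending special cases can be handed across the nogil boundary at once.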
spacy/tokenizer.pxd

@@ -3,6 +3,8 @@ from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
+from libcpp.queue cimport queue
+
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
@@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 
 
+cdef struct SpecialSpanStruct:
+    int start
+    int end
+    int span_length_diff
+    _Cached* cached
+
+
 cdef class Tokenizer:
     cdef Pool mem
     cdef PreshMap _cache
@@ -30,6 +39,8 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc)
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
+                                       queue[SpecialSpanStruct] span_queue) nogil
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _try_specials(self, hash_t key, Doc tokens,
                            int* has_special) except -1

spacy/tokenizer.pyx

@@ -240,9 +240,6 @@ cdef class Tokenizer:
         cdef int offset = 0
         cdef int span_length_diff = 0
         cdef bint modify_in_place = True
-        cdef int idx_offset = 0
-        cdef int orig_final_spacy
-        cdef int orig_idx
         cdef Pool mem = Pool()
         cdef vector[MatchStruct] c_matches
         self._special_matcher.find_matches(doc, &c_matches)
@@ -254,10 +251,12 @@
         spans = [doc[match.start:match.end] for match in c_filtered]
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
         span_data = {}
+        cdef SpecialSpanStruct sd
+        cdef queue[SpecialSpanStruct] span_queue
         for span in spans:
             rule = self._rules.get(span.text, None)
             span_length_diff = 0
             # Check for rule to differentiate cases like "' '" vs. "''"
             if rule:
                 span_length_diff = len(rule) - (span.end - span.start)
                 if span_length_diff > 0:
@@ -265,41 +264,21 @@
                     curr_length += span_length_diff
                     if curr_length > max_length:
                         max_length = curr_length
-            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
-        # Modify tokenization according to filtered special cases
+            cached = <_Cached*>self._specials.get(hash_string(span.text))
+            if cached != NULL:
+                sd.start = span.start
+                sd.end = span.end
+                sd.span_length_diff = span_length_diff
+                sd.cached = cached
+                span_queue.push(sd)
         # If modifications never increase doc length, can modify in place
         if modify_in_place:
             tokens = doc.c
         # Otherwise create a separate array to store modified tokens
         else:
             tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
-        i = 0
-        while i < doc.length:
-            if not i in span_data:
-                tokens[i + offset] = doc.c[i]
-                i += 1
-            else:
-                span = span_data[i]
-                cached = <_Cached*>self._specials.get(hash_string(span[0]))
-                if cached == NULL:
-                    # Copy original tokens if no rule found
-                    for j in range(span[2] - span[1]):
-                        tokens[i + offset + j] = doc.c[i + j]
-                    i += span[2] - span[1]
-                else:
-                    # Copy special case tokens into doc and adjust token and
-                    # character offsets
-                    idx_offset = 0
-                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-                    orig_idx = doc.c[i].idx
-                    for j in range(cached.length):
-                        tokens[i + offset + j] = cached.data.tokens[j]
-                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                            1 if cached.data.tokens[j].spacy else 0
-                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
-                    i += span[2] - span[1]
-                    offset += span[3]
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_cases(doc, tokens, span_queue)
         # Allocate more memory for doc if needed
         while doc.length < doc.length + offset:
             doc._realloc(doc.length * 2)
@@ -312,6 +291,35 @@ cdef class Tokenizer:
         doc.length = doc.length + offset
         return True
 
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        while i < doc.length:
+            sd = span_queue.front()
+            if span_queue.empty() or i < sd.start:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            elif i == sd.start:
+                span_queue.pop()
+                # Copy special case tokens into doc and adjust token and
+                # character offsets
+                idx_offset = 0
+                orig_final_spacy = doc.c[sd.end + offset - 1].spacy
+                orig_idx = doc.c[i].idx
+                for j in range(sd.cached.length):
+                    tokens[i + offset + j] = sd.cached.data.tokens[j]
+                    tokens[i + offset + j].idx = orig_idx + idx_offset
+                    idx_offset += sd.cached.data.tokens[j].lex.length + \
+                        1 if sd.cached.data.tokens[j].spacy else 0
+                tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
+                i += sd.end - sd.start
+                offset += sd.span_length_diff
+        return offset
+
     cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
 
         cdef int seen_i
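For reference, the bookkeeping the new nogil method performs — copy tokens through unchanged, expand each queued span, and accumulate the net length change in `offset` — can be modelled on a plain int array. A toy sketch with hypothetical names (`Repl`, `apply_repls`), not spaCy code:

# distutils: language = c++
# cython: language_level=3
# Toy model of the retokenization bookkeeping: rewrite src into dst,
# expanding each queued span to new_length slots, and return the net
# change in length, as _retokenize_special_cases does for real tokens.
from libcpp.queue cimport queue

cdef struct Repl:
    int start       # first token index of the span
    int end         # one past the last token index
    int new_length  # number of tokens the special case expands to

cdef int apply_repls(int* src, int src_len, int* dst, queue[Repl] repls) nogil:
    cdef int i = 0
    cdef int j = 0
    cdef int offset = 0
    cdef Repl r
    while i < src_len:
        if repls.empty() or i < repls.front().start:
            dst[i + offset] = src[i]      # token outside any span: copy as-is
            i += 1
        else:
            r = repls.front()
            repls.pop()
            for j in range(r.new_length):
                dst[i + offset + j] = -1  # stand-ins for the cached tokens
            i = r.end                     # skip past the original span
            offset += r.new_length - (r.end - r.start)
    return offset  # net change in token count

One deliberate difference from the committed loop: the sketch checks `empty()` before calling `front()`, since `front()` on an empty `std::queue` is undefined behaviour.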