mirror of https://github.com/explosion/spaCy.git
Move more of special case retokenize to cdef nogil
Move as much of the special case retokenization to nogil as possible.
parent 72c2f98dc9
commit 0b7e52c797
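The diff below (judging from the imports and the cdef class Tokenizer, the declarations in spaCy's tokenizer.pxd and the implementation in tokenizer.pyx) packs each matched special-case span into a plain C struct, pushes it onto a C++ std::queue while the GIL is still held, and then hands the actual token rewriting to a cdef helper declared nogil. As a rough sketch of that pattern only (the names SpanStruct, drain_spans and collect_and_sum are invented for illustration and are not spaCy API), a small module compiled as C++ might look like this:

# distutils: language = c++
# cython: language_level=3
# Hypothetical sketch of the GIL/nogil split used in this commit; not spaCy code.
from libcpp.queue cimport queue


cdef struct SpanStruct:
    int start
    int end


cdef int drain_spans(queue[SpanStruct]& spans) nogil:
    # Runs without the GIL: it only touches plain C structs and the C++ queue.
    cdef int total = 0
    cdef SpanStruct s
    while not spans.empty():
        s = spans.front()
        spans.pop()
        total += s.end - s.start
    return total


def collect_and_sum(py_spans):
    # GIL-holding side: convert Python (start, end) pairs into C structs first.
    cdef queue[SpanStruct] spans
    cdef SpanStruct s
    for start, end in py_spans:
        s.start = start
        s.end = end
        spans.push(s)
    return drain_spans(spans)

The point of the split is that everything the nogil helper touches must be C-level: a struct of ints and pointers and a C++ container qualify, while the dict of Python tuples that the old loop used for span_data does not.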
@@ -3,6 +3,8 @@ from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
+
+from libcpp.queue cimport queue
 
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
@@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 
 
+cdef struct SpecialSpanStruct:
+    int start
+    int end
+    int span_length_diff
+    _Cached* cached
+
+
 cdef class Tokenizer:
     cdef Pool mem
     cdef PreshMap _cache
@@ -30,6 +39,8 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc)
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
+                                       queue[SpecialSpanStruct] span_queue) nogil
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _try_specials(self, hash_t key, Doc tokens,
                            int* has_special) except -1
@@ -240,9 +240,6 @@ cdef class Tokenizer:
         cdef int offset = 0
         cdef int span_length_diff = 0
         cdef bint modify_in_place = True
-        cdef int idx_offset = 0
-        cdef int orig_final_spacy
-        cdef int orig_idx
         cdef Pool mem = Pool()
         cdef vector[MatchStruct] c_matches
         self._special_matcher.find_matches(doc, &c_matches)
@@ -254,52 +251,34 @@
         spans = [doc[match.start:match.end] for match in c_filtered]
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
-        span_data = {}
+        cdef SpecialSpanStruct sd
+        cdef queue[SpecialSpanStruct] span_queue
         for span in spans:
             rule = self._rules.get(span.text, None)
             span_length_diff = 0
+            # Check for rule to differentiate cases like "' '" vs. "''"
             if rule:
                 span_length_diff = len(rule) - (span.end - span.start)
             if span_length_diff > 0:
                 modify_in_place = False
             curr_length += span_length_diff
             if curr_length > max_length:
                 max_length = curr_length
-            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
-        # Modify tokenization according to filtered special cases
+            cached = <_Cached*>self._specials.get(hash_string(span.text))
+            if cached != NULL:
+                sd.start = span.start
+                sd.end = span.end
+                sd.span_length_diff = span_length_diff
+                sd.cached = cached
+                span_queue.push(sd)
         # If modifications never increase doc length, can modify in place
         if modify_in_place:
             tokens = doc.c
         # Otherwise create a separate array to store modified tokens
         else:
             tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
-        i = 0
-        while i < doc.length:
-            if not i in span_data:
-                tokens[i + offset] = doc.c[i]
-                i += 1
-            else:
-                span = span_data[i]
-                cached = <_Cached*>self._specials.get(hash_string(span[0]))
-                if cached == NULL:
-                    # Copy original tokens if no rule found
-                    for j in range(span[2] - span[1]):
-                        tokens[i + offset + j] = doc.c[i + j]
-                    i += span[2] - span[1]
-                else:
-                    # Copy special case tokens into doc and adjust token and
-                    # character offsets
-                    idx_offset = 0
-                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-                    orig_idx = doc.c[i].idx
-                    for j in range(cached.length):
-                        tokens[i + offset + j] = cached.data.tokens[j]
-                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                            1 if cached.data.tokens[j].spacy else 0
-                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
-                    i += span[2] - span[1]
-                    offset += span[3]
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_cases(doc, tokens, span_queue)
         # Allocate more memory for doc if needed
         while doc.length < doc.length + offset:
             doc._realloc(doc.length * 2)
@@ -312,6 +291,35 @@
         doc.length = doc.length + offset
         return True
 
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        while i < doc.length:
+            sd = span_queue.front()
+            if span_queue.empty() or i < sd.start:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            elif i == sd.start:
+                span_queue.pop()
+                # Copy special case tokens into doc and adjust token and
+                # character offsets
+                idx_offset = 0
+                orig_final_spacy = doc.c[sd.end + offset - 1].spacy
+                orig_idx = doc.c[i].idx
+                for j in range(sd.cached.length):
+                    tokens[i + offset + j] = sd.cached.data.tokens[j]
+                    tokens[i + offset + j].idx = orig_idx + idx_offset
+                    idx_offset += sd.cached.data.tokens[j].lex.length + \
+                        1 if sd.cached.data.tokens[j].spacy else 0
+                tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
+                i += sd.end - sd.start
+                offset += sd.span_length_diff
+        return offset
+
     cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
 
         cdef int seen_i