diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index ec6640196..dca30e3d7 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -3,6 +3,8 @@ from libcpp.vector cimport vector
 from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
+from libcpp.queue cimport queue
+
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
@@ -11,6 +13,13 @@ from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher, MatchStruct
 
 
+cdef struct SpecialSpanStruct:
+    int start
+    int end
+    int span_length_diff
+    _Cached* cached
+
+
 cdef class Tokenizer:
     cdef Pool mem
     cdef PreshMap _cache
@@ -30,6 +39,8 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc)
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens,
+                                       queue[SpecialSpanStruct] span_queue) nogil
    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ec3394861..e0f814550 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -240,9 +240,6 @@ cdef class Tokenizer:
         cdef int offset = 0
         cdef int span_length_diff = 0
         cdef bint modify_in_place = True
-        cdef int idx_offset = 0
-        cdef int orig_final_spacy
-        cdef int orig_idx
         cdef Pool mem = Pool()
         cdef vector[MatchStruct] c_matches
         self._special_matcher.find_matches(doc, &c_matches)
@@ -254,52 +251,34 @@ cdef class Tokenizer:
         spans = [doc[match.start:match.end] for match in c_filtered]
         # Put span info in span.start-indexed dict and calculate maximum
         # intermediate document size
-        span_data = {}
+        cdef SpecialSpanStruct sd
+        cdef queue[SpecialSpanStruct] span_queue
         for span in spans:
             rule = self._rules.get(span.text, None)
             span_length_diff = 0
+            # Check for rule to differentiate cases like "' '" vs. "''"
             if rule:
                 span_length_diff = len(rule) - (span.end - span.start)
-            if span_length_diff > 0:
-                modify_in_place = False
-            curr_length += span_length_diff
-            if curr_length > max_length:
-                max_length = curr_length
-            span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
-        # Modify tokenization according to filtered special cases
+                if span_length_diff > 0:
+                    modify_in_place = False
+                curr_length += span_length_diff
+                if curr_length > max_length:
+                    max_length = curr_length
+            cached = <_Cached*>self._specials.get(hash_string(span.text))
+            if cached != NULL:
+                sd.start = span.start
+                sd.end = span.end
+                sd.span_length_diff = span_length_diff
+                sd.cached = cached
+                span_queue.push(sd)
         # If modifications never increase doc length, can modify in place
         if modify_in_place:
             tokens = doc.c
         # Otherwise create a separate array to store modified tokens
         else:
             tokens = mem.alloc(max_length, sizeof(TokenC))
-        i = 0
-        while i < doc.length:
-            if not i in span_data:
-                tokens[i + offset] = doc.c[i]
-                i += 1
-            else:
-                span = span_data[i]
-                cached = <_Cached*>self._specials.get(hash_string(span[0]))
-                if cached == NULL:
-                    # Copy original tokens if no rule found
-                    for j in range(span[2] - span[1]):
-                        tokens[i + offset + j] = doc.c[i + j]
-                    i += span[2] - span[1]
-                else:
-                    # Copy special case tokens into doc and adjust token and
-                    # character offsets
-                    idx_offset = 0
-                    orig_final_spacy = doc.c[span[2] + offset - 1].spacy
-                    orig_idx = doc.c[i].idx
-                    for j in range(cached.length):
-                        tokens[i + offset + j] = cached.data.tokens[j]
-                        tokens[i + offset + j].idx = orig_idx + idx_offset
-                        idx_offset += cached.data.tokens[j].lex.length + \
-                                1 if cached.data.tokens[j].spacy else 0
-                    tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
-                    i += span[2] - span[1]
-                    offset += span[3]
+        # Modify tokenization according to filtered special cases
+        offset = self._retokenize_special_cases(doc, tokens, span_queue)
         # Allocate more memory for doc if needed
         while doc.length < doc.length + offset:
             doc._realloc(doc.length * 2)
@@ -312,6 +291,35 @@ cdef class Tokenizer:
         doc.length = doc.length + offset
         return True
 
+    cdef int _retokenize_special_cases(self, Doc doc, TokenC* tokens, queue[SpecialSpanStruct] span_queue) nogil:
+        cdef int i = 0
+        cdef int j = 0
+        cdef int offset = 0
+        cdef int idx_offset = 0
+        cdef int orig_final_spacy
+        cdef int orig_idx
+        while i < doc.length:
+            sd = span_queue.front()
+            if span_queue.empty() or i < sd.start:
+                tokens[i + offset] = doc.c[i]
+                i += 1
+            elif i == sd.start:
+                span_queue.pop()
+                # Copy special case tokens into doc and adjust token and
+                # character offsets
+                idx_offset = 0
+                orig_final_spacy = doc.c[sd.end + offset - 1].spacy
+                orig_idx = doc.c[i].idx
+                for j in range(sd.cached.length):
+                    tokens[i + offset + j] = sd.cached.data.tokens[j]
+                    tokens[i + offset + j].idx = orig_idx + idx_offset
+                    idx_offset += sd.cached.data.tokens[j].lex.length + \
+                            1 if sd.cached.data.tokens[j].spacy else 0
+                tokens[i + offset + sd.cached.length - 1].spacy = orig_final_spacy
+                i += sd.end - sd.start
+                offset += sd.span_length_diff
+        return offset
+
     cdef void _filter_spans(self, vector[MatchStruct] &original, vector[MatchStruct] &filtered, int doc_len) nogil:
         cdef int seen_i