diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 1d3c5b9c3..6f4656962 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -29,7 +29,7 @@ cdef class Tokenizer: cpdef Doc tokens_from_list(self, list strings) - cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1 + cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1 cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except NULL diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index aa348abd0..cd9dd722f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -39,16 +39,17 @@ cdef class Tokenizer: return cls(vocab, rules, prefix_re, suffix_re, infix_re) cpdef Doc tokens_from_list(self, list strings): - cdef int length = sum([len(s) for s in strings]) - cdef Doc tokens = Doc(self.vocab, ' '.join(strings)) - if length == 0: + cdef Doc tokens = Doc(self.vocab) + if sum([len(s) for s in strings]) == 0: return tokens cdef UniStr string_struct cdef unicode py_string cdef int idx = 0 for i, py_string in enumerate(strings): slice_unicode(&string_struct, py_string, 0, len(py_string)) - tokens.push_back(idx, self.vocab.get(tokens.mem, &string_struct)) + # Note that we pass tokens.mem here --- the Doc object has ownership + tokens.push_back( + self.vocab.get(tokens.mem, &string_struct), True) idx += len(py_string) + 1 return tokens @@ -73,7 +74,7 @@ cdef class Tokenizer: tokens (Doc): A Doc object, giving access to a sequence of LexemeCs. """ cdef int length = len(string) - cdef Doc tokens = Doc(self.vocab, string) + cdef Doc tokens = Doc(self.vocab) if length == 0: return tokens cdef int i = 0 @@ -86,32 +87,39 @@ cdef class Tokenizer: if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) - cache_hit = self._try_cache(start, span.key, tokens) + cache_hit = self._try_cache(span.key, tokens) if not cache_hit: self._tokenize(tokens, &span, start, i) in_ws = not in_ws start = i if chars[i] == ' ': + tokens.data[tokens.length - 1].spacy = True start += 1 i += 1 if start < i: slice_unicode(&span, chars, start, i) - cache_hit = self._try_cache(start, span.key, tokens) + cache_hit = self._try_cache(span.key, tokens) if not cache_hit: self._tokenize(tokens, &span, start, i) + + tokens.data[tokens.length - 1].spacy = string[-1] == ' ' return tokens - cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1: + cdef int _try_cache(self, hash_t key, Doc tokens) except -1: cached = <_Cached*>self._cache.get(key) if cached == NULL: return False cdef int i + cdef int less_one = cached.length-1 if cached.is_lex: - for i in range(cached.length): - idx = tokens.push_back(idx, cached.data.lexemes[i]) + for i in range(less_one): + # There's a space at the end of the chunk. + tokens.push_back(cached.data.lexemes[i], False) + tokens.push_back(cached.data.lexemes[less_one], False) else: - for i in range(cached.length): - idx = tokens.push_back(idx, &cached.data.tokens[i]) + for i in range(less_one): + tokens.push_back(&cached.data.tokens[i], False) + tokens.push_back(&cached.data.tokens[less_one], False) return True cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1: @@ -171,36 +179,39 @@ cdef class Tokenizer: vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes) except -1: cdef bint cache_hit + cdef bint is_spacy cdef int split cdef const LexemeC* const* lexemes - cdef LexemeC* lexeme + cdef const LexemeC* lexeme cdef UniStr span cdef int i + # Have to calculate is_spacy here, i.e. does the token have a trailing + # space. There are no spaces *between* the tokens we attach + # here, and there *is* a space after the last token. if prefixes.size(): for i in range(prefixes.size()): - idx = tokens.push_back(idx, prefixes[0][i]) + tokens.push_back(prefixes[0][i], False) if string.n != 0: - cache_hit = self._try_cache(idx, string.key, tokens) + cache_hit = self._try_cache(string.key, tokens) if cache_hit: - # Get last idx - idx = tokens.data[tokens.length - 1].idx - # Increment by last length - idx += tokens.data[tokens.length - 1].lex.length + pass else: split = self._find_infix(string.chars, string.n) if split == 0 or split == -1: - idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string)) + tokens.push_back(self.vocab.get(tokens.mem, string), False) else: + # Append the beginning, afix, end of the infix token slice_unicode(&span, string.chars, 0, split) - idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) + tokens.push_back(self.vocab.get(tokens.mem, &span), False) slice_unicode(&span, string.chars, split, split+1) - idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) + tokens.push_back(self.vocab.get(tokens.mem, &span), False) slice_unicode(&span, string.chars, split + 1, string.n) - idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span)) + tokens.push_back(self.vocab.get(tokens.mem, &span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): - idx = tokens.push_back(idx, deref(it)) + lexeme = deref(it) preinc(it) + tokens.push_back(lexeme, False) cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: cdef int i diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 63f5bd815..a19c387ba 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -26,7 +26,7 @@ cdef class Doc: cdef int length cdef int max_length - cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1 + cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 006a58307..1daef2c05 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -173,7 +173,7 @@ cdef class Doc: start = i yield Span(self, start, self.length) - cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: + cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] @@ -181,9 +181,13 @@ cdef class Doc: t[0] = lex_or_tok[0] else: t.lex = lex_or_tok - t.idx = idx + if self.length == 0: + t.idx = 0 + else: + t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + t.spacy = has_space self.length += 1 - return idx + t.lex.length + return t.idx + t.lex.length + t.spacy @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): @@ -375,11 +379,11 @@ cdef class Doc: string += vocab.strings[lex.orth] if space: string += u' ' - cdef Doc doc = Doc(vocab, string) + cdef Doc doc = Doc(vocab) + cdef bint has_space = False cdef int idx = 0 for i, id_ in enumerate(ids): - doc.push_back(idx, vocab.lexemes[id_]) - idx += vocab.lexemes[id_].length - if spaces[i]: - idx += 1 + lex = vocab.lexemes[id_] + has_space = spaces[i] + doc.push_back(lex, has_space) return doc