diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 17714940d..24536face 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -404,9 +404,7 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: - if self.token_match and self.token_match(string) \ - and not self.find_prefix(string) \ - and not self.find_suffix(string): + if self.token_match and self.token_match(string): break if with_special_cases and self._specials.get(hash_string(string)) != NULL: break @@ -679,6 +677,8 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] + if len(substring) == 0: + continue if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = ''