From cf693f0eae487badbfea4f66ec8610c075d67a73 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 25 Nov 2020 11:43:05 +0100
Subject: [PATCH] Fix token_match in tokenizer

---
 spacy/tokenizer.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 17714940d..24536face 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -404,9 +404,7 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string) \
-                    and not self.find_prefix(string) \
-                    and not self.find_suffix(string):
+            if self.token_match and self.token_match(string):
                 break
             if with_special_cases and self._specials.get(hash_string(string)) != NULL:
                 break
@@ -679,6 +677,8 @@ cdef class Tokenizer:
                         break
                     suffixes.append(("SUFFIX", substring[split:]))
                     substring = substring[:split]
+                if len(substring) == 0:
+                    continue
                 if token_match(substring):
                     tokens.append(("TOKEN_MATCH", substring))
                     substring = ''
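
Note (not part of the patch): a minimal sketch of how the changed token_match
precedence can be exercised. The regex, the sample string, and the expected
outputs are assumptions inferred from the hunks above, not taken from spaCy's
test suite:

    import re

    import spacy

    nlp = spacy.blank("en")
    # Hypothetical token_match pattern: "(" and ")" are also default English
    # prefix/suffix characters, so the affix rules and token_match compete.
    nlp.tokenizer.token_match = re.compile(r"^\(hello\)$").match

    # First hunk: token_match previously applied only when no prefix or suffix
    # matched, so "(hello)" was split into "(", "hello", ")"; with the fix,
    # token_match is checked first and the string is kept as one token.
    print([t.text for t in nlp("(hello)")])  # expected: ['(hello)']

    # Second hunk keeps explain() in line with the tokenizer: once a substring
    # is reduced to "", the loop restarts instead of running token_match on
    # the empty string (which could emit a spurious ("TOKEN_MATCH", "")).
    print(nlp.tokenizer.explain("(hello)"))  # expected: [('TOKEN_MATCH', '(hello)')]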