From c54aabc3cdc1381db256c9e29bb97d43554b76c4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 28 Aug 2019 14:17:44 +0200
Subject: [PATCH] fix loading custom tokenizer rules/exceptions from file

---
 spacy/tokenizer.pyx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index f19f851c7..19029ec05 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -441,8 +441,13 @@ cdef class Tokenizer:
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
         if data.get("token_match"):
             self.token_match = re.compile(data["token_match"]).match
-        for string, substrings in data.get("rules", {}).items():
-            self.add_special_case(string, substrings)
+        if data.get("rules"):
+            # make sure to hard reset the cache to remove data from the default exceptions
+            self._rules = {}
+            self._cache = PreshMap()
+            for string, substrings in data.get("rules", {}).items():
+                self.add_special_case(string, substrings)
+
         return self