From 59eba273bb300f6027e4bd2c47ef82687bf407ae Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 7 Mar 2022 12:13:36 +0100
Subject: [PATCH] Add tokenizer option to allow Matcher handling for all rules

Add tokenizer option `with_faster_rules_heuristics` that determines whether
the special cases applied by the internal `Matcher` are filtered by whether
they contain affixes or space. If `True` (default), the rules are filtered
to prioritize speed over rare edge cases. If `False`, all rules are included
in the final `Matcher`-based pass over the doc.
---
 .../serialize/test_serialize_tokenizer.py |  2 ++
 spacy/tests/tokenizer/test_tokenizer.py   | 17 ++++++++++++
 spacy/tokenizer.pxd                       |  5 ++--
 spacy/tokenizer.pyx                       | 26 +++++++++++++++----
 website/docs/api/tokenizer.md             | 19 +++++++-------
 5 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index e271f7707..4359c5ad5 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -70,6 +70,7 @@ def test_issue4190():
             suffix_search=suffix_re.search,
             infix_finditer=infix_re.finditer,
             token_match=nlp.tokenizer.token_match,
+            with_faster_rules_heuristics=False,
         )
         nlp.tokenizer = new_tokenizer
 
@@ -90,6 +91,7 @@ def test_issue4190():
     doc_2 = nlp_2(test_string)
     result_2 = [token.text for token in doc_2]
     assert result_1b == result_2
+    assert nlp_2.tokenizer.with_faster_rules_heuristics is False
 
 
 def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index a7270cb1e..4cf913b64 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -521,3 +521,20 @@ def test_tokenizer_infix_prefix(en_vocab):
     assert tokens == ["±10", "%"]
     explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
     assert tokens == explain_tokens
+
+
+@pytest.mark.issue(10086)
+def test_issue10086(en_tokenizer):
+    """Test special case works when part of infix substring."""
+    text = "No--don't see"
+
+    # without heuristics: do n't
+    en_tokenizer.with_faster_rules_heuristics = False
+    doc = en_tokenizer(text)
+    assert "n't" in [w.text for w in doc]
+    assert "do" in [w.text for w in doc]
+
+    # with (default) heuristics: don't
+    en_tokenizer.with_faster_rules_heuristics = True
+    doc = en_tokenizer(text)
+    assert "don't" in [w.text for w in doc]
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index fa38a1015..0361cb2de 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,9 +23,10 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO next two are unused and should be removed in v4
+    # TODO convert to bool in v4
+    cdef int _with_faster_rules_heuristics
+    # TODO next one is unused and should be removed in v4
     # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int1
     cdef int _unused_int2
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 91f228032..40b35c6f6 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -34,7 +34,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None):
+                 url_match=None, with_faster_rules_heuristics=True):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -43,7 +43,7 @@ cdef class Tokenizer:
             `re.compile(string).search` to match prefixes.
         suffix_search (callable): A function matching the signature of
             `re.compile(string).search` to match suffixes.
-        `infix_finditer` (callable): A function matching the signature of
+        infix_finditer (callable): A function matching the signature of
             `re.compile(string).finditer` to find infixes.
         token_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
@@ -51,6 +51,9 @@
         url_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
             recognized as urls.
+        with_faster_rules_heuristics (bool): Whether to restrict the final
+            Matcher-based pass for rules to those containing affixes or space.
+            Defaults to True.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -66,6 +69,7 @@ cdef class Tokenizer:
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
         self.vocab = vocab
+        self.with_faster_rules_heuristics = with_faster_rules_heuristics
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
@@ -122,6 +126,14 @@ cdef class Tokenizer:
         self._specials = PreshMap()
         self._load_special_cases(rules)
 
+    property with_faster_rules_heuristics:
+        def __get__(self):
+            return bool(self._with_faster_rules_heuristics)
+
+        def __set__(self, with_faster_rules_heuristics):
+            self._with_faster_rules_heuristics = bool(with_faster_rules_heuristics)
+            self._reload_special_cases()
+
     def __reduce__(self):
         args = (self.vocab,
                 self.rules,
@@ -602,7 +614,7 @@ cdef class Tokenizer:
             self.mem.free(stale_special)
         self._rules[string] = substrings
         self._flush_cache()
-        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
+        if not self.with_faster_rules_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
@@ -773,7 +785,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
             "token_match": lambda: _get_regex_pattern(self.token_match),
             "url_match": lambda: _get_regex_pattern(self.url_match),
-            "exceptions": lambda: dict(sorted(self._rules.items()))
+            "exceptions": lambda: dict(sorted(self._rules.items())),
+            "with_faster_rules_heuristics": lambda: self.with_faster_rules_heuristics,
         }
         return util.to_bytes(serializers, exclude)
 
@@ -794,7 +807,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
             "token_match": lambda b: data.setdefault("token_match", b),
             "url_match": lambda b: data.setdefault("url_match", b),
-            "exceptions": lambda b: data.setdefault("rules", b)
+            "exceptions": lambda b: data.setdefault("rules", b),
+            "with_faster_rules_heuristics": lambda b: data.setdefault("with_faster_rules_heuristics", b),
         }
         # reset all properties and flush all caches (through rules),
         # reset rules first so that _reload_special_cases is trivial/fast as
@@ -818,6 +832,8 @@ cdef class Tokenizer:
             self.url_match = re.compile(data["url_match"]).match
         if "rules" in data and isinstance(data["rules"], dict):
            self.rules = data["rules"]
+        if "with_faster_rules_heuristics" in data:
+            self.with_faster_rules_heuristics = data["with_faster_rules_heuristics"]
         return self
 
 
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 8809c10bc..66046fce2 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -44,15 +44,16 @@ how to construct a custom tokenizer with different tokenization rules, see the
 > tokenizer = nlp.tokenizer
 > ```
 
-| Name             | Description |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | A storage container for lexical types. ~~Vocab~~ |
-| `rules`          | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
-| `prefix_search`  | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `suffix_search`  | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
-| `token_match`    | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `url_match`      | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| Name                           | Description |
+| ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                        | A storage container for lexical types. ~~Vocab~~ |
+| `rules`                        | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
+| `prefix_search`                | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `suffix_search`                | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `infix_finditer`               | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
+| `token_match`                  | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `url_match`                    | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `with_faster_rules_heuristics` | Whether to restrict the final `Matcher`-based pass for rules to those containing affixes or space. Defaults to `True`. ~~bool~~ |
 
 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
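
Usage sketch: with this change applied, toggling the option from Python might look like the following. It assumes a blank English pipeline, whose default tokenizer exceptions include a special case splitting "don't" into "do" + "n't"; the expected behavior mirrors `test_issue10086` above.

```python
import spacy

# A blank English pipeline ships the default English special-case rules.
nlp = spacy.blank("en")
text = "No--don't see"

# Default (with_faster_rules_heuristics=True): special cases that contain no
# affixes or spaces are left out of the final Matcher-based pass, so "don't"
# stays whole when it only surfaces after the "--" infix split.
print([t.text for t in nlp.tokenizer(text)])

# Disabling the heuristics includes every special case in the final pass, so
# "don't" is split into "do" and "n't" even inside "No--don't", trading some
# speed for coverage of this rare edge case. Assigning the property also
# reloads the special cases (see the property __set__ in the diff).
nlp.tokenizer.with_faster_rules_heuristics = False
print([t.text for t in nlp.tokenizer(text)])
```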