From 3711af74e5ca59c6406f3041d54cd29a06b4ca26 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 24 Mar 2022 13:21:32 +0100
Subject: [PATCH] Add tokenizer option to allow Matcher handling for all rules
 (#10452)

* Add tokenizer option to allow Matcher handling for all rules

Add tokenizer option `with_faster_rules_heuristics` that determines
whether the special cases applied by the internal `Matcher` are filtered
by whether they contain affixes or space. If `True` (default), the rules
are filtered to prioritize speed over rare edge cases. If `False`, all
rules are included in the final `Matcher`-based pass over the doc.

* Reset all caches when reloading special cases

* Revert "Reset all caches when reloading special cases"

This reverts commit 4ef6bd171d00da01cbabc3bcde00088ba4bd5578.

* Initialize max_length properly

* Add new tag to API docs

* Rename to faster heuristics
---
 .../serialize/test_serialize_tokenizer.py |  2 ++
 spacy/tests/tokenizer/test_tokenizer.py   | 19 ++++++++++++-
 spacy/tokenizer.pxd                       |  5 ++--
 spacy/tokenizer.pyx                       | 28 +++++++++++++++----
 website/docs/api/tokenizer.md             | 19 +++++++------
 5 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index e271f7707..9b74d7721 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -70,6 +70,7 @@ def test_issue4190():
             suffix_search=suffix_re.search,
             infix_finditer=infix_re.finditer,
             token_match=nlp.tokenizer.token_match,
+            faster_heuristics=False,
         )
         nlp.tokenizer = new_tokenizer

@@ -90,6 +91,7 @@ def test_issue4190():
     doc_2 = nlp_2(test_string)
     result_2 = [token.text for token in doc_2]
     assert result_1b == result_2
+    assert nlp_2.tokenizer.faster_heuristics is False


 def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index b27af6bcd..c661e91f7 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -523,6 +523,23 @@ def test_tokenizer_infix_prefix(en_vocab):
     assert tokens == explain_tokens


+@pytest.mark.issue(10086)
+def test_issue10086(en_tokenizer):
+    """Test special case works when part of infix substring."""
+    text = "No--don't see"
+
+    # without heuristics: do n't
+    en_tokenizer.faster_heuristics = False
+    doc = en_tokenizer(text)
+    assert "n't" in [w.text for w in doc]
+    assert "do" in [w.text for w in doc]
+
+    # with (default) heuristics: don't
+    en_tokenizer.faster_heuristics = True
+    doc = en_tokenizer(text)
+    assert "don't" in [w.text for w in doc]
+
+
 def test_tokenizer_initial_special_case_explain(en_vocab):
     tokenizer = Tokenizer(
         en_vocab,
@@ -533,4 +550,4 @@ def test_tokenizer_initial_special_case_explain(en_vocab):
     )
     tokens = [t.text for t in tokenizer("id")]
     explain_tokens = [t[1] for t in tokenizer.explain("id")]
-    assert tokens == explain_tokens
+    assert tokens == explain_tokens
\ No newline at end of file
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index fa38a1015..e6a072053 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,9 +23,10 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    # TODO next two are unused and should be removed in v4
+    # TODO convert to bool in v4
+    cdef int _faster_heuristics
+    # TODO next one is unused and should be removed in v4
     # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int1
     cdef int _unused_int2

     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ac55a61f3..0e75b5f7a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -34,7 +34,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None):
+                 url_match=None, faster_heuristics=True):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.

         vocab (Vocab): A storage container for lexical types.
@@ -43,7 +43,7 @@ cdef class Tokenizer:
             `re.compile(string).search` to match prefixes.
         suffix_search (callable): A function matching the signature of
             `re.compile(string).search` to match suffixes.
-        `infix_finditer` (callable): A function matching the signature of
+        infix_finditer (callable): A function matching the signature of
            `re.compile(string).finditer` to find infixes.
         token_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
@@ -51,6 +51,9 @@ cdef class Tokenizer:
         url_match (callable): A function matching the signature of
             `re.compile(string).match`, for matching strings to be
             recognized as urls.
+        faster_heuristics (bool): Whether to restrict the final
+            Matcher-based pass for rules to those containing affixes or space.
+            Defaults to True.

         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -66,6 +69,7 @@ cdef class Tokenizer:
         self.suffix_search = suffix_search
         self.infix_finditer = infix_finditer
         self.vocab = vocab
+        self.faster_heuristics = faster_heuristics
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
@@ -122,6 +126,14 @@ cdef class Tokenizer:
             self._specials = PreshMap()
             self._load_special_cases(rules)

+    property faster_heuristics:
+        def __get__(self):
+            return bool(self._faster_heuristics)
+
+        def __set__(self, faster_heuristics):
+            self._faster_heuristics = bool(faster_heuristics)
+            self._reload_special_cases()
+
     def __reduce__(self):
         args = (self.vocab,
                 self.rules,
@@ -287,7 +299,7 @@ cdef class Tokenizer:
         spans = [doc[match.start:match.end] for match in filtered]
         cdef bint modify_in_place = True
         cdef int curr_length = doc.length
-        cdef int max_length
+        cdef int max_length = 0
         cdef int span_length_diff = 0
         span_data = {}
         for span in spans:
@@ -602,7 +614,7 @@ cdef class Tokenizer:
             self.mem.free(stale_special)
         self._rules[string] = substrings
         self._flush_cache()
-        if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
+        if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))

     def _reload_special_cases(self):
@@ -777,7 +789,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
             "token_match": lambda: _get_regex_pattern(self.token_match),
             "url_match": lambda: _get_regex_pattern(self.url_match),
-            "exceptions": lambda: dict(sorted(self._rules.items()))
+            "exceptions": lambda: dict(sorted(self._rules.items())),
+            "faster_heuristics": lambda: self.faster_heuristics,
         }
         return util.to_bytes(serializers, exclude)

@@ -798,7 +811,8 @@ cdef class Tokenizer:
             "infix_finditer": lambda b: data.setdefault("infix_finditer", b),
             "token_match": lambda b: data.setdefault("token_match", b),
             "url_match": lambda b: data.setdefault("url_match", b),
-            "exceptions": lambda b: data.setdefault("rules", b)
+            "exceptions": lambda b: data.setdefault("rules", b),
+            "faster_heuristics": lambda b: data.setdefault("faster_heuristics", b),
         }
         # reset all properties and flush all caches (through rules),
         # reset rules first so that _reload_special_cases is trivial/fast as
@@ -822,6 +836,8 @@ cdef class Tokenizer:
             self.url_match = re.compile(data["url_match"]).match
         if "rules" in data and isinstance(data["rules"], dict):
             self.rules = data["rules"]
+        if "faster_heuristics" in data:
+            self.faster_heuristics = data["faster_heuristics"]
         return self


diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 8809c10bc..6eb7e8024 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -44,15 +44,16 @@ how to construct a custom tokenizer with different tokenization rules, see the
 > tokenizer = nlp.tokenizer
 > ```

-| Name             | Description |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `vocab`          | A storage container for lexical types. ~~Vocab~~ |
-| `rules`          | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
-| `prefix_search`  | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `suffix_search`  | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
-| `token_match`    | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
-| `url_match`      | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| Name                                                | Description |
+| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `vocab`                                             | A storage container for lexical types. ~~Vocab~~ |
+| `rules`                                             | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
+| `prefix_search`                                     | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `suffix_search`                                     | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `infix_finditer`                                    | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
+| `token_match`                                       | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `url_match`                                         | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
+| `faster_heuristics` <Tag variant="new">3.3.0</Tag>  | Whether to restrict the final `Matcher`-based pass for rules to those containing affixes or space. Defaults to `True`. ~~bool~~ |

 ## Tokenizer.\_\_call\_\_ {#call tag="method"}
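
Below is a minimal usage sketch of the new option, distilled from the tests added in this patch (`test_issue4190` and `test_issue10086`). It assumes a spaCy build that includes this change, i.e. one where `Tokenizer` accepts a `faster_heuristics` argument and exposes it as a settable property.

```python
# Minimal sketch, assuming spaCy with this patch applied (faster_heuristics
# available on the Tokenizer). Mirrors the behaviour checked in test_issue10086.
import spacy

nlp = spacy.blank("en")
text = "No--don't see"

# Default faster_heuristics=True: the final Matcher-based pass only covers
# special cases that contain affixes or a space, so the "don't" exception is
# not applied inside the infix-joined substring and "don't" stays one token.
print([t.text for t in nlp.tokenizer(text)])  # [..., "don't", ...]

# Disabling the heuristics includes every special case in the Matcher pass,
# so the exception now splits "don't" into "do" + "n't" (slower, but covers
# this rare edge case).
nlp.tokenizer.faster_heuristics = False
print([t.text for t in nlp.tokenizer(text)])  # [..., "do", "n't", ...]
```

The flag can also be passed to the constructor, as in `Tokenizer(nlp.vocab, ..., faster_heuristics=False)`, and it is included in the tokenizer's serializers, so `test_issue4190` above asserts that the value survives saving and reloading the pipeline.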