Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
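
The practical effect is that `Tokenizer.explain` now reports the same tokens as the tokenizer itself when a special case interacts with affix or infix rules. A minimal sketch mirroring the new test below; the expected outputs in the comments are illustrative:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

vocab = spacy.blank("en").vocab
# A suffix rule that strips a trailing period, an infix rule that splits on
# "/", and a special case "a." that keeps the period attached.
tokenizer = Tokenizer(
    vocab,
    rules={"a.": [{"ORTH": "a."}]},
    suffix_search=re.compile(r"[\.]$").search,
    infix_finditer=re.compile(r"[/]").finditer,
)
print([t.text for t in tokenizer("a/a.")])        # expected: ['a', '/', 'a.']
print([t[1] for t in tokenizer.explain("a/a.")])  # now matches: ['a', '/', 'a.']
```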
Adriane Boyd 2021-04-19 11:08:20 +02:00 committed by GitHub
parent 07b41c38ae
commit 0e7f94b247
3 changed files with 59 additions and 4 deletions


@@ -1,5 +1,7 @@
import pytest
import re
from spacy.util import get_lang_class
from spacy.tokenizer import Tokenizer

# Only include languages with no external dependencies
# "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
    assert tokens == debug_tokens


def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens


@@ -20,11 +20,12 @@ from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors, Warnings
from . import util
from .util import registry, get_words_and_spaces
from .attrs import intify_attrs
from .symbols import ORTH
from .scorer import Scorer
from .training import validate_examples
from .tokens import Span


cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
        DOCS: https://spacy.io/api/tokenizer#explain
        """
        prefix_search = self.prefix_search
        if prefix_search is None:
            prefix_search = re.compile("a^").search
        suffix_search = self.suffix_search
        if suffix_search is None:
            suffix_search = re.compile("a^").search
        infix_finditer = self.infix_finditer
        if infix_finditer is None:
            infix_finditer = re.compile("a^").finditer
        token_match = self.token_match
        if token_match is None:
            token_match = re.compile("a^").match
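
The fallbacks added here rely on a regular expression that can never match, so `explain` keeps working when `prefix_search`, `suffix_search`, or `infix_finditer` is `None`. A quick illustration of why `"a^"` never matches:

```python
import re

# "a^" requires a literal "a" followed by the start of the string,
# which is impossible, so this pattern matches nothing.
never_match = re.compile("a^")
assert never_match.search("a^b") is None
assert list(never_match.finditer("a^b")) == []
assert never_match.match("a^b") is None
```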
@@ -687,7 +694,7 @@
                    tokens.append(("URL_MATCH", substring))
                    substring = ''
                elif substring in special_cases:
                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                    substring = ''
                elif list(infix_finditer(substring)):
                    infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@
                    tokens.append(("TOKEN", substring))
                    substring = ''
            tokens.extend(reversed(suffixes))
        # Find matches for special cases handled by special matcher
        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
        t_words = []
        t_spaces = []
        for word, space in zip(words, spaces):
            if not word.isspace():
                t_words.append(word)
                t_spaces.append(space)
        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
        matches = self._special_matcher(doc)
        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
        spans = util.filter_spans(spans)
        # Replace matched tokens with their exceptions
        i = 0
        final_tokens = []
        spans_by_start = {s.start: s for s in spans}
        while i < len(tokens):
            if i in spans_by_start:
                span = spans_by_start[i]
                exc = [d[ORTH] for d in special_cases[span.label_]]
                for j, orth in enumerate(exc):
                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
                i += len(span)
            else:
                final_tokens.append(tokens[i])
                i += 1
        return final_tokens

    def score(self, examples, **kwargs):
        validate_examples(examples, "Tokenizer.score")
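
`util.filter_spans` is what resolves overlapping special-case matches above: it keeps the longest spans and, on ties, the ones that start earlier. A small standalone illustration with made-up spans:

```python
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("one two three four")
overlapping = [Span(doc, 0, 3, label="LONG"), Span(doc, 2, 4, label="SHORT")]
# The longer span wins; the shorter overlapping span is dropped.
print([(s.text, s.label_) for s in filter_spans(overlapping)])
# expected: [('one two three', 'LONG')]
```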


@@ -786,6 +786,7 @@ rather than performance:

```python
def tokenizer_pseudo_code(
    text,
    special_cases,
    prefix_search,
    suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
            tokens.append(substring)
            substring = ""
        tokens.extend(reversed(suffixes))
    for match in matcher(special_cases, text):
        tokens.replace(match, special_cases[match])
    return tokens
```

The algorithm can be summarized as follows:

1. Iterate over space-separated substrings.
2. Look for a token match. If there is a match, stop processing and keep this
   token.
3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
8. Look for "infixes" stuff like hyphens etc. and split the substring into
   tokens on all infixes.
9. Once we can't consume any more of the string, handle it as a single token.
10. Make a final pass over the text to check for special cases that include
    spaces or that were missed due to the incremental processing of affixes.
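
The final pass in step 10 corresponds to the `matcher(special_cases, text)` lines in the pseudo-code above. A rough standalone sketch of that pass follows; this is an illustration only, while the change itself uses the tokenizer's `_special_matcher` over a `Doc`:

```python
# Replace runs of tokens whose joined text forms a special case.
def replace_special_cases(tokens, spaces, special_cases):
    final = []
    i = 0
    while i < len(tokens):
        replaced = False
        # Try the longest candidate starting at token i first.
        for j in range(len(tokens), i, -1):
            candidate = "".join(
                t + (" " if s else "") for t, s in zip(tokens[i:j], spaces[i:j])
            ).rstrip()
            if candidate in special_cases:
                final.extend(special_cases[candidate])
                i = j
                replaced = True
                break
        if not replaced:
            final.append(tokens[i])
            i += 1
    return final

# "a." was split into "a" + "." by the suffix rule; the final pass restores it.
print(replace_special_cases(
    ["a", "/", "a", "."], [False, False, False, False], {"a.": ["a."]}
))  # ['a', '/', 'a.']
```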
</Accordion>