Fix initial special cases for Tokenizer.explain (#10460)

Add the missing initial check for special cases to `Tokenizer.explain`
to align with `Tokenizer._tokenize_affixes`.
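
For orientation, a minimal sketch of the invariant at stake, using only existing public API (`spacy.blank`, `Tokenizer.explain`): the (pattern, substring) pairs reported by `explain` should line up token-for-token with the tokenizer's real output.

# Sketch: explain() is a debugging aid, so its reported substrings should
# match the tokens the tokenizer actually produces, including when a
# special case applies to a whole space-separated substring up front.
import spacy

nlp = spacy.blank("en")
text = "Let's explain this."
tokens = [t.text for t in nlp(text)]
explained = [t[1] for t in nlp.tokenizer.explain(text)]
assert tokens == explained  # the property this commit makes hold in all cases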
Adriane Boyd 2022-03-11 10:50:47 +01:00 committed by GitHub
parent 01ec6349ea
commit 297dd82c86
3 changed files with 34 additions and 11 deletions


@@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab):
     assert tokens == ["±10", "%"]
     explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_initial_special_case_explain(en_vocab):
+    tokenizer = Tokenizer(
+        en_vocab,
+        token_match=re.compile("^id$").match,
+        rules={
+            "id": [{"ORTH": "i"}, {"ORTH": "d"}],
+        }
+    )
+    tokens = [t.text for t in tokenizer("id")]
+    explain_tokens = [t[1] for t in tokenizer.explain("id")]
+    assert tokens == explain_tokens


@@ -643,6 +643,10 @@ cdef class Tokenizer:
         for substring in text.split():
             suffixes = []
             while substring:
+                if substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
+                    continue
                 while prefix_search(substring) or suffix_search(substring):
                     if token_match(substring):
                         tokens.append(("TOKEN_MATCH", substring))
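
In effect, with the same tokenizer setup as the new test, the added branch makes `explain` emit one `SPECIAL-n` entry per subtoken of the matched rule before any affix or token-match handling. A hedged illustration; the expected output below follows from the added lines, not from running every spaCy version:

import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
tokenizer = Tokenizer(
    nlp.vocab,
    token_match=re.compile("^id$").match,
    rules={"id": [{"ORTH": "i"}, {"ORTH": "d"}]},
)
print([t.text for t in tokenizer("id")])  # ['i', 'd']: the special case wins
print(tokenizer.explain("id"))
# With this fix: [('SPECIAL-1', 'i'), ('SPECIAL-2', 'd')]
# Without the initial check, token_match was consulted first and explain()
# returned [('TOKEN_MATCH', 'id')], disagreeing with the tokenizer itself.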


@@ -799,6 +799,10 @@ def tokenizer_pseudo_code(
     for substring in text.split():
         suffixes = []
         while substring:
+            if substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ""
+                continue
             while prefix_search(substring) or suffix_search(substring):
                 if token_match(substring):
                     tokens.append(substring)
@@ -851,20 +855,22 @@ def tokenizer_pseudo_code(
 The algorithm can be summarized as follows:
 1. Iterate over space-separated substrings.
-2. Look for a token match. If there is a match, stop processing and keep this
-   token.
-3. Check whether we have an explicitly defined special case for this substring.
+2. Check whether we have an explicitly defined special case for this substring.
    If we do, use it.
-4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
+3. Look for a token match. If there is a match, stop processing and keep this
+   token.
+4. Check whether we have an explicitly defined special case for this substring.
+   If we do, use it.
+5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3,
    so that the token match and special cases always get priority.
-5. If we didn't consume a prefix, try to consume a suffix and then go back to
-   #2.
-6. If we can't consume a prefix or a suffix, look for a URL match.
-7. If there's no URL match, then look for a special case.
-8. Look for "infixes" (stuff like hyphens etc.) and split the substring into
+6. If we didn't consume a prefix, try to consume a suffix and then go back to
+   #3.
+7. If we can't consume a prefix or a suffix, look for a URL match.
+8. If there's no URL match, then look for a special case.
+9. Look for "infixes" (stuff like hyphens etc.) and split the substring into
    tokens on all infixes.
-9. Once we can't consume any more of the string, handle it as a single token.
-10. Make a final pass over the text to check for special cases that include
+10. Once we can't consume any more of the string, handle it as a single token.
+11. Make a final pass over the text to check for special cases that include
     spaces or that were missed due to the incremental processing of affixes.
 </Accordion>
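
Putting the summary together, here is a self-contained, runnable sketch of the updated pseudo-code with the new step 2 marked. The affix patterns and special cases in the driver are toy stand-ins for illustration, not spaCy's real rules, and the function is simplified from the docs' pseudo-code rather than the actual Cython implementation.

import re

def tokenize_pseudo(text, special_cases, prefix_search, suffix_search,
                    infix_finditer, token_match, url_match):
    tokens = []
    for substring in text.split():
        suffixes = []
        while substring:
            # Step 2 (new in this commit): a special case covering the
            # whole remaining substring wins before anything else is tried.
            if substring in special_cases:
                tokens.extend(special_cases[substring])
                substring = ""
                continue
            while prefix_search(substring) or suffix_search(substring):
                # Steps 3-4: token match and special cases get priority.
                if token_match(substring):
                    tokens.append(substring)
                    substring = ""
                    break
                if substring in special_cases:
                    tokens.extend(special_cases[substring])
                    substring = ""
                    break
                # Step 5: consume one prefix, then re-check from step 3.
                if prefix_search(substring):
                    split = prefix_search(substring).end()
                    tokens.append(substring[:split])
                    substring = substring[split:]
                    if substring in special_cases:
                        continue
                # Step 6: otherwise consume one suffix and re-check.
                if suffix_search(substring):
                    split = suffix_search(substring).start()
                    suffixes.append(substring[split:])
                    substring = substring[:split]
            # Back at step 3, then steps 7-9: token match, URL match,
            # special case, and finally infixes.
            if token_match(substring):
                tokens.append(substring)
                substring = ""
            elif url_match(substring):
                tokens.append(substring)
                substring = ""
            elif substring in special_cases:
                tokens.extend(special_cases[substring])
                substring = ""
            elif list(infix_finditer(substring)):
                offset = 0
                for match in infix_finditer(substring):
                    tokens.append(substring[offset : match.start()])
                    tokens.append(substring[match.start() : match.end()])
                    offset = match.end()
                if substring[offset:]:
                    tokens.append(substring[offset:])
                substring = ""
            elif substring:
                # Step 10: nothing more to consume, keep it as one token.
                tokens.append(substring)
                substring = ""
        tokens.extend(reversed(suffixes))
    return tokens

# Toy driver: "id" is both a token_match and a special case; the new
# initial check makes the special case win, as in the tokenizer itself.
special_cases = {"id": ["i", "d"], "don't": ["do", "n't"]}
prefix_search = re.compile(r"""^["'(\[]""").search
suffix_search = re.compile(r"""[!.,?"')\]]$""").search
infix_finditer = re.compile(r"-").finditer
token_match = re.compile(r"^id$").match
url_match = re.compile(r"^https?://\S+$").match

print(tokenize_pseudo("id don't re-run!", special_cases, prefix_search,
                      suffix_search, infix_finditer, token_match, url_match))
# -> ['i', 'd', 'do', "n't", 're', '-', 'run', '!']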