From 297dd82c86372c7aa0a181e55dc72512718aafe8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 11 Mar 2022 10:50:47 +0100
Subject: [PATCH] Fix initial special cases for Tokenizer.explain (#10460)

Add the missing initial check for special cases to `Tokenizer.explain`
to align with `Tokenizer._tokenize_affixes`.
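
For reference, a minimal sketch of the mismatch this addresses (not part
of the diff; the regex and special case are made up for illustration and
mirror the new test):

    import re
    import spacy

    nlp = spacy.blank("en")
    # "id" matches token_match, but a special case splits it into "i" + "d".
    nlp.tokenizer.token_match = re.compile("^id$").match
    nlp.tokenizer.add_special_case("id", [{"ORTH": "i"}, {"ORTH": "d"}])

    print([t.text for t in nlp("id")])                  # ['i', 'd']
    print([t[1] for t in nlp.tokenizer.explain("id")])  # previously ['id']
                                                        # (TOKEN_MATCH), now
                                                        # ['i', 'd'] (SPECIAL-1/2)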
---
 spacy/tests/tokenizer/test_tokenizer.py   | 13 +++++++++++
 spacy/tokenizer.pyx                       |  4 ++++
 website/docs/usage/linguistic-features.md | 28 ++++++++++++++---------
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index a7270cb1e..ed11508b4 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -521,3 +521,16 @@ def test_tokenizer_infix_prefix(en_vocab):
     assert tokens == ["±10", "%"]
     explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_initial_special_case_explain(en_vocab):
+    tokenizer = Tokenizer(
+        en_vocab,
+        token_match=re.compile("^id$").match,
+        rules={
+            "id": [{"ORTH": "i"}, {"ORTH": "d"}],
+        },
+    )
+    tokens = [t.text for t in tokenizer("id")]
+    explain_tokens = [t[1] for t in tokenizer.explain("id")]
+    assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 91f228032..ac55a61f3 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -643,6 +643,10 @@ cdef class Tokenizer:
         for substring in text.split():
             suffixes = []
             while substring:
+                if substring in special_cases:
+                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    substring = ''
+                    continue
                 while prefix_search(substring) or suffix_search(substring):
                     if token_match(substring):
                         tokens.append(("TOKEN_MATCH", substring))
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index f8baf5588..c3f25565a 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -799,6 +799,10 @@ def tokenizer_pseudo_code(
     for substring in text.split():
         suffixes = []
         while substring:
+            if substring in special_cases:
+                tokens.extend(special_cases[substring])
+                substring = ""
+                continue
             while prefix_search(substring) or suffix_search(substring):
                 if token_match(substring):
                     tokens.append(substring)
@@ -851,20 +855,22 @@ def tokenizer_pseudo_code(
 The algorithm can be summarized as follows:
 
 1. Iterate over space-separated substrings.
-2. Look for a token match. If there is a match, stop processing and keep this
-   token.
-3. Check whether we have an explicitly defined special case for this substring.
+2. Check whether we have an explicitly defined special case for this substring.
    If we do, use it.
-4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2,
+3. Look for a token match. If there is a match, stop processing and keep this
+   token.
+4. Check whether we have an explicitly defined special case for this substring.
+   If we do, use it.
+5. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #3,
    so that the token match and special cases always get priority.
-5. If we didn't consume a prefix, try to consume a suffix and then go back to
-   #2.
-6. If we can't consume a prefix or a suffix, look for a URL match.
-7. If there's no URL match, then look for a special case.
-8. Look for "infixes" – stuff like hyphens etc. and split the substring into
+6. If we didn't consume a prefix, try to consume a suffix and then go back to
+   #3.
+7. If we can't consume a prefix or a suffix, look for a URL match.
+8. If there's no URL match, then look for a special case.
+9. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
-9. Once we can't consume any more of the string, handle it as a single token.
-10. Make a final pass over the text to check for special cases that include
+10. Once we can't consume any more of the string, handle it as a single token.
+11. Make a final pass over the text to check for special cases that include
     spaces or that were missed due to the incremental processing of affixes.
 
 </Accordion>