mirror of https://github.com/explosion/spaCy.git
Fix infix as prefix in Tokenizer.explain (#10140)
* Fix infix as prefix in `Tokenizer.explain`: update `Tokenizer.explain` to align with the `Tokenizer` algorithm by skipping infix matches that are prefixes of the current substring. * Update the tokenizer pseudocode in the docs.
This commit is contained in:
parent
30cf9d6a05
commit
4f441dfa24
|
@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
|
|||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
|
||||
from spacy.util import compile_infix_regex
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.symbols import ORTH
|
||||
|
||||
|
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
|
|||
assert tokens == ["a", "10", "."]
|
||||
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
|
||||
assert tokens == explain_tokens
|
||||
|
||||
|
||||
def test_tokenizer_infix_prefix(en_vocab):
    """Regression test: an infix match at the start of the remaining
    substring (i.e. acting as a prefix) must be skipped by
    ``Tokenizer.explain`` exactly as the tokenizer itself skips it,
    so that "±10%" splits as ["±10", "%"], not ["±", "10", "%"].
    """
    # NOTE: the infix pattern "±" also matches at offset 0 of "±10%";
    # the tokenizer ignores such matches, and explain() must agree.
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    # The real tokenizer keeps "±10" together and splits off the "%" suffix.
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    # explain() must produce the same segmentation as the tokenizer.
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
|
||||
|
|
|
@ -683,6 +683,8 @@ cdef class Tokenizer:
|
|||
infixes = infix_finditer(substring)
|
||||
offset = 0
|
||||
for match in infixes:
|
||||
if offset == 0 and match.start() == 0:
|
||||
continue
|
||||
if substring[offset : match.start()]:
|
||||
tokens.append(("TOKEN", substring[offset : match.start()]))
|
||||
if substring[match.start() : match.end()]:
|
||||
|
|
|
@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
|
|||
infixes = infix_finditer(substring)
|
||||
offset = 0
|
||||
for match in infixes:
|
||||
if offset == 0 and match.start() == 0:
|
||||
continue
|
||||
tokens.append(substring[offset : match.start()])
|
||||
tokens.append(substring[match.start() : match.end()])
|
||||
offset = match.end()
|
||||
|
|
Loading…
Reference in New Issue