From 4f441dfa24a890f4a112ae375ef7daf1a7c21ffd Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 28 Jan 2022 17:00:54 +0100
Subject: [PATCH] Fix infix as prefix in Tokenizer.explain (#10140)

* Fix infix as prefix in Tokenizer.explain

Update `Tokenizer.explain` to align with the `Tokenizer` algorithm: skip
infix matches that begin at the start of the current substring, since the
tokenizer treats those as part of the token rather than as infix splits.

* Update tokenizer pseudocode in docs
---
 spacy/tests/tokenizer/test_tokenizer.py   | 18 ++++++++++++++++++
 spacy/tokenizer.pyx                       |  2 ++
 website/docs/usage/linguistic-features.md |  2 ++
 3 files changed, 22 insertions(+)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index c2aeffcb5..a7270cb1e 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
 from spacy.vocab import Vocab
 from spacy.symbols import ORTH
 
@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     assert tokens == ["a", "10", "."]
     explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # an infix match at the start of the substring is not split off as an infix
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 4a148b356..91f228032 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -683,6 +683,8 @@ cdef class Tokenizer:
                 infixes = infix_finditer(substring)
                 offset = 0
                 for match in infixes:
+                    if offset == 0 and match.start() == 0:  # the infix match is a prefix here; skip it
+                        continue
                     if substring[offset : match.start()]:
                         tokens.append(("TOKEN", substring[offset : match.start()]))
                     if substring[match.start() : match.end()]:
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index f748fa8d6..f8baf5588 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
                 infixes = infix_finditer(substring)
                 offset = 0
                 for match in infixes:
+                    if offset == 0 and match.start() == 0:  # skip infix-as-prefix matches
+                        continue
                     tokens.append(substring[offset : match.start()])
                     tokens.append(substring[match.start() : match.end()])
                     offset = match.end()
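
Note for reviewers: a minimal repro sketch of the mismatch this patch fixes, mirroring the new test above (the `±` infix, `%` suffix, and `±10%` input all come from `test_tokenizer_infix_prefix`; assumes a spaCy install that includes this change):

    import spacy
    from spacy.tokenizer import Tokenizer
    from spacy.util import compile_infix_regex, compile_suffix_regex

    nlp = spacy.blank("en")
    infix_re = compile_infix_regex(["±"])
    suffix_re = compile_suffix_regex(["%"])
    tokenizer = Tokenizer(
        nlp.vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )

    # The tokenizer never splits on an infix match at the start of a
    # substring, so "±10%" becomes ["±10", "%"], not ["±", "10", "%"].
    tokens = [t.text for t in tokenizer("±10%")]
    print(tokens)  # ['±10', '%']

    # With this patch, Tokenizer.explain reports the same segmentation;
    # before it, explain split "±" off as an INFIX token.
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens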
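
To see why the new `offset == 0 and match.start() == 0` guard produces this result, here is a hypothetical standalone trace of the pseudocode's infix loop using plain `re` (the substring "±10" is what remains of "±10%" once the suffix pass strips "%"):

    import re

    substring = "±10"  # "±10%" after the "%" suffix has been stripped
    tokens = []
    offset = 0
    for match in re.finditer("±", substring):
        # This match starts at position 0. Splitting there would emit an
        # empty slice and detach "±", which the tokenizer itself never
        # does, so explain now skips the match as well.
        if offset == 0 and match.start() == 0:
            continue
        tokens.append(substring[offset : match.start()])
        tokens.append(substring[match.start() : match.end()])
        offset = match.end()
    if substring[offset:]:
        tokens.append(substring[offset:])

    print(tokens)  # ['±10']: the whole substring survives as one token

The same guard is added to both `Tokenizer.explain` and the docs pseudocode, so both stay aligned with the tokenizer's actual behavior.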