mirror of https://github.com/explosion/spaCy.git
Fix infix as prefix in Tokenizer.explain (#10140)
* Fix infix as prefix in `Tokenizer.explain`: update `Tokenizer.explain` to align with the `Tokenizer` algorithm by skipping infix matches that are prefixes of the current substring. * Update the tokenizer pseudocode in the docs.
This commit is contained in:
parent
30cf9d6a05
commit
4f441dfa24
|
@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
|
|||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
|
||||
from spacy.util import compile_infix_regex
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.symbols import ORTH
|
||||
|
||||
|
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
|
|||
assert tokens == ["a", "10", "."]
|
||||
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
|
||||
assert tokens == explain_tokens
|
||||
|
||||
|
||||
def test_tokenizer_infix_prefix(en_vocab):
    """Regression test: an infix match at the start of the remaining
    substring (i.e. acting as a prefix) must be skipped by
    ``Tokenizer.explain`` exactly as the tokenizer itself skips it,
    so that "±10%" splits as ["±10", "%"], not ["±", "10", "%"].
    """
    # NOTE: the infix pattern "±" also matches at offset 0 of "±10%";
    # the tokenizer ignores such matches, and explain() must agree.
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    # The real tokenizer keeps "±10" together and splits off the "%" suffix.
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    # explain() must produce the same segmentation as the tokenizer.
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
|
||||
|
|
|
@ -683,6 +683,8 @@ cdef class Tokenizer:
|
|||
infixes = infix_finditer(substring)
|
||||
offset = 0
|
||||
for match in infixes:
|
||||
if offset == 0 and match.start() == 0:
|
||||
continue
|
||||
if substring[offset : match.start()]:
|
||||
tokens.append(("TOKEN", substring[offset : match.start()]))
|
||||
if substring[match.start() : match.end()]:
|
||||
|
|
|
@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
|
|||
infixes = infix_finditer(substring)
|
||||
offset = 0
|
||||
for match in infixes:
|
||||
if offset == 0 and match.start() == 0:
|
||||
continue
|
||||
tokens.append(substring[offset : match.start()])
|
||||
tokens.append(substring[match.start() : match.end()])
|
||||
offset = match.end()
|
||||
|
|
Loading…
Reference in New Issue