diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 0a10ae67d..5b4eeca16 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,7 +1,13 @@ -import pytest import re -from spacy.util import get_lang_class +import string + +import hypothesis +import hypothesis.strategies +import pytest + +import spacy from spacy.tokenizer import Tokenizer +from spacy.util import get_lang_class # Only include languages with no external dependencies # "is" seems to confuse importlib, so we're also excluding it for now @@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab): tokens = [t.text for t in tokenizer("a/a.")] explain_tokens = [t[1] for t in tokenizer.explain("a/a.")] assert tokens == explain_tokens + + +@hypothesis.strategies.composite +def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: + """ + Composite strategy for fuzzily generating sentences with varying punctuation. + + draw (hypothesis.strategies.DrawFn): Protocol for drawing function that allows fuzzily picking from hypothesis' + strategies. + max_n_words (int): Max. number of words in generated sentence. + RETURNS (str): Fuzzily generated sentence. + """ + + punctuation_and_space_regex = "|".join( + [*[re.escape(p) for p in string.punctuation], r"\s"] + ) + sentence = [ + [ + draw(hypothesis.strategies.text(min_size=1)), + draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)), + ] + for _ in range( + draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words)) + ) + ] + + return " ".join([token for token_pair in sentence for token in token_pair]) + + +@pytest.mark.xfail +@pytest.mark.parametrize("lang", LANGUAGES) +@hypothesis.given(sentence=sentence_strategy()) +def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None: + """ + Tests whether output of tokenizer.explain() matches tokenizer output. Input generated by hypothesis. 
+ lang (str): Language to test. + sentence (str): Fuzzily generated sentence to tokenize. + """ + + tokenizer: Tokenizer = spacy.blank(lang).tokenizer + tokens = [t.text for t in tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in tokenizer.explain(sentence)] + assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"