Fuzz tokenizer.explain: draft for fuzzy tests. (#10771)

* Fuzz tokenizer.explain: draft for fuzzy tests. * Fuzz tokenizer.explain: xfailing tokenizer.explain() tests. Removed deadline modification. Removed LANGUAGES_WITHOUT_TOKENIZERS. * Fuzz tokenizer.explain: changed tokenizer initialization to avoid failures in Azure runs. * Fuzz tokenizer.explain: type hint for tokenizer in test. Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2022-05-17 10:23:16 +02:00 · 2022-05-17 10:23:16 +02:00 · 357be2614e
parent 99aeaf9bd3
commit 357be2614e
1 changed files with 51 additions and 2 deletions
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@ -1,7 +1,13 @@
-import pytest
 import re
-from spacy.util import get_lang_class
+import string
+
+import hypothesis
+import hypothesis.strategies
+import pytest
+
+import spacy
 from spacy.tokenizer import Tokenizer
+from spacy.util import get_lang_class

 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab):
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens
+
+
+@hypothesis.strategies.composite
+def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
+    """
+    Composite hypothesis strategy producing a fuzzed sentence: randomly drawn text
+    chunks interleaved with strings drawn from a punctuation-or-whitespace regex.
+
+    draw (hypothesis.strategies.DrawFn): Drawing function supplied by hypothesis, used to pick a value from a
+                                         strategy.
+    max_n_words (int): Upper bound on the number of (text, separator) pairs drawn; the lower bound is fixed at 2.
+    RETURNS (str): All drawn pieces joined with single spaces.
+    """
+
+    # One regex alternative per escaped character of string.punctuation, plus `\s`.
+    alternatives = [re.escape(char) for char in string.punctuation]
+    alternatives.append(r"\s")
+    word_strategy = hypothesis.strategies.text(min_size=1)
+    separator_strategy = hypothesis.strategies.from_regex("|".join(alternatives))
+
+    # Draw order matters for hypothesis: first the word count, then text followed by
+    # separator for every word.
+    n_words = draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
+    pieces = [
+        draw(strategy)
+        for _ in range(n_words)
+        for strategy in (word_strategy, separator_strategy)
+    ]
+
+    return " ".join(pieces)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("lang", LANGUAGES)
+@hypothesis.given(sentence=sentence_strategy())
+def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
+    """
+    Checks that the token texts reported by tokenizer.explain() equal the non-space token
+    texts produced by the tokenizer itself, on input generated by hypothesis.
+
+    lang (str): Language to test.
+    sentence (str): Fuzzily generated sentence to tokenize.
+    """
+
+    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
+    doc = tokenizer(sentence)
+    explanation = tokenizer.explain(sentence)
+
+    # Tokens flagged as whitespace are dropped from the tokenizer output before comparing.
+    tokens = [token.text for token in doc if not token.is_space]
+    # Each explain() entry is indexed at position 1 to obtain the token text.
+    debug_tokens = [entry[1] for entry in explanation]
+    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"