spaCy/spacy/tests/lang/dsb/test_tokenizer.py

30 lines
749 B
Python

import pytest
DSB_BASIC_TOKENIZATION_TESTS = [
(
"Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
[
"Ale",
"eksistěrujo",
"mimo",
"togo",
"ceła",
"kopica",
"narěcow",
",",
"ako",
"na",
"pśikład",
"slěpjańska",
".",
],
),
]
@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
tokens = dsb_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list