# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.util import get_lang_class
# Only include languages with no external dependencies
# "is" seems to confuse importlib, so we're also excluding it for now
# excluded: ja, ru, th, uk, vi, zh, is
# Language codes used to parametrize test_tokenizer_explain.
# Entries wrapped in pytest.param(..., marks=pytest.mark.xfail()) are
# expected to fail for that language.
# NOTE(review): only xfail-marked entries are visible here -- confirm whether
# plain language codes (e.g. "en") belong in this list as well.
LANGUAGES = [
    pytest.param("ca", marks=pytest.mark.xfail()),
    pytest.param("fr", marks=pytest.mark.xfail()),
    pytest.param("hu", marks=pytest.mark.xfail()),
    pytest.param("pl", marks=pytest.mark.xfail()),
]
# @pytest.mark.slow
@pytest.mark.parametrize("lang", LANGUAGES)
def test_tokenizer_explain(lang):
    """Check that ``tokenizer.explain`` agrees with the tokenizer output.

    For each example sentence of the given language, the texts of the
    non-whitespace tokens produced by the tokenizer must equal the second
    element of every entry returned by ``tokenizer.explain``.

    The test is skipped (via ``pytest.importorskip``) when the language's
    ``examples`` module cannot be imported.
    """
    language = get_lang_class(lang)()
    examples = pytest.importorskip("spacy.lang.{}.examples".format(lang))
    tokenize = language.tokenizer
    for sentence in examples.sentences:
        # Whitespace tokens are left out of the comparison.
        token_texts = []
        for token in tokenize(sentence):
            if not token.is_space:
                token_texts.append(token.text)
        explained_texts = [entry[1] for entry in tokenize.explain(sentence)]
        assert token_texts == explained_texts