from __future__ import unicode_literals

from spacy.util import utf8open

import pytest
from os import path


# Directory containing this test module; data files are located relative to it.
HERE = path.split(__file__)[0]


@pytest.fixture
def sun_txt():
    """Return the entire contents of ``sun.txt`` as read through ``utf8open``.

    The file is looked up one directory above this module (``HERE/../sun.txt``).
    """
    loc = path.join(HERE, '..', 'sun.txt')
    # Read inside a ``with`` block so the handle is closed deterministically
    # instead of being left for the garbage collector.
    # NOTE(review): relies on utf8open returning a file object that supports
    # the context-manager protocol (true for codecs.open / io.open) - confirm.
    with utf8open(loc) as file_:
        return file_.read()


def test_tokenize(sun_txt, en_tokenizer):
    """Check that tokenizing the ``sun_txt`` text yields more than 100 tokens.

    ``sun_txt`` is the fixture text; ``en_tokenizer`` is a fixture supplied
    outside this module (presumably the project's conftest - verify).
    """
    # An empty text would make the token-count check below meaningless.
    char_count = len(sun_txt)
    assert char_count != 0

    doc = en_tokenizer(sun_txt)
    token_count = len(doc)
    assert token_count > 100