spaCy/tests/tokenizer/test_wiki_sun.py

22 lines
385 B
Python
Raw Normal View History

from __future__ import unicode_literals
from spacy.util import utf8open
import pytest
from os import path
# Directory containing this test module; data file paths are built relative to it.
HERE = path.dirname(__file__)
2014-07-07 03:11:04 +00:00
@pytest.fixture
def sun_txt():
    """Return the full text of ``sun.txt``, found one directory above this module.

    The file is read inside a ``with`` block so the handle is closed
    deterministically instead of being left for the garbage collector.
    """
    loc = path.join(HERE, '..', 'sun.txt')
    # NOTE(review): relies on utf8open returning a regular file object that
    # supports the context-manager protocol -- confirm against spacy.util.
    with utf8open(loc) as file_:
        return file_.read()
2015-06-07 16:07:32 +00:00
def test_tokenize(sun_txt, en_tokenizer):
    """Check that the sample text is non-empty and yields more than 100 tokens."""
    text_length = len(sun_txt)
    assert text_length != 0

    # Run the tokenizer over the whole text and count what comes back.
    doc = en_tokenizer(sun_txt)
    token_count = len(doc)
    assert token_count > 100