From e60b958b7d218264d74ed5921445920db925ffa1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Jul 2014 05:11:31 +0200 Subject: [PATCH] * Add test to check how well we match ptb tokenizer. Needs more text. --- tests/test_ptb_match_wiki_sun.py | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/test_ptb_match_wiki_sun.py diff --git a/tests/test_ptb_match_wiki_sun.py b/tests/test_ptb_match_wiki_sun.py new file mode 100644 index 000000000..b50d6913d --- /dev/null +++ b/tests/test_ptb_match_wiki_sun.py @@ -0,0 +1,46 @@ +from __future__ import unicode_literals + +from spacy.en import unhash +from spacy import lex_of +from spacy.util import utf8open +from spacy.en_ptb import tokenize, lookup, unhash + +import pytest +import os +from os import path + + +HERE = path.dirname(__file__) + + +@pytest.fixture +def sun_txt(): + loc = path.join(HERE, 'sun.txt') + return utf8open(loc).read() + + +@pytest.fixture +def my_tokens(sun_txt): + assert len(sun_txt) != 0 + tokens = tokenize(sun_txt) + return [unhash(lex_of(t)) for t in tokens] + + +@pytest.fixture +def sed_tokens(): + loc = path.join(HERE, 'sun.tokens') + return utf8open(loc).read().split() + + +def test_compare_tokens(my_tokens, sed_tokens): + me = my_tokens + sed = sed_tokens + i = 0 + while i < len(me) and i < len(sed): + assert me[i] == sed[i] + i += 1 + + assert len(me) == len(sed) + + +