From 0d9972f4b05e82dee969a04ee860dce3956c06f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Dec 2014 20:38:27 +1100 Subject: [PATCH] * Upd tokenizer test --- tests/test_tokenizer.py | 76 ++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 21d115b9b..e3f4aff0e 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,22 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.en import EN +import pytest + +from spacy.en import English -def test_single_word(): - tokens = EN.tokenize(u'hello') +@pytest.fixture +def EN(): + return English(pos_tag=False, parse=False) + +def test_single_word(EN): + tokens = EN(u'hello') assert tokens[0].string == 'hello' -def test_two_words(): - tokens = EN.tokenize('hello possums') +def test_two_words(EN): + tokens = EN('hello possums') assert len(tokens) == 2 assert tokens[0].string != tokens[1].string -def test_punct(): - tokens = EN.tokenize('hello, possums.') +def test_punct(EN): + tokens = EN('hello, possums.') assert len(tokens) == 4 assert tokens[0].string == 'hello' assert tokens[1].string == ',' @@ -24,33 +30,33 @@ def test_punct(): assert tokens[1].string != 'hello' -def test_digits(): - tokens = EN.tokenize('The year: 1984.') +def test_digits(EN): + tokens = EN('The year: 1984.') assert len(tokens) == 5 - assert tokens[0].sic == EN.lexicon['The']['sic'] - assert tokens[3].sic == EN.lexicon['1984']['sic'] + assert tokens[0].sic == EN.vocab['The']['sic'] + assert tokens[3].sic == EN.vocab['1984']['sic'] -def test_contraction(): - tokens = EN.tokenize("don't giggle") +def test_contraction(EN): + tokens = EN("don't giggle") assert len(tokens) == 3 - assert tokens[1].sic == EN.lexicon["n't"]['sic'] - tokens = EN.tokenize("i said don't!") + assert tokens[1].sic == EN.vocab["n't"]['sic'] + tokens = EN("i said don't!") assert len(tokens) == 5 - assert tokens[4].sic == EN.lexicon['!']['sic'] + assert tokens[4].sic == EN.vocab['!']['sic'] -def test_contraction_punct(): - tokens = EN.tokenize("(can't") +def test_contraction_punct(EN): + tokens = EN("(can't") assert len(tokens) == 3 - tokens = EN.tokenize("`ain't") + tokens = EN("`ain't") assert len(tokens) == 3 - tokens = EN.tokenize('''"isn't''') + tokens = EN('''"isn't''') assert len(tokens) == 3 - tokens = EN.tokenize("can't!") + tokens = EN("can't!") assert len(tokens) == 3 -def test_sample(): +def test_sample(EN): text = """Tributes pour in for late British Labour Party leader Tributes poured in from around the world Thursday @@ -62,45 +68,45 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. "Mr. Smith, throughout his distinguished""" - tokens = EN.tokenize(text) + tokens = EN(text) assert len(tokens) > 5 -def test_cnts1(): +def test_cnts1(EN): text = u"""The U.S. Army likes Shock and Awe.""" - tokens = EN.tokenize(text) + tokens = EN(text) assert len(tokens) == 8 -def test_cnts2(): +def test_cnts2(EN): text = u"""U.N. regulations are not a part of their concern.""" - tokens = EN.tokenize(text) + tokens = EN(text) assert len(tokens) == 10 -def test_cnts3(): +def test_cnts3(EN): text = u"“Isn't it?”" - tokens = EN.tokenize(text) + tokens = EN(text) words = [t.string for t in tokens] assert len(words) == 6 -def test_cnts4(): +def test_cnts4(EN): text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ - tokens = EN.tokenize(text) + tokens = EN(text) words = [t.string for t in tokens] assert len(words) == 15 -def test_cnts5(): +def test_cnts5(EN): text = """'Me too!', Mr. P. Delaware cried. """ - tokens = EN.tokenize(text) + tokens = EN(text) assert len(tokens) == 11 -def test_cnts6(): +def test_cnts6(EN): text = u'They ran about 10km.' - tokens = EN.tokenize(text) + tokens = EN(text) words = [t.string for t in tokens] assert len(words) == 6