From 7195c07897fe117f0f2c3988e4d7047dc6b47f03 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 Jun 2015 21:29:17 +0200
Subject: [PATCH] * Add explicit tokenizer test

---
 tests/tokenizer/test_tokenizer.py | 37 ++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tests/tokenizer/test_tokenizer.py b/tests/tokenizer/test_tokenizer.py
index 804e09114..ed2bfddf2 100644
--- a/tests/tokenizer/test_tokenizer.py
+++ b/tests/tokenizer/test_tokenizer.py
@@ -72,53 +72,58 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     assert len(tokens) > 5
 
 
-def test_cnts1(EN):
+def test_cnts1(en_tokenizer):
     text = u"""The U.S. Army likes Shock and Awe."""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 8
 
 
-def test_cnts2(EN):
+def test_cnts2(en_tokenizer):
     text = u"""U.N. regulations are not a part of their concern."""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 10
 
 
-def test_cnts3(EN):
+def test_cnts3(en_tokenizer):
     text = u"“Isn't it?”"
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 6
 
 
-def test_cnts4(EN):
+def test_cnts4(en_tokenizer):
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 15
 
 
-def test_cnts5(EN):
+def test_cnts5(en_tokenizer):
     text = """'Me too!', Mr. P. Delaware cried. """
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 11
 
+def test_mr(en_tokenizer):
+    text = """Mr. Smith"""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
 
-def test_cnts6(EN):
+
+def test_cnts6(en_tokenizer):
     text = u'They ran about 10km.'
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     words = [t.orth_ for t in tokens]
     assert len(words) == 6
 
 
-def test_bracket_period(EN):
+def test_bracket_period(en_tokenizer):
     text = u'(And a 6a.m. run through Washington Park).'
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert tokens[len(tokens) - 1].orth_ == u'.'
 
 
-def test_ie(EN):
+def test_ie(en_tokenizer):
     text = u"It's mediocre i.e. bad."
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 6
     assert tokens[3].orth_ == "i.e."
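
Note (not part of the patch): the en_tokenizer fixture these tests now
request is defined outside this diff, typically in the suite's
conftest.py. A minimal sketch of what such a fixture might look like,
assuming the 2015-era spacy.en.English entry point and its tokenizer
attribute; neither is confirmed by this patch:

    # conftest.py -- hypothetical sketch, not taken from this patch.
    import pytest

    from spacy.en import English


    @pytest.fixture(scope="session")
    def en_tokenizer():
        # Load the English pipeline once per session and expose only
        # its tokenizer, so calling en_tokenizer(text) runs
        # tokenization in isolation rather than the full pipeline.
        return English().tokenizer

Requesting the bare tokenizer instead of the old EN pipeline fixture
keeps these token-count assertions independent of the tagger and
parser, which is presumably the point of renaming the parameter.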