From db191361eedb83b4f1ed3e7b923c6b18cd5e2539 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 15 Sep 2014 06:31:58 +0200
Subject: [PATCH] * Add new tests for fancier tokenization cases

---
 tests/test_tokenizer.py | 47 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 03f728ada..fcdcdc141 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 from spacy.en import EN
@@ -17,19 +18,19 @@ def test_two_words():
 
 def test_punct():
     tokens = EN.tokenize('hello, possums.')
-    assert len(tokens) == 3
+    assert len(tokens) == 4
     assert tokens[0].string == EN.lexicon.lookup('hello').string
     assert tokens[1].string == EN.lexicon.lookup(',').string
-    assert tokens[2].string == EN.lexicon.lookup('possums.').string
+    assert tokens[2].string == EN.lexicon.lookup('possums').string
     assert tokens[1].string != EN.lexicon.lookup('hello').string
 
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984."
-    assert len(lex_ids) == 4
+    assert lex_ids.string(3) == "1984"
+    assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
-    assert lex_ids[3].string == EN.lexicon.lookup('1984.').string
+    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
 
 
 def test_contraction():
@@ -65,3 +66,39 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
     tokens = EN.tokenize(text)
     assert len(tokens) > 5
+
+
+def test_cnts1():
+    text = u"""The U.S. Army likes Shock and Awe."""
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 8
+
+def test_cnts2():
+    text = u"""U.N. regulations are not a part of their concern."""
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 10
+
+def test_cnts3():
+    text = u"“Isn't it?”"
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 6
+
+def test_cnts4():
+    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 15
+
+def test_cnts5():
+    text = """'Me too!', Mr. P. Delaware cried. """
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 11
+
+def test_cnts6():
+    text = u'They ran about 10km.'
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 6
+
+def test_cnts7():
+    text = 'But then the 6,000-year ice age came...'
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 8