From 5dcc1a426a06754339668ac235b5e205e0af069a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 15 Sep 2014 01:32:51 +0200 Subject: [PATCH] * Update tokenization tests for new tokenizer rules --- tests/test_tokenizer.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 12ae2595f..03f728ada 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -17,21 +17,19 @@ def test_two_words(): def test_punct(): tokens = EN.tokenize('hello, possums.') - assert len(tokens) == 4 + assert len(tokens) == 3 assert tokens[0].string == EN.lexicon.lookup('hello').string assert tokens[1].string == EN.lexicon.lookup(',').string - assert tokens[2].string == EN.lexicon.lookup('possums').string + assert tokens[2].string == EN.lexicon.lookup('possums.').string assert tokens[1].string != EN.lexicon.lookup('hello').string def test_digits(): lex_ids = EN.tokenize('The year: 1984.') - assert lex_ids.string(4) == "." - assert lex_ids.string(3) == "1984" - assert len(lex_ids) == 5 + assert lex_ids.string(3) == "1984." + assert len(lex_ids) == 4 assert lex_ids[0].string == EN.lexicon.lookup('The').string - assert lex_ids[3].string == EN.lexicon.lookup('1984').string - assert lex_ids[4].string == EN.lexicon.lookup('.').string + assert lex_ids[3].string == EN.lexicon.lookup('1984.').string def test_contraction(): @@ -53,3 +51,17 @@ def test_contraction_punct(): tokens = EN.tokenize("can't!") assert len(tokens) == 3 +def test_sample(): + text = """Tributes pour in for late British Labour Party leader + +Tributes poured in from around the world Thursday +to the late Labour Party leader John Smith, who died earlier from a massive +heart attack aged 55. + +In Washington, the US State Department issued a statement regretting "the +untimely death" of the rapier-tongued Scottish barrister and parliamentarian. + +"Mr. Smith, throughout his distinguished""" + + tokens = EN.tokenize(text) + assert len(tokens) > 5