From 0d9972f4b05e82dee969a04ee860dce3956c06f0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sun, 21 Dec 2014 20:38:27 +1100
Subject: [PATCH] * Update tokenizer tests: use an English pytest fixture, call EN() directly, lexicon -> vocab

---
 tests/test_tokenizer.py | 76 ++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 35 deletions(-)

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 21d115b9b..e3f4aff0e 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,22 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from spacy.en import EN
+import pytest
+
+from spacy.en import English
 
 
-def test_single_word():
-    tokens = EN.tokenize(u'hello')
+@pytest.fixture(scope="module")  # build once per module and share it, as the old imported EN global was shared
+def EN():
+    return English(pos_tag=False, parse=False)  # pos_tag and parse are passed as False; these tests only inspect tokens
+
+def test_single_word(EN):
+    tokens = EN(u'hello')
     assert tokens[0].string == 'hello'
 
 
-def test_two_words():
-    tokens = EN.tokenize('hello possums')
+def test_two_words(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2
     assert tokens[0].string != tokens[1].string
 
 
-def test_punct():
-    tokens = EN.tokenize('hello, possums.')
+def test_punct(EN):
+    tokens = EN('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].string == 'hello'
     assert tokens[1].string == ','
@@ -24,33 +30,33 @@ def test_punct():
     assert tokens[1].string != 'hello'
 
 
-def test_digits():
-    tokens = EN.tokenize('The year: 1984.')
+def test_digits(EN):
+    tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.lexicon['The']['sic']
-    assert tokens[3].sic == EN.lexicon['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The']['sic']
+    assert tokens[3].sic == EN.vocab['1984']['sic']
 
 
-def test_contraction():
-    tokens = EN.tokenize("don't giggle")
+def test_contraction(EN):
+    tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["n't"]['sic']
-    tokens = EN.tokenize("i said don't!")
+    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.lexicon['!']['sic']
+    assert tokens[4].sic == EN.vocab['!']['sic']
 
 
-def test_contraction_punct():
-    tokens = EN.tokenize("(can't")
+def test_contraction_punct(EN):
+    tokens = EN("(can't")
     assert len(tokens) == 3
-    tokens = EN.tokenize("`ain't")
+    tokens = EN("`ain't")
     assert len(tokens) == 3
-    tokens = EN.tokenize('''"isn't''')
+    tokens = EN('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN.tokenize("can't!")
+    tokens = EN("can't!")
     assert len(tokens) == 3
 
-def test_sample():
+def test_sample(EN):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday 
@@ -62,45 +68,45 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
     
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     assert len(tokens) > 5
 
 
-def test_cnts1():
+def test_cnts1(EN):
     text = u"""The U.S. Army likes Shock and Awe."""
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     assert len(tokens) == 8
 
 
-def test_cnts2():
+def test_cnts2(EN):
     text = u"""U.N. regulations are not a part of their concern."""
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     assert len(tokens) == 10
 
 
-def test_cnts3():
+def test_cnts3(EN):
     text = u"“Isn't it?”"
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     words = [t.string for t in tokens]
     assert len(words) == 6
 
 
-def test_cnts4():
+def test_cnts4(EN):
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     words = [t.string for t in tokens]
     assert len(words) == 15
 
 
-def test_cnts5():
+def test_cnts5(EN):
     text = """'Me too!', Mr. P. Delaware cried. """
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     assert len(tokens) == 11
 
 
-def test_cnts6():
+def test_cnts6(EN):
     text = u'They ran about 10km.'
-    tokens = EN.tokenize(text)
+    tokens = EN(text)
     words = [t.string for t in tokens]
     assert len(words) == 6