2014-09-15 04:31:58 +00:00
|
|
|
# coding: utf-8
|
2014-07-07 02:23:46 +00:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
import pytest
|
2014-07-07 02:23:46 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
from spacy.en import English
|
2014-07-07 02:23:46 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def EN():
    """Provide the ``English`` instance that the tests in this module call.

    The fixture is module-scoped so ``English()`` is constructed once and the
    same object is handed to every test, instead of being rebuilt per test
    function.

    NOTE(review): assumes the tests only read from the instance and do not
    rely on getting a fresh ``English`` each time -- confirm.
    """
    return English()
|
2014-12-21 09:38:27 +00:00
|
|
|
|
|
|
|
def test_single_word(EN):
    """The first token produced for 'hello' has the string 'hello'."""
    doc = EN(u'hello')
    first = doc[0]
    assert first.string == 'hello'
|
2014-07-07 02:23:46 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_two_words(EN):
    """'hello possums' gives exactly two tokens whose strings differ."""
    doc = EN('hello possums')
    assert len(doc) == 2
    left = doc[0].string
    right = doc[1].string
    assert left != right
|
2014-07-07 02:23:46 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_punct(EN):
    """'hello, possums.' gives four tokens: word, comma, word, and one more."""
    doc = EN('hello, possums.')
    assert len(doc) == 4
    # Check the leading three tokens in order.
    for index, expected in enumerate(('hello', ',', 'possums')):
        assert doc[index].string == expected
    # The second token must not carry the preceding word.
    assert doc[1].string != 'hello'
|
2014-07-07 02:23:46 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_digits(EN):
    """'The year: 1984.' gives five tokens; tokens 0 and 3 match the vocab.

    Each checked token's ``sic`` is compared with the ``sic`` of the
    ``EN.vocab`` entry looked up by the expected word.
    """
    doc = EN('The year: 1984.')
    assert len(doc) == 5
    for position, word in ((0, 'The'), (3, '1984')):
        assert doc[position].sic == EN.vocab[word].sic
|
2014-07-07 02:23:46 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_contraction(EN):
    """Contractions yield an "n't" token, with and without a trailing '!'."""
    # "don't giggle": three tokens, the middle one being "n't".
    short_doc = EN("don't giggle")
    assert len(short_doc) == 3
    middle = short_doc[1]
    assert middle.sic == EN.vocab["n't"].sic

    # "i said don't!": five tokens, ending in the exclamation mark.
    long_doc = EN("i said don't!")
    assert len(long_doc) == 5
    final = long_doc[4]
    assert final.sic == EN.vocab['!'].sic
|
2014-09-12 16:00:42 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_contraction_punct(EN):
    """A contraction next to one punctuation mark always gives three tokens."""
    samples = (
        "(can't",
        "`ain't",
        '''"isn't''',
        "can't!",
    )
    for sample in samples:
        doc = EN(sample)
        assert len(doc) == 3
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_sample(EN):
    """A multi-paragraph news excerpt produces more than five tokens."""
    excerpt = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""

    doc = EN(excerpt)
    token_count = len(doc)
    assert token_count > 5
|
2014-09-15 04:31:58 +00:00
|
|
|
|
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts1(EN):
    """The sentence containing 'U.S.' is expected to give 8 tokens."""
    sentence = u"""The U.S. Army likes Shock and Awe."""
    doc = EN(sentence)
    assert len(doc) == 8
|
|
|
|
|
2014-12-09 03:48:01 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts2(EN):
    """The sentence starting with 'U.N.' is expected to give 10 tokens."""
    sentence = u"""U.N. regulations are not a part of their concern."""
    doc = EN(sentence)
    assert len(doc) == 10
|
|
|
|
|
2014-12-09 03:48:01 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts3(EN):
    """A curly-quoted "Isn't it?" is expected to give 6 token strings."""
    quoted = u"“Isn't it?”"
    strings = []
    for token in EN(quoted):
        strings.append(token.string)
    assert len(strings) == 6
|
|
|
|
|
2014-09-15 04:31:58 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts4(EN):
    """Quoted speech followed by 'Ms. Comble sighed.' gives 15 token strings."""
    passage = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    doc = EN(passage)
    strings = [tok.string for tok in doc]
    assert len(strings) == 15
|
|
|
|
|
2014-09-15 04:31:58 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts5(EN):
    """Quoted speech followed by 'Mr. P. Delaware cried.' gives 11 tokens."""
    passage = """'Me too!', Mr. P. Delaware cried. """
    doc = EN(passage)
    assert len(doc) == 11
|
|
|
|
|
2014-12-09 03:48:01 +00:00
|
|
|
|
2014-12-21 09:38:27 +00:00
|
|
|
def test_cnts6(EN):
    """'They ran about 10km.' is expected to give 6 token strings."""
    sentence = u'They ran about 10km.'
    doc = EN(sentence)
    strings = [tok.string for tok in doc]
    assert len(strings) == 6
|
|
|
|
|
2015-01-22 11:25:18 +00:00
|
|
|
def test_bracket_period(EN):
    """The last token of a sentence ending in ').' is the full stop."""
    sentence = u'(And a 6a.m. run through Washington Park).'
    doc = EN(sentence)
    # Index from the front via the length, as the original does.
    last_index = len(doc) - 1
    assert doc[last_index].string == u'.'
|
|
|
|
|
2014-11-04 15:03:22 +00:00
|
|
|
#def test_cnts7():
|
|
|
|
# text = 'But then the 6,000-year ice age came...'
|
|
|
|
# tokens = EN.tokenize(text)
|
|
|
|
# assert len(tokens) == 10
|