* Tests passing for reorganized version

Matthew Honnibal 2014-07-07 04:23:46 +02:00
parent 12f8a0e3c2
commit e4263a241a
4 changed files with 116 additions and 0 deletions


@@ -0,0 +1,34 @@
from __future__ import unicode_literals

from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash
from spacy import lex_of


def test_possess():
    tokens = expand_chunk(lookup("Mike's"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "Mike"
    assert unhash(lex_of(tokens[1])) == "'s"


def test_apostrophe():
    tokens = expand_chunk(lookup("schools'"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "'"
    assert unhash(lex_of(tokens[0])) == "schools"


def test_LL():
    tokens = expand_chunk(lookup("we'll"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "will"
    assert unhash(lex_of(tokens[0])) == "we"


def test_aint():
    tokens = expand_chunk(lookup("ain't"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "are"
    assert unhash(lex_of(tokens[1])) == "not"
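
These tests pin down chunk expansion at the string level: a stored chunk such as "ain't" comes back as two tokens whose unhashed strings are "are" and "not". The snippet below is a minimal pure-Python sketch of that string mapping only; the SPECIAL_CASES table and split_chunk helper are illustrative assumptions, not part of the spacy API, which operates on hashed lexeme IDs.

# Minimal sketch of the string-level expansions the tests above expect.
# SPECIAL_CASES and split_chunk are hypothetical names, not spacy functions.
SPECIAL_CASES = {
    "Mike's": ["Mike", "'s"],
    "schools'": ["schools", "'"],
    "we'll": ["we", "will"],
    "ain't": ["are", "not"],
}

def split_chunk(chunk):
    # Chunks without a special case are left unsplit.
    return SPECIAL_CASES.get(chunk, [chunk])

assert split_chunk("ain't") == ["are", "not"]
assert split_chunk("giggle") == ["giggle"]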

tests/test_rules.py (new file, 11 additions)

@@ -0,0 +1,11 @@
from spacy import util


def test_load_en():
    rules = util.read_tokenization('en')
    assert len(rules) != 0
    aint = [rule for rule in rules if rule[0] == "ain't"][0]
    chunk, lex, pieces = aint
    assert chunk == "ain't"
    assert lex == "are"
    assert pieces == ["not"]
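
Judging from this test, each tokenization rule is a (chunk, lex, pieces) tuple: the chunk string, the string of its first token, and the strings of the remaining tokens. The sketch below shows how such a rule maps onto the expansion exercised by test_aint above; apply_rule is a hypothetical helper for illustration, not part of spacy.util.

# Hypothetical helper: expand a (chunk, lex, pieces) rule into token strings.
def apply_rule(rule):
    chunk, lex, pieces = rule
    return [lex] + list(pieces)

assert apply_rule(("ain't", "are", ["not"])) == ["are", "not"]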

tests/test_tokenizer.py (new file, 47 additions)

@@ -0,0 +1,47 @@
from __future__ import unicode_literals

from spacy.en import tokenize
from spacy.en import lookup
from spacy.lexeme import lex_of


def test_single_word():
    lex_ids = tokenize(u'hello')
    assert lex_ids[0] == lookup(u'hello')


def test_two_words():
    lex_ids = tokenize(u'hello possums')
    assert len(lex_ids) == 2
    assert lex_ids[0] == lookup(u'hello')
    assert lex_ids[0] != lex_ids[1]


def test_punct():
    lex_ids = tokenize('hello, possums.')
    assert len(lex_ids) == 4
    assert lex_ids[0] != lookup('hello')
    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
    assert lex_ids[2] == lookup('possums.')
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
    assert lex_ids[0] != lookup('hello.')


def test_digits():
    lex_ids = tokenize('The year: 1984.')
    assert len(lex_ids) == 5
    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))


def test_contraction():
    lex_ids = tokenize("don't giggle")
    assert len(lex_ids) == 3
    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
    lex_ids = tokenize("i said don't!")
    assert len(lex_ids) == 4
    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))

tests/test_wiki_sun.py (new file, 24 additions)

@@ -0,0 +1,24 @@
from __future__ import unicode_literals

from spacy.en import unhash
from spacy import lex_of
from spacy import en
from spacy.util import utf8open

import pytest

import os
from os import path


HERE = path.dirname(__file__)


@pytest.fixture
def sun_txt():
    loc = path.join(HERE, 'sun.txt')
    return utf8open(loc).read()


def test_tokenize(sun_txt):
    assert len(sun_txt) != 0
    # Smoke test: just check that tokenizing a full document runs without error.
    tokens = en.tokenize(sun_txt)
    assert True
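
Since test_wiki_sun.py imports pytest and the files live under tests/, the suite can presumably be run from the repository root with, for example, `python -m pytest tests/` (assuming pytest is installed and the package has been built).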