* Tests passing for reorganized version

2014-07-07 04:23:46 +02:00 · 2014-07-07 04:23:46 +02:00 · e4263a241a
parent 12f8a0e3c2
commit e4263a241a
4 changed files with 116 additions and 0 deletions
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -0,0 +1,34 @@
 from __future__ import unicode_literals
 from spacy.spacy import expand_chunk
 from spacy.en import lookup, unhash
 from spacy import lex_of
 def test_possess():
    """Expanding the chunk "Mike's" should yield two pieces: "Mike" and "'s"."""
    pieces = expand_chunk(lookup("Mike's"))
    assert len(pieces) == 2
    owner, marker = pieces[0], pieces[1]
    assert unhash(lex_of(owner)) == "Mike"
    assert unhash(lex_of(marker)) == "'s"
 def test_apostrophe():
    """Expanding "schools'" should yield two pieces: "schools" and a lone "'"."""
    pieces = expand_chunk(lookup("schools'"))
    assert len(pieces) == 2
    # The apostrophe piece is checked first, then the word it was split from.
    for position, expected in ((1, "'"), (0, "schools")):
        assert unhash(lex_of(pieces[position])) == expected
 def test_LL():
    """Expanding the contraction "we'll" should yield "we" followed by "will"."""
    pieces = expand_chunk(lookup("we'll"))
    assert len(pieces) == 2
    # The expanded auxiliary is checked before the pronoun.
    auxiliary = unhash(lex_of(pieces[1]))
    assert auxiliary == "will"
    pronoun = unhash(lex_of(pieces[0]))
    assert pronoun == "we"
 def test_aint():
    """Expanding the contraction "ain't" should yield "are" followed by "not"."""
    expected_texts = ("are", "not")
    pieces = expand_chunk(lookup("ain't"))
    assert len(pieces) == 2
    for position, text in enumerate(expected_texts):
        assert unhash(lex_of(pieces[position])) == text
--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -0,0 +1,11 @@
 from spacy import util
 def test_load_en():
    """The English tokenization rules load and contain the expected "ain't" rule.

    Each rule is unpacked as a ``(chunk, lex, pieces)`` triple; the first rule
    whose chunk is "ain't" must map to lex "are" with the pieces ["not"].
    """
    rules = util.read_tokenization('en')
    assert len(rules) != 0
    # Collect every matching rule and take the first; an absent rule
    # surfaces as an IndexError on the lookup below.
    matching = [entry for entry in rules if entry[0] == "ain't"]
    chunk, lex, pieces = matching[0]
    assert chunk == "ain't"
    assert lex == "are"
    assert pieces == ["not"]
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,47 @@
 from __future__ import unicode_literals
 from spacy.en import tokenize
 from spacy.en import lookup
 from spacy.lexeme import lex_of
 def test_single_word():
    """The first ID from tokenizing a single word equals that word's lookup ID."""
    word = u'hello'
    ids = tokenize(word)
    assert ids[0] == lookup(word)
 def test_two_words():
    """Two space-separated words tokenize to two distinct IDs."""
    ids = tokenize(u'hello possums')
    assert len(ids) == 2
    first, second = ids[0], ids[1]
    assert first == lookup(u'hello')
    assert first != second
 def test_punct():
    """Tokenizing 'hello, possums.' yields four IDs with the relations below.

    The assertions compare both the raw IDs and their ``lex_of`` values
    against lookups of the words with and without trailing punctuation.
    """
    ids = tokenize('hello, possums.')
    assert len(ids) == 4
    first, second, third = ids[0], ids[1], ids[2]
    # The first ID is not the bare-word ID, yet shares its lex_of value.
    assert first != lookup('hello')
    assert lex_of(first) == lex_of(lookup('hello'))
    # The third ID equals the lookup of the word with its trailing period,
    # and its lex_of value matches the word with and without that period.
    assert third == lookup('possums.')
    assert lex_of(third) == lex_of(lookup('possums.'))
    assert lex_of(third) == lex_of(lookup('possums'))
    # The second ID's lex_of value differs from that of 'hello'.
    assert lex_of(second) != lex_of(lookup('hello'))
    assert first != lookup('hello.')
 def test_digits():
    """Tokenizing 'The year: 1984.' yields five IDs; three are spot-checked."""
    ids = tokenize('The year: 1984.')
    assert len(ids) == 5
    # (position in the token sequence, text whose lex_of value it must share)
    for position, text in ((0, 'The'), (3, '1984'), (4, '.')):
        assert lex_of(ids[position]) == lex_of(lookup(text))
 def test_contraction():
    """Check token counts and selected tokens for inputs containing "don't"."""
    giggle_ids = tokenize("don't giggle")
    assert len(giggle_ids) == 3
    # The second token shares its lex_of value with "not".
    negation = lex_of(giggle_ids[1])
    assert negation == lex_of(lookup("not"))

    said_ids = tokenize("i said don't!")
    assert len(said_ids) == 4
    # The final token shares its lex_of value with '!'.
    closing = lex_of(said_ids[3])
    assert closing == lex_of(lookup('!'))
--- a/tests/test_wiki_sun.py
+++ b/tests/test_wiki_sun.py
@@ -0,0 +1,24 @@
 from __future__ import unicode_literals
 from spacy.en import unhash
 from spacy import lex_of
 from spacy import en
 from spacy.util import utf8open
 import pytest
 import os
 from os import path
 # Directory containing this test module; joined with 'sun.txt' in sun_txt().
 HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
    """Return the full contents of ``sun.txt`` located next to this module.

    The file is opened through ``utf8open`` and is now closed explicitly once
    it has been read; previously the handle was left open.
    """
    loc = path.join(HERE, 'sun.txt')
    # NOTE(review): assumes utf8open returns a file-like object exposing
    # close() -- confirm against spacy.util.
    handle = utf8open(loc)
    try:
        return handle.read()
    finally:
        handle.close()
 def test_tokenize(sun_txt):
    """Smoke-test ``en.tokenize`` on the text supplied by the ``sun_txt`` fixture.

    The fixture text must be non-empty, and tokenizing it must yield at least
    one token.
    """
    assert len(sun_txt) != 0
    tokens = en.tokenize(sun_txt)
    # The earlier ``assert True`` could never fail and left ``tokens`` unused;
    # check the tokenizer output instead.
    assert len(tokens) != 0