* Tests passing for reorganized version

Matthew Honnibal 2014-07-07 04:23:46 +02:00
parent 12f8a0e3c2
commit e4263a241a
4 changed files with 116 additions and 0 deletions


@@ -0,0 +1,34 @@
from __future__ import unicode_literals

from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash
from spacy import lex_of


def test_possess():
    tokens = expand_chunk(lookup("Mike's"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "Mike"
    assert unhash(lex_of(tokens[1])) == "'s"


def test_apostrophe():
    tokens = expand_chunk(lookup("schools'"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "'"
    assert unhash(lex_of(tokens[0])) == "schools"


def test_LL():
    tokens = expand_chunk(lookup("we'll"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[1])) == "will"
    assert unhash(lex_of(tokens[0])) == "we"


def test_aint():
    tokens = expand_chunk(lookup("ain't"))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "are"
    assert unhash(lex_of(tokens[1])) == "not"
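
These tests pin down chunk expansion at the string level: a stored chunk such as "ain't" comes back as two tokens whose unhashed strings are "are" and "not". The snippet below is a minimal pure-Python sketch of that string mapping only; the SPECIAL_CASES table and split_chunk helper are illustrative assumptions, not part of the spacy API, which operates on hashed lexeme IDs.

# Minimal sketch of the string-level expansions the tests above expect.
# SPECIAL_CASES and split_chunk are hypothetical names, not spacy functions.
SPECIAL_CASES = {
    "Mike's": ["Mike", "'s"],
    "schools'": ["schools", "'"],
    "we'll": ["we", "will"],
    "ain't": ["are", "not"],
}

def split_chunk(chunk):
    # Chunks without a special case are left unsplit.
    return SPECIAL_CASES.get(chunk, [chunk])

assert split_chunk("ain't") == ["are", "not"]
assert split_chunk("giggle") == ["giggle"]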

tests/test_rules.py (new file, 11 additions)

@@ -0,0 +1,11 @@
from spacy import util


def test_load_en():
    rules = util.read_tokenization('en')
    assert len(rules) != 0
    aint = [rule for rule in rules if rule[0] == "ain't"][0]
    chunk, lex, pieces = aint
    assert chunk == "ain't"
    assert lex == "are"
    assert pieces == ["not"]
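
Judging from this test, each tokenization rule is a (chunk, lex, pieces) tuple: the chunk string, the string of its first token, and the strings of the remaining tokens. The sketch below shows how such a rule maps onto the expansion exercised by test_aint above; apply_rule is a hypothetical helper for illustration, not part of spacy.util.

# Hypothetical helper: expand a (chunk, lex, pieces) rule into token strings.
def apply_rule(rule):
    chunk, lex, pieces = rule
    return [lex] + list(pieces)

assert apply_rule(("ain't", "are", ["not"])) == ["are", "not"]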

tests/test_tokenizer.py (new file, 47 additions)

@@ -0,0 +1,47 @@
from __future__ import unicode_literals

from spacy.en import tokenize
from spacy.en import lookup
from spacy.lexeme import lex_of


def test_single_word():
    lex_ids = tokenize(u'hello')
    assert lex_ids[0] == lookup(u'hello')


def test_two_words():
    lex_ids = tokenize(u'hello possums')
    assert len(lex_ids) == 2
    assert lex_ids[0] == lookup(u'hello')
    assert lex_ids[0] != lex_ids[1]


def test_punct():
    lex_ids = tokenize('hello, possums.')
    assert len(lex_ids) == 4
    assert lex_ids[0] != lookup('hello')
    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
    assert lex_ids[2] == lookup('possums.')
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
    assert lex_ids[0] != lookup('hello.')


def test_digits():
    lex_ids = tokenize('The year: 1984.')
    assert len(lex_ids) == 5
    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))


def test_contraction():
    lex_ids = tokenize("don't giggle")
    assert len(lex_ids) == 3
    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
    lex_ids = tokenize("i said don't!")
    assert len(lex_ids) == 4
    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))

tests/test_wiki_sun.py (new file, 24 additions)

@@ -0,0 +1,24 @@
from __future__ import unicode_literals

from spacy.en import unhash
from spacy import lex_of
from spacy import en
from spacy.util import utf8open

import pytest

import os
from os import path


HERE = path.dirname(__file__)


@pytest.fixture
def sun_txt():
    loc = path.join(HERE, 'sun.txt')
    return utf8open(loc).read()


def test_tokenize(sun_txt):
    assert len(sun_txt) != 0
    # Smoke test: just check that tokenizing a full document runs without error.
    tokens = en.tokenize(sun_txt)
    assert True
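
Since test_wiki_sun.py imports pytest and the files live under tests/, the suite can presumably be run from the repository root with, for example, `python -m pytest tests/` (assuming pytest is installed and the package has been built).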