diff --git a/tests/test_contractions.py b/tests/test_contractions.py
new file mode 100644
index 000000000..be2280d75
--- /dev/null
+++ b/tests/test_contractions.py
@@ -0,0 +1,34 @@
+from __future__ import unicode_literals
+
+from spacy.spacy import expand_chunk
+from spacy.en import lookup, unhash
+
+from spacy import lex_of
+
+
+def test_possess():
+    tokens = expand_chunk(lookup("Mike's"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Mike"
+    assert unhash(lex_of(tokens[1])) == "'s"
+
+
+def test_apostrophe():
+    tokens = expand_chunk(lookup("schools'"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[1])) == "'"
+    assert unhash(lex_of(tokens[0])) == "schools"
+
+
+def test_LL():
+    tokens = expand_chunk(lookup("we'll"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[1])) == "will"
+    assert unhash(lex_of(tokens[0])) == "we"
+
+
+def test_aint():
+    tokens = expand_chunk(lookup("ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "are"
+    assert unhash(lex_of(tokens[1])) == "not"
diff --git a/tests/test_rules.py b/tests/test_rules.py
new file mode 100644
index 000000000..f95f1f820
--- /dev/null
+++ b/tests/test_rules.py
@@ -0,0 +1,11 @@
+from spacy import util
+
+
+def test_load_en():
+    rules = util.read_tokenization('en')
+    assert len(rules) != 0
+    aint = [rule for rule in rules if rule[0] == "ain't"][0]
+    chunk, lex, pieces = aint
+    assert chunk == "ain't"
+    assert lex == "are"
+    assert pieces == ["not"]
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 000000000..c3760c6fb
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+from spacy.en import tokenize
+from spacy.en import lookup
+
+from spacy.lexeme import lex_of
+
+
+def test_single_word():
+    lex_ids = tokenize(u'hello')
+    assert lex_ids[0] == lookup(u'hello')
+
+
+def test_two_words():
+    lex_ids = tokenize(u'hello possums')
+    assert len(lex_ids) == 2
+    assert lex_ids[0] == lookup(u'hello')
+    assert lex_ids[0] != lex_ids[1]
+
+
+def test_punct():
+    lex_ids = tokenize('hello, possums.')
+    assert len(lex_ids) == 4
+    assert lex_ids[0] != lookup('hello')
+    assert lex_of(lex_ids[0]) == lex_of(lookup('hello'))
+    assert lex_ids[2] == lookup('possums.')
+    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
+    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
+    assert lex_of(lex_ids[1]) != lex_of(lookup('hello'))
+    assert lex_ids[0] != lookup('hello.')
+
+
+def test_digits():
+    lex_ids = tokenize('The year: 1984.')
+    assert len(lex_ids) == 5
+    assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
+    assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
+    assert lex_of(lex_ids[4]) == lex_of(lookup('.'))
+
+
+def test_contraction():
+    lex_ids = tokenize("don't giggle")
+    assert len(lex_ids) == 3
+    assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
+    lex_ids = tokenize("i said don't!")
+    assert len(lex_ids) == 4
+    assert lex_of(lex_ids[3]) == lex_of(lookup('!'))
diff --git a/tests/test_wiki_sun.py b/tests/test_wiki_sun.py
new file mode 100644
index 000000000..8f4c05eed
--- /dev/null
+++ b/tests/test_wiki_sun.py
@@ -0,0 +1,24 @@
+from __future__ import unicode_literals
+
+from spacy.en import unhash
+from spacy import lex_of
+from spacy import en
+from spacy.util import utf8open
+
+import pytest
+import os
+from os import path
+
+
+HERE = path.dirname(__file__)
+
+@pytest.fixture
+def sun_txt():
+    loc = path.join(HERE, 'sun.txt')
+    return utf8open(loc).read()
+
+
+def test_tokenize(sun_txt):
+    assert len(sun_txt) != 0
+    tokens = en.tokenize(sun_txt)
+    assert True
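
Note on test_rules.py: it expects `util.read_tokenization('en')` to yield `(chunk, lex, pieces)` tuples, e.g. `("ain't", "are", ["not"])`. Below is a minimal sketch of a reader with that shape; the file location, on-disk format, and parsing are assumptions for illustration, not the actual `spacy.util` implementation.

```python
# Hypothetical sketch, not the real spacy.util.read_tokenization.
# It only mirrors the (chunk, lex, pieces) shape asserted in test_rules.py,
# assuming a whitespace-separated rule file with lines such as:  ain't are not
import codecs
from os import path


def read_tokenization(lang, data_dir='data'):
    loc = path.join(data_dir, lang, 'tokenization')  # assumed location
    rules = []
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            pieces = line.split()
            # first field is the chunk, second the lexeme it maps to,
            # the remainder are the extra pieces it expands into
            rules.append((pieces[0], pieces[1], pieces[2:]))
    return rules
```

With a rule line `ain't are not`, this returns `[("ain't", "are", ["not"])]`, which is the structure test_load_en checks.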