* Refactor around Word objects, adapting tests. Tests passing, except for string views.

Matthew Honnibal 2014-08-23 19:55:06 +02:00
parent 4f01df9152
commit 9815c7649e
11 changed files with 65 additions and 117 deletions
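In broad strokes, the refactor swaps the module-level accessor functions for attributes on Word objects: tokenize returns a plain list of Word tokens, lookup returns a Word, and unhash(lex_of(token)) becomes unhash(token.lex). A rough before/after sketch drawn from the test changes below (illustrative, not the verbatim API):

    # before: free functions over opaque lexeme IDs
    tokens = tokenize(u"Mike's")
    assert unhash(lex_of(tokens[0])) == "Mike"

    # after: tokens are Word objects exposing the lexeme hash as .lex
    tokens = tokenize(u"Mike's")
    assert unhash(tokens[0].lex) == "Mike"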

View File

@@ -1,7 +1,6 @@
from spacy.spacy cimport Language
from spacy.lexeme cimport LexID
from spacy.tokens cimport Tokens
from spacy.lexeme cimport StringHash
from spacy.word cimport Word
cdef class PennTreebank3(Language):
@@ -10,6 +9,6 @@ cdef class PennTreebank3(Language):
cdef PennTreebank3 PTB3
cpdef LexID lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@@ -77,18 +77,21 @@ def nltk_regex_tokenize(text):
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
strings[-1] += '.'
assert strings
return strings
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
cpdef Tokens tokenize(unicode string):
cpdef list tokenize(unicode string):
return PTB3.tokenize(string)
cpdef LexID lookup(unicode string) except 0:
return <LexID>PTB3.lookup(string)
cpdef Word lookup(unicode string):
return PTB3.lookup(string)
cpdef unicode unhash(StringHash hash_value):
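A side note on the wrapper signatures above: the old cpdef returned a raw LexID (a C integer type), so Cython needed the except 0 clause as an error sentinel to propagate exceptions; once lookup returns a Python-level Word object, exceptions propagate on their own and both the sentinel and the <LexID> cast can go. A minimal sketch of the distinction (not the project's exact code):

    cpdef LexID lookup(unicode string) except 0:  # old: C return type, 0 reserved to signal a raised exception
        ...

    cpdef Word lookup(unicode string):            # new: Python object return, no sentinel needed
        ...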

View File

@@ -2,35 +2,33 @@ from __future__ import unicode_literals
from spacy.en import tokenize, lookup, unhash
from spacy import lex_of
def test_possess():
tokens = tokenize("Mike's")
assert unhash(lex_of(tokens[0])) == "Mike"
assert unhash(lex_of(tokens[1])) == "'s"
assert unhash(tokens[0].lex) == "Mike"
assert unhash(tokens[1].lex) == "'s"
assert len(tokens) == 2
def test_apostrophe():
tokens = tokenize("schools'")
assert len(tokens) == 2
assert unhash(lex_of(tokens[1])) == "'"
assert unhash(lex_of(tokens[0])) == "schools"
assert unhash(tokens[1].lex) == "'"
assert unhash(tokens[0].lex) == "schools"
def test_LL():
tokens = tokenize("we'll")
assert len(tokens) == 2
assert unhash(lex_of(tokens[1])) == "will"
assert unhash(lex_of(tokens[0])) == "we"
assert unhash(tokens[1].lex) == "will"
assert unhash(tokens[0].lex) == "we"
def test_aint():
tokens = tokenize("ain't")
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "are"
assert unhash(lex_of(tokens[1])) == "not"
assert unhash(tokens[0].lex) == "are"
assert unhash(tokens[1].lex) == "not"
def test_capitalized():
@@ -40,4 +38,4 @@ def test_capitalized():
assert len(tokens) == 2
tokens = tokenize("Ain't")
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "Are"
assert unhash(tokens[0].lex) == "Are"

View File

@@ -3,8 +3,8 @@ from __future__ import unicode_literals
import pytest
from spacy.en import lookup, unhash
import spacy.word
from spacy.en import lex_of, shape_of, norm_of, first_of, length_of
@pytest.fixture
def C3P0():
@@ -12,17 +12,16 @@ def C3P0():
def test_shape(C3P0):
assert unhash(shape_of(C3P0)) == "XdXd"
# TODO: Fix this
assert unhash(C3P0.get_view(2)) == "XdXd"
def test_length():
t = lookup('the')
assert length_of(t) == 3
#t = lookup('')
#assert length_of(t) == 0
assert t.length == 3
t = lookup("n't")
assert length_of(t) == 3
assert t.length == 3
t = lookup("'s")
assert length_of(t) == 2
assert t.length == 2
t = lookup('Xxxx')
assert length_of(t) == 4
assert t.length == 4
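The commit message notes that string views are the one remaining failure, and the test above replaces shape_of(C3P0) with C3P0.get_view(2), still marked TODO. A toy Python sketch of the kind of Word object these tests assume; everything beyond the lex, length, and get_view names used in the tests is guessed:

    class Word:
        def __init__(self, lex, length, views):
            self.lex = lex          # hash of the word's string
            self.length = length    # character length, checked by test_length
            self._views = views     # precomputed string views (hypothetical field)

        def get_view(self, i):
            # the failing test assumes index 2 holds the word shape, e.g. "XdXd"
            return self._views[i]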

View File

@@ -1,6 +1,5 @@
from __future__ import unicode_literals
from spacy import lex_of
from spacy.en import lookup
from spacy.en import tokenize
from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_close(close_puncts):
string = word_str + p
tokens = tokenize(string)
assert len(tokens) == 2
assert unhash(lex_of(tokens[1])) == p
assert unhash(lex_of(tokens[0])) == word_str
assert unhash(tokens[1].lex) == p
assert unhash(tokens[0].lex) == word_str
def test_two_different_close(close_puncts):
@@ -29,9 +28,9 @@ def test_two_different_close(close_puncts):
string = word_str + p + "'"
tokens = tokenize(string)
assert len(tokens) == 3
assert unhash(lex_of(tokens[0])) == word_str
assert unhash(lex_of(tokens[1])) == p
assert unhash(lex_of(tokens[2])) == "'"
assert unhash(tokens[0].lex) == word_str
assert unhash(tokens[1].lex) == p
assert unhash(tokens[2].lex) == "'"
def test_three_same_close(close_puncts):
@@ -40,5 +39,5 @@ def test_three_same_close(close_puncts):
string = word_str + p + p + p
tokens = tokenize(string)
assert len(tokens) == 4
assert unhash(lex_of(tokens[0])) == word_str
assert unhash(lex_of(tokens[1])) == p
assert unhash(tokens[0].lex) == word_str
assert unhash(tokens[1].lex) == p

View File

@@ -1,6 +1,5 @@
from __future__ import unicode_literals
from spacy import lex_of
from spacy.en import lookup
from spacy.en import tokenize
from spacy.en import unhash
@@ -19,8 +18,8 @@ def test_open(open_puncts):
string = p + word_str
tokens = tokenize(string)
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == p
assert unhash(lex_of(tokens[1])) == word_str
assert unhash(tokens[0].lex) == p
assert unhash(tokens[1].lex) == word_str
def test_two_different_open(open_puncts):
@@ -29,9 +28,9 @@ def test_two_different_open(open_puncts):
string = p + "`" + word_str
tokens = tokenize(string)
assert len(tokens) == 3
assert unhash(lex_of(tokens[0])) == p
assert unhash(lex_of(tokens[1])) == "`"
assert unhash(lex_of(tokens[2])) == word_str
assert unhash(tokens[0].lex) == p
assert unhash(tokens[1].lex) == "`"
assert unhash(tokens[2].lex) == word_str
def test_three_same_open(open_puncts):
@@ -40,12 +39,12 @@ def test_three_same_open(open_puncts):
string = p + p + p + word_str
tokens = tokenize(string)
assert len(tokens) == 4
assert unhash(lex_of(tokens[0])) == p
assert unhash(lex_of(tokens[3])) == word_str
assert unhash(tokens[0].lex) == p
assert unhash(tokens[3].lex) == word_str
def test_open_appostrophe():
string = "'The"
tokens = tokenize(string)
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "'"
assert unhash(tokens[0].lex) == "'"

View File

@@ -1,46 +0,0 @@
from __future__ import unicode_literals
from spacy.en import unhash
from spacy import lex_of
from spacy.util import utf8open
from spacy.ptb3 import tokenize, lookup, unhash
import pytest
import os
from os import path
HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
loc = path.join(HERE, 'sun.txt')
return utf8open(loc).read()
@pytest.fixture
def my_tokens(sun_txt):
assert len(sun_txt) != 0
tokens = tokenize(sun_txt)
return [unhash(lex_of(t)) for t in tokens]
@pytest.fixture
def sed_tokens():
loc = path.join(HERE, 'sun.tokens')
return utf8open(loc).read().split()
def test_compare_tokens(my_tokens, sed_tokens):
me = my_tokens
sed = sed_tokens
i = 0
while i < len(me) and i < len(sed):
assert me[i] == sed[i]
i += 1
assert len(me) == len(sed)

View File

@@ -1,6 +1,5 @@
from __future__ import unicode_literals
from spacy import lex_of
from spacy.en import tokenize
from spacy.en import lookup
from spacy.en import unhash
@@ -19,9 +18,9 @@ def test_token(paired_puncts):
string = open_ + word_str + close_
tokens = tokenize(string)
assert len(tokens) == 3
assert unhash(lex_of(tokens[0])) == open_
assert unhash(lex_of(tokens[1])) == word_str
assert unhash(lex_of(tokens[2])) == close_
assert unhash(tokens[0].lex) == open_
assert unhash(tokens[1].lex) == word_str
assert unhash(tokens[2].lex) == close_
def test_two_different(paired_puncts):
@@ -30,9 +29,9 @@ def test_two_different(paired_puncts):
string = "`" + open_ + word_str + close_ + "'"
tokens = tokenize(string)
assert len(tokens) == 5
assert unhash(lex_of(tokens[0])) == "`"
assert unhash(lex_of(tokens[1])) == open_
assert unhash(lex_of(tokens[2])) == word_str
assert unhash(lex_of(tokens[2])) == word_str
assert unhash(lex_of(tokens[3])) == close_
assert unhash(lex_of(tokens[4])) == "'"
assert unhash(tokens[0].lex) == "`"
assert unhash(tokens[1].lex) == open_
assert unhash(tokens[2].lex) == word_str
assert unhash(tokens[2].lex) == word_str
assert unhash(tokens[3].lex) == close_
assert unhash(tokens[4].lex) == "'"

View File

@@ -3,8 +3,6 @@ from __future__ import unicode_literals
from spacy.en import tokenize
from spacy.en import lookup
from spacy.lexeme import lex_of
def test_single_word():
lex_ids = tokenize(u'hello')
@@ -12,33 +10,33 @@ def test_single_word():
def test_two_words():
lex_ids = tokenize(u'hello possums')
assert len(lex_ids) == 2
assert lex_ids[0] == lookup(u'hello')
assert lex_ids[0] != lex_ids[1]
words = tokenize('hello possums')
assert len(words) == 2
assert words[0] == lookup('hello')
assert words[0] != words[1]
def test_punct():
tokens = tokenize('hello, possums.')
assert len(tokens) == 4
assert lex_of(tokens[0]) == lex_of(lookup('hello'))
assert lex_of(tokens[1]) == lex_of(lookup(','))
assert lex_of(tokens[2]) == lex_of(lookup('possums'))
assert lex_of(tokens[1]) != lex_of(lookup('hello'))
assert tokens[0].lex == lookup('hello').lex
assert tokens[1].lex == lookup(',').lex
assert tokens[2].lex == lookup('possums').lex
assert tokens[1].lex != lookup('hello').lex
def test_digits():
lex_ids = tokenize('The year: 1984.')
assert len(lex_ids) == 5
assert lex_of(lex_ids[0]) == lex_of(lookup('The'))
assert lex_of(lex_ids[3]) == lex_of(lookup('1984'))
assert lex_of(lex_ids[4]) == lex_of(lookup('.'))
assert lex_ids[0].lex == lookup('The').lex
assert lex_ids[3].lex == lookup('1984').lex
assert lex_ids[4].lex == lookup('.').lex
def test_contraction():
lex_ids = tokenize("don't giggle")
assert len(lex_ids) == 3
assert lex_of(lex_ids[1]) == lex_of(lookup("not"))
assert lex_ids[1].lex == lookup("not").lex
lex_ids = tokenize("i said don't!")
assert len(lex_ids) == 4
assert lex_of(lex_ids[3]) == lex_of(lookup('!'))
assert lex_ids[3].lex == lookup('!').lex

View File

@@ -17,7 +17,7 @@ def test_eq():
def test_round_trip():
hello = lookup('Hello')
assert unhash(lex_of(hello)) == 'Hello'
assert unhash(hello.lex) == 'Hello'
def test_case_neq():
@@ -32,6 +32,6 @@ def test_punct_neq():
def test_short():
addr = lookup('I')
assert unhash(lex_of(addr)) == 'I'
assert unhash(addr.lex) == 'I'
addr = lookup('not')
assert unhash(lex_of(addr)) == 'not'
assert unhash(addr.lex) == 'not'
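test_round_trip and test_short exercise the contract that unhash inverts the lexeme hash back to the original string. A toy illustration of that round trip, with a plain dict standing in for the real string store (not spaCy's implementation):

    _store = {}

    def toy_hash(s):
        h = hash(s)        # stand-in for the real StringHash
        _store[h] = s
        return h

    def toy_unhash(h):
        return _store[h]

    assert toy_unhash(toy_hash('Hello')) == 'Hello'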