* Fixed contraction tests. Still need to correct a problem with the way case stats and tag stats are supposed to work.

Matthew Honnibal 2014-08-27 20:22:33 +02:00
parent fdaf24604a
commit fd4e61e58b
2 changed files with 23 additions and 20 deletions


@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        #self.load_special_tokenization(util.read_tokenization(name))
+        self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -49,6 +49,7 @@ cdef class Language:
             i += 1
         if start < i:
             tokens.extend(self._tokenize(string[start:]))
+        assert tokens
         return tokens
 
     cdef list _tokenize(self, unicode string):
@@ -101,7 +102,7 @@ cdef class Language:
         for string, substrings in token_rules:
             lexemes = []
             for i, substring in enumerate(substrings):
-                lexemes.append(self.lookup(substring))
+                lexemes.append(self.lexicon.lookup(substring))
             self.cache[string] = lexemes
@@ -143,13 +144,15 @@ cdef class Lexicon:
         cdef Lexeme word
         flag_id = len(self.flag_checkers)
         for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}):
+            if flag_checker(string, word.prob, {}, {}):
                 word.set_flag(flag_id)
         self.flag_checkers.append(flag_checker)
         return flag_id
 
     def add_transform(self, string_transform):
         self.string_transformers.append(string_transform)
+        for string, word in self.lexicon.items():
+            word.add_view(string_transform(string, word.prob, {}, {}))
        return len(self.string_transformers) - 1
 
     def load_probs(self, location):
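
The extra empty dict threaded through flag_checker and string_transform here matches the commit message: the callbacks are meant to receive case stats and tag stats, which are still stubbed out as {} at this point. A minimal sketch of a checker with the new four-argument signature, under that assumption (the often_title name, the add_flag method name, and the 'title' stats key are illustrative, not confirmed by this diff):

    from spacy.en import EN

    def often_title(string, prob, case_stats, tag_stats):
        # Hypothetical checker: flag strings that usually appear title-cased.
        # case_stats and tag_stats are assumed to map case variants and POS
        # tags to frequencies; in this commit both are still passed as {}.
        return case_stats.get('title', 0.0) >= 0.5

    # The enclosing method isn't named in this hunk; add_flag is assumed.
    flag_id = EN.lexicon.add_flag(often_title)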


@@ -1,41 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize, lookup, unhash
+from spacy.en import EN
 
 
 def test_possess():
-    tokens = tokenize("Mike's")
-    assert unhash(tokens[0].lex) == "Mike"
-    assert unhash(tokens[1].lex) == "'s"
+    tokens = EN.tokenize("Mike's")
+    assert tokens[0].string == "Mike"
+    assert tokens[1].string == "'s"
     assert len(tokens) == 2
 
 
 def test_apostrophe():
-    tokens = tokenize("schools'")
+    tokens = EN.tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "'"
-    assert unhash(tokens[0].lex) == "schools"
+    assert tokens[1].string == "'"
+    assert tokens[0].string == "schools"
 
 
 def test_LL():
-    tokens = tokenize("we'll")
+    tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "will"
-    assert unhash(tokens[0].lex) == "we"
+    assert tokens[1].string == "will"
+    assert tokens[0].string == "we"
 
 
 def test_aint():
-    tokens = tokenize("ain't")
+    tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "are"
-    assert unhash(tokens[1].lex) == "not"
+    assert tokens[0].string == "are"
+    assert tokens[1].string == "not"
 
 
 def test_capitalized():
-    tokens = tokenize("can't")
+    tokens = EN.tokenize("can't")
     assert len(tokens) == 2
-    tokens = tokenize("Can't")
+    tokens = EN.tokenize("Can't")
     assert len(tokens) == 2
-    tokens = tokenize("Ain't")
+    tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "Are"
+    assert tokens[0].string == "Are"
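
For reference, the API migration these test updates encode, as a short usage sketch (assuming the EN instance exported by spacy.en, as imported above):

    from spacy.en import EN

    tokens = EN.tokenize("ain't")    # was: tokenize("ain't")
    first = tokens[0].string         # was: unhash(tokens[0].lex)
    assert first == "are"            # contraction expands to its canonical form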