* Fix unicode error in lemmatizer

This commit is contained in:
Matthew Honnibal 2015-01-05 11:53:54 +11:00
parent 775a66e2b6
commit b132b3caa6
1 changed file with 2 additions and 0 deletions

View File

@@ -68,6 +68,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules):
assert isinstance(string, unicode)
string = string.lower()
forms = []
if string in index:
@@ -77,6 +78,7 @@ def lemmatize(string, index, exceptions, rules):
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index:
assert isinstance(form, unicode)
forms.append(form)
if not forms:
forms.append(string)