* Fix data reading for lemmatizer

This commit is contained in:
Matthew Honnibal 2015-01-05 06:01:32 +11:00
parent 72613a5fca
commit 477e7fbffe
1 changed file with 3 additions and 2 deletions

View File

@ -1,5 +1,6 @@
from __future__ import unicode_literals
from os import path
import codecs
NOUN_RULES = (
@ -84,7 +85,7 @@ def lemmatize(string, index, exceptions, rules):
def read_index(loc):
index = set()
for line in open(loc):
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
@ -96,7 +97,7 @@ def read_index(loc):
def read_exc(loc):
exceptions = {}
for line in open(loc):
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()