* Fix data reading for lemmatizer

Matthew Honnibal 2015-01-05 06:01:32 +11:00
parent 72613a5fca
commit 477e7fbffe
1 changed file with 3 additions and 2 deletions

@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 from os import path
+import codecs
 
 
 NOUN_RULES = (
@@ -84,7 +85,7 @@ def lemmatize(string, index, exceptions, rules):
 
 def read_index(loc):
     index = set()
-    for line in open(loc):
+    for line in codecs.open(loc, 'r', 'utf8'):
         if line.startswith(' '):
             continue
         pieces = line.split()
@@ -96,7 +97,7 @@ def read_index(loc):
 
 def read_exc(loc):
     exceptions = {}
-    for line in open(loc):
+    for line in codecs.open(loc, 'r', 'utf8'):
         if line.startswith(' '):
             continue
         pieces = line.split()
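
For context, here is a minimal sketch of the behaviour this change works around, assuming the code runs under Python 2 (the module imports unicode_literals from __future__). A bare open() yields undecoded byte strings, so any non-ASCII entry in the lemmatizer data files would come back as raw UTF-8 bytes, while codecs.open(loc, 'r', 'utf8') decodes each line to unicode. The file name and contents below are hypothetical.

# A minimal sketch (hypothetical file name and contents) of the difference
# the fix addresses under Python 2.
import codecs

with open('lemma_index.txt', 'wb') as f:
    f.write(b'caf\xc3\xa9\n')  # the UTF-8 byte sequence for u'café'

raw = open('lemma_index.txt').readline().strip()
text = codecs.open('lemma_index.txt', 'r', 'utf8').readline().strip()

print(repr(raw))   # 'caf\xc3\xa9'  -- undecoded bytes; unicode comparisons fail
print(repr(text))  # u'caf\xe9'     -- decoded text, safe to split and match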