From 477e7fbffef1ae1d28f761994966dfc4f5bfacce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Jan 2015 06:01:32 +1100 Subject: [PATCH] * Fix data reading for lemmatizer --- spacy/en/lemmatizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py index 50041154a..5883e12c8 100644 --- a/spacy/en/lemmatizer.py +++ b/spacy/en/lemmatizer.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals from os import path +import codecs NOUN_RULES = ( @@ -84,7 +85,7 @@ def lemmatize(string, index, exceptions, rules): def read_index(loc): index = set() - for line in open(loc): + for line in codecs.open(loc, 'r', 'utf8'): if line.startswith(' '): continue pieces = line.split() @@ -96,7 +97,7 @@ def read_index(loc): def read_exc(loc): exceptions = {} - for line in open(loc): + for line in codecs.open(loc, 'r', 'utf8'): if line.startswith(' '): continue pieces = line.split()