* Fix data reading for lemmatizer

This commit is contained in:
Matthew Honnibal 2015-01-05 06:01:32 +11:00
parent 72613a5fca
commit 477e7fbffe
1 changed file with 3 additions and 2 deletions

View File

@ -1,5 +1,6 @@
from __future__ import unicode_literals
from os import path
import codecs
NOUN_RULES = (
@ -84,7 +85,7 @@ def lemmatize(string, index, exceptions, rules):
def read_index(loc):
index = set()
for line in open(loc):
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()
@ -96,7 +97,7 @@ def read_index(loc):
def read_exc(loc):
exceptions = {}
for line in open(loc):
for line in codecs.open(loc, 'r', 'utf8'):
if line.startswith(' '):
continue
pieces = line.split()