* Fix Issue #51: Handle non-ascii lemmas correctly

This commit is contained in:
Matthew Honnibal 2015-04-13 22:28:59 +02:00
parent bf0aff5124
commit c6707778dd
1 changed file with 2 additions and 1 deletion

View File

@@ -331,7 +331,8 @@ cdef class EnPosTagger:
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
lemma_string = sorted(lemma_strings)[0]
- lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
+ bytes_string = lemma_string.encode('utf8')
+ lemma = self.strings.intern(bytes_string, len(bytes_string)).i
return lemma
def load_morph_exceptions(self, dict exc):