From c6707778dd12536ef2655c844555710dd1e5bdcc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 13 Apr 2015 22:28:59 +0200 Subject: [PATCH] * Fix Issue #51: Handle non-ascii lemmas correctly --- spacy/en/pos.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index c5d7e126d..8b4e0730e 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -331,7 +331,8 @@ cdef class EnPosTagger: cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, pos) lemma_string = sorted(lemma_strings)[0] - lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i + bytes_string = lemma_string.encode('utf8') + lemma = self.strings.intern(bytes_string, len(bytes_string)).i return lemma def load_morph_exceptions(self, dict exc):