From b740f2019108b2974fd650cb0aec5b91851eb564 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Sun, 6 Mar 2016 09:19:27 +0100
Subject: [PATCH] hash_string() should not depend on python's internal unicode representation, also fixes https://github.com/spacy-io/sense2vec/issues/5 for py2

---
 spacy/strings.pyx | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index ef8422aa0..d54dcdf1a 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -23,10 +23,8 @@ import ujson as json
 
 
 cpdef hash_t hash_string(unicode string) except 0:
-    # This has to be like this for
-    chars = PyUnicode_AS_DATA(string)
-    size = PyUnicode_GET_DATA_SIZE(string)
-    return hash64(chars, size, 1)
+    chars = string.encode('utf8')
+    return hash64(chars, len(chars), 1)
 
 
 cdef unicode _decode(const Utf8Str* string):