* Switch to better Python2/3 compatible unicode handling

Matthew Honnibal 2015-07-28 14:45:37 +02:00
parent 7606d9936f
commit 9c4d0aae62
2 changed files with 21 additions and 17 deletions

View File

@@ -3,6 +3,9 @@ import codecs
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64
+from cpython cimport PyUnicode_AS_DATA
+from cpython cimport PyUnicode_GET_DATA_SIZE
 from libc.stdint cimport int64_t

@@ -13,9 +16,10 @@ SEPARATOR = '\n|-SEP-|\n'
 cpdef hash_t hash_string(unicode string) except 0:
-    # This should probably use Py_UCS4 API, but I can't in Python2.7
-    chars = <Py_UNICODE*>string
-    return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
+    # This has to be like this for
+    chars = <char*>PyUnicode_AS_DATA(string)
+    size = PyUnicode_GET_DATA_SIZE(string)
+    return hash64(chars, size, 1)

 cdef unicode _decode(const Utf8Str* string):
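
The effect of this change is that hash_string() now keys on the string's underlying data buffer, obtained through PyUnicode_AS_DATA and PyUnicode_GET_DATA_SIZE, instead of casting the object to a Py_UNICODE* pointer, whose element width differs between narrow and wide builds and which is deprecated in Python 3. A rough pure-Python sketch of the same keying idea, with hashlib standing in for murmurhash's hash64 (the function name and the UTF-8/SHA-1 choice below are illustrative, not what the commit uses):

    import hashlib
    import struct

    def hash_string_sketch(string):
        # Illustrative stand-in for hash_string(): key on the string's
        # encoded bytes rather than on a Py_UNICODE* buffer, so the key
        # does not depend on the width of the build's code units. The
        # real code feeds PyUnicode_AS_DATA / PyUnicode_GET_DATA_SIZE
        # into murmurhash's hash64; UTF-8 plus SHA-1 is only a portable
        # approximation of that idea, not the same key.
        data = string.encode('utf8')
        return struct.unpack('<Q', hashlib.sha1(data).digest()[:8])[0]

    assert hash_string_sketch(u'spaCy') == hash_string_sketch(u'spaCy')
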

View File

@@ -10,7 +10,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from murmurhash.mrmr cimport hash64

 from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
@@ -81,25 +80,24 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
+        cdef bint in_ws = False
         cdef unicode span
-        # Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
-        # But this is hard --- I need to acquire a pointer, but there's no
-        # Py_UCS4 API in Python 2.
-        cdef Py_UNICODE uc
-        cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
-        for i in range(1, length):
-            uc = chars_ptr[i]
-            if Py_UNICODE_ISSPACE(uc) != in_ws:
+        for uc in string:
+            if uc.isspace() != in_ws:
                 if start < i:
-                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
+                    # When we want to make this fast, get the data buffer once
+                    # with PyUnicode_AS_DATA, and then maintain a start_byte
+                    # and end_byte, so we can call hash64 directly. That way
+                    # we don't have to create the slice when we hit the cache.
+                    span = string[start:i]
+                    key = hash_string(span)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, string[start:i], key)
+                        self._tokenize(tokens, span, key)
                 in_ws = not in_ws
                 if uc == ' ':
                     tokens.data[tokens.length - 1].spacy = True
@@ -107,11 +105,13 @@ cdef class Tokenizer:
                 else:
                     start = i
+            i += 1
         i += 1
         if start < i:
-            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
+            span = string[start:]
+            key = hash_string(span)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, string[start:], key)
+                self._tokenize(tokens, span, key)
             tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
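
Taken together, the tokenizer loop no longer walks a Py_UNICODE* array with Py_UNICODE_ISSPACE; it iterates the unicode object directly, tests uc.isspace(), and slices out each span to pass to hash_string() before the cache lookup. A Python-only sketch of that span-splitting logic, assuming a plain list in place of the cache and _tokenize machinery (split_spans_sketch is an illustrative name, not part of the codebase):

    def split_spans_sketch(string):
        # Illustrative Python-only version of the loop in Tokenizer.__call__
        # after this commit: walk the string once, track whether we are
        # inside whitespace, and emit every maximal run of space / non-space
        # characters except the single ' ' separators. The real method
        # hashes each span with hash_string() and checks a cache before
        # tokenizing it.
        spans = []
        start = 0
        i = 0
        in_ws = False
        for uc in string:
            if uc.isspace() != in_ws:
                if start < i:
                    spans.append(string[start:i])
                in_ws = not in_ws
                if uc == ' ':
                    start = i + 1
                else:
                    start = i
            i += 1
        i += 1
        if start < i:
            spans.append(string[start:])
        return spans

    print(split_spans_sketch(u'Hello,  world !'))  # ['Hello,', ' ', 'world', '!']

As the new comment in the diff notes, the per-span slice could later be avoided by hashing byte offsets into the data buffer directly; the slicing version is what the commit actually does, so the sketch keeps it.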