* Switch to better Python 2/3-compatible Unicode handling

This commit is contained in:
Matthew Honnibal 2015-07-28 14:45:37 +02:00
parent 7606d9936f
commit 9c4d0aae62
2 changed files with 21 additions and 17 deletions

View File

@ -3,6 +3,9 @@ import codecs
from libc.string cimport memcpy from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from cpython cimport PyUnicode_AS_DATA
from cpython cimport PyUnicode_GET_DATA_SIZE
from libc.stdint cimport int64_t from libc.stdint cimport int64_t
@ -13,9 +16,10 @@ SEPARATOR = '\n|-SEP-|\n'
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(unicode string) except 0:
# This should probably use Py_UCS4 API, but I can't in Python2.7 # This has to be like this for
chars = <Py_UNICODE*>string chars = <char*>PyUnicode_AS_DATA(string)
return hash64(chars, len(string) * sizeof(Py_UNICODE), 0) size = PyUnicode_GET_DATA_SIZE(string)
return hash64(chars, size, 1)
cdef unicode _decode(const Utf8Str* string): cdef unicode _decode(const Utf8Str* string):

View File

@ -10,7 +10,6 @@ from cpython cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .morphology cimport set_morph_from_dict from .morphology cimport set_morph_from_dict
from .strings cimport hash_string from .strings cimport hash_string
@ -81,37 +80,38 @@ cdef class Tokenizer:
cdef int i = 0 cdef int i = 0
cdef int start = 0 cdef int start = 0
cdef bint cache_hit cdef bint cache_hit
cdef bint in_ws = Py_UNICODE_ISSPACE(string[0]) cdef bint in_ws = False
cdef unicode span cdef unicode span
# Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
# But this is hard --- I need to acquire a pointer, but there's no
# Py_UCS4 API in Python 2.
cdef Py_UNICODE uc
cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
# The task here is much like string.split, but not quite # The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore # We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated # spans that are exactly ' '. So, our sequences will all be separated
# by either ' ' or nothing. # by either ' ' or nothing.
for i in range(1, length): for uc in string:
uc = chars_ptr[i] if uc.isspace() != in_ws:
if Py_UNICODE_ISSPACE(uc) != in_ws:
if start < i: if start < i:
key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0) # When we want to make this fast, get the data buffer once
# with PyUnicode_AS_DATA, and then maintain a start_byte
# and end_byte, so we can call hash64 directly. That way
# we don't have to create the slice when we hit the cache.
span = string[start:i]
key = hash_string(span)
cache_hit = self._try_cache(key, tokens) cache_hit = self._try_cache(key, tokens)
if not cache_hit: if not cache_hit:
self._tokenize(tokens, string[start:i], key) self._tokenize(tokens, span, key)
in_ws = not in_ws in_ws = not in_ws
if uc == ' ': if uc == ' ':
tokens.data[tokens.length - 1].spacy = True tokens.data[tokens.length - 1].spacy = True
start = i + 1 start = i + 1
else: else:
start = i start = i
i += 1
i += 1 i += 1
if start < i: if start < i:
key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0) span = string[start:]
key = hash_string(span)
cache_hit = self._try_cache(key, tokens) cache_hit = self._try_cache(key, tokens)
if not cache_hit: if not cache_hit:
self._tokenize(tokens, string[start:], key) self._tokenize(tokens, span, key)
tokens.data[tokens.length - 1].spacy = string[-1] == ' ' tokens.data[tokens.length - 1].spacy = string[-1] == ' '
return tokens return tokens