mirror of https://github.com/explosion/spaCy.git
Switch to better Python2/3 compatible unicode handling

parent 7606d9936f
commit 9c4d0aae62
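Background, not part of the commit: the old code cast the unicode object to Py_UNICODE* and hashed len(string) * sizeof(Py_UNICODE) bytes. Py_UNICODE is 2 bytes on narrow builds (and on Windows) but 4 bytes on wide builds, so the resulting keys were build-dependent, and the cast has no clean Python 3 equivalent. A minimal Python sketch of the width difference, using the array module's 'u' type code as a stand-in for Py_UNICODE:

    # Illustrative sketch (not from the commit): why hashing
    # len(string) * sizeof(Py_UNICODE) bytes is build-dependent.
    # The 'u' array type code models Py_UNICODE: itemsize is 2 on
    # narrow builds / Windows wchar_t, 4 on wide builds and Linux.
    import array

    string = u'caf\xe9'
    buf = array.array('u', string)
    print('bytes per code point:', buf.itemsize)
    print('buffer size the old code would hash:', len(string) * buf.itemsize)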
spacy/strings.pyx
@@ -3,6 +3,9 @@ import codecs
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64

+from cpython cimport PyUnicode_AS_DATA
+from cpython cimport PyUnicode_GET_DATA_SIZE
+
 from libc.stdint cimport int64_t

@@ -13,9 +16,10 @@ SEPARATOR = '\n|-SEP-|\n'


 cpdef hash_t hash_string(unicode string) except 0:
-    # This should probably use Py_UCS4 API, but I can't in Python2.7
-    chars = <Py_UNICODE*>string
-    return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
+    # This has to be like this for
+    chars = <char*>PyUnicode_AS_DATA(string)
+    size = PyUnicode_GET_DATA_SIZE(string)
+    return hash64(chars, size, 1)


 cdef unicode _decode(const Utf8Str* string):
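The effect of the change above: hash_string now hashes the byte buffer that PyUnicode_AS_DATA exposes, with seed 1, instead of a Py_UNICODE array with seed 0. A rough pure-Python approximation, a sketch only, using the third-party mmh3 MurmurHash bindings; it hashes an explicit UTF-8 encoding rather than the internal buffer, so the concrete values differ from the Cython version:

    # Sketch of the new hash_string() semantics using mmh3 (assumption:
    # mmh3 installed via pip). The real code hashes the internal buffer
    # from PyUnicode_AS_DATA/PyUnicode_GET_DATA_SIZE; we encode to UTF-8.
    import mmh3

    def hash_string(string):
        data = string.encode('utf8')   # stand-in for PyUnicode_AS_DATA
        # seed=1 mirrors hash64(chars, size, 1); mmh3.hash64 returns a
        # pair of 64-bit ints, of which we keep the first.
        return mmh3.hash64(data, 1)[0]

    print(hash_string(u'hello'))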
spacy/tokenizer.pyx
@@ -10,7 +10,6 @@ from cpython cimport Py_UNICODE_ISSPACE

 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from murmurhash.mrmr cimport hash64

 from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
@@ -81,25 +80,24 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef bint cache_hit
-        cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
+        cdef bint in_ws = False
         cdef unicode span
-        # Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
-        # But this is hard --- I need to acquire a pointer, but there's no
-        # Py_UCS4 API in Python 2.
-        cdef Py_UNICODE uc
-        cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
         # The task here is much like string.split, but not quite
         # We find spans of whitespace and non-space characters, and ignore
         # spans that are exactly ' '. So, our sequences will all be separated
         # by either ' ' or nothing.
-        for i in range(1, length):
-            uc = chars_ptr[i]
-            if Py_UNICODE_ISSPACE(uc) != in_ws:
+        for uc in string:
+            if uc.isspace() != in_ws:
                 if start < i:
-                    key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
+                    # When we want to make this fast, get the data buffer once
+                    # with PyUnicode_AS_DATA, and then maintain a start_byte
+                    # and end_byte, so we can call hash64 directly. That way
+                    # we don't have to create the slice when we hit the cache.
+                    span = string[start:i]
+                    key = hash_string(span)
                     cache_hit = self._try_cache(key, tokens)
                     if not cache_hit:
-                        self._tokenize(tokens, string[start:i], key)
+                        self._tokenize(tokens, span, key)
                 in_ws = not in_ws
                 if uc == ' ':
                     tokens.data[tokens.length - 1].spacy = True
@@ -107,11 +105,13 @@ cdef class Tokenizer:
                 else:
                     start = i
-        i += 1
+            i += 1
+        i += 1
         if start < i:
-            key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
+            span = string[start:]
+            key = hash_string(span)
             cache_hit = self._try_cache(key, tokens)
             if not cache_hit:
-                self._tokenize(tokens, string[start:], key)
+                self._tokenize(tokens, span, key)
         tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens

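The loop in the tokenizer hunk is, as its comments say, much like string.split but with single spaces folded into the preceding token. A standalone Python sketch of the same control flow; emit() is a hypothetical callback standing in for the cache lookup and _tokenize(), and followed_by_space mirrors the .spacy flag:

    # Sketch of the span-scanning loop: alternate between whitespace and
    # non-space spans; a span of exactly ' ' is skipped and recorded only
    # as the previous token's trailing-space flag.
    def scan_spans(string, emit):
        start = 0
        in_ws = False
        i = 0
        for uc in string:
            if uc.isspace() != in_ws:
                if start < i:
                    emit(string[start:i], followed_by_space=(uc == ' '))
                in_ws = not in_ws
                start = i + 1 if uc == ' ' else i
            i += 1
        if start < len(string):
            emit(string[start:], followed_by_space=string.endswith(' '))

    def show(span, followed_by_space):
        print(repr(span), followed_by_space)

    scan_spans(u'Hello  world!', show)   # 'Hello' True, ' ' False, 'world!' False

The in-code comment also records a planned optimization: fetch the data buffer once with PyUnicode_AS_DATA and track start_byte/end_byte offsets, so hash64 can run directly on the buffer and the unicode slice is only built on a cache miss.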