mirror of https://github.com/explosion/spaCy.git
* Use Py_UNICODE in tokenizer for now, while sorting out Py_UCS4 stuff
This commit is contained in:
parent
4d61239eac
commit
2fc66e3723
|
@ -80,34 +80,37 @@ cdef class Tokenizer:
|
|||
cdef int i = 0
|
||||
cdef int start = 0
|
||||
cdef bint cache_hit
|
||||
chars = <Py_UNICODE*>string
|
||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||
cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
|
||||
cdef unicode span
|
||||
# Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
|
||||
# But this is hard --- I need to acquire a pointer, but there's no
|
||||
# Py_UCS4 API in Python 2.
|
||||
cdef Py_UNICODE uc
|
||||
cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
|
||||
# The task here is much like string.split, but not quite
|
||||
# We find spans of whitespace and non-space characters, and ignore
|
||||
# spans that are exactly ' '. So, our sequences will all be separated
|
||||
# by either ' ' or nothing.
|
||||
for i in range(1, length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
for i range(1, length):
|
||||
uc = chars_ptr[i]
|
||||
if Py_UNICODE_ISSPACE(uc) != in_ws:
|
||||
if start < i:
|
||||
span = string[start:i]
|
||||
key = hash_string(span)
|
||||
key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
|
||||
cache_hit = self._try_cache(key, tokens)
|
||||
if not cache_hit:
|
||||
self._tokenize(tokens, span, key)
|
||||
self._tokenize(tokens, string[start:i], key)
|
||||
in_ws = not in_ws
|
||||
start = i
|
||||
if chars[i] == ' ':
|
||||
if uc == ' ':
|
||||
tokens.data[tokens.length - 1].spacy = True
|
||||
start += 1
|
||||
start = i + 1
|
||||
else:
|
||||
start = i
|
||||
i += 1
|
||||
if start < i:
|
||||
span = string[start:i]
|
||||
key = hash_string(span)
|
||||
key = hash64(chars_ptr, (i - start) * sizeof(Py_UNICODE), 0)
|
||||
cache_hit = self._try_cache(key, tokens)
|
||||
if not cache_hit:
|
||||
self._tokenize(tokens, span, key)
|
||||
|
||||
self._tokenize(tokens, string[start:], key)
|
||||
tokens.data[tokens.length - 1].spacy = string[-1] == ' '
|
||||
return tokens
|
||||
|
||||
|
|
Loading…
Reference in New Issue