mirror of https://github.com/explosion/spaCy.git
38 lines
806 B
Cython
38 lines
806 B
Cython
# cython: profile=True
|
|
from murmurhash cimport mrmr
|
|
|
|
|
|
cpdef bytes to_bytes(unicode string):
    """Encode a unicode string as UTF-8.

    Args:
        string: The text to encode.

    Returns:
        The UTF-8 encoded byte string.
    """
    cdef bytes encoded = string.encode('utf8')
    return encoded
|
|
|
|
|
|
cpdef unicode from_bytes(bytes string):
    """Decode a UTF-8 byte string into unicode text.

    Args:
        string: The UTF-8 encoded bytes to decode.

    Returns:
        The decoded unicode string.
    """
    cdef unicode decoded = string.decode('utf8')
    return decoded
|
|
|
|
|
|
cpdef unicode substr(unicode string, int start, int end, size_t length):
    """Return `string[start:end]` after clamping out-of-range bounds.

    Args:
        string: The text to slice.
        start: Index of the first character to keep. Values below zero, or
            at/after `length`, are treated as 0.
        end: Index one past the last character to keep. Values below zero,
            or at/after `length`, mean "up to `length`".
        length: Upper bound used for clamping; presumably `len(string)` --
            verify against callers.

    Returns:
        `string` itself when both bounds end up open, otherwise the
        requested slice.
    """
    # NOTE(review): `start` and `end` are C ints compared against a size_t
    # `length`; confirm how negative values behave under that comparison.
    # An end at or past `length` is handled like a negative (open) end.
    if end >= length:
        end = -1
    # A start at or past `length` falls back to the beginning of the string.
    if start >= length:
        start = 0

    # Both bounds are open: return the original object without slicing.
    if start <= 0 and end < 0:
        return string

    # At most one bound can still be negative here, so each is clamped
    # independently.
    if start < 0:
        start = 0
    if end < 0:
        end = length
    return string[start:end]
|
|
|
|
|
|
cdef bint is_whitespace(Py_UNICODE c):
    """Report whether `c` is a space, newline, or tab character.

    Only these three characters are recognised; every other character,
    including other Unicode space characters, yields False.
    """
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'
|