mirror of https://github.com/explosion/spaCy.git
* Fix bytes problems for Python3
This commit is contained in:
parent
ce984f471c
commit
c4ff410fdb
|
@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
|
||||||
|
|
||||||
|
|
||||||
cdef class BitArray:
|
cdef class BitArray:
|
||||||
cdef bytes data
|
cdef bytearray data
|
||||||
cdef uchar byte
|
cdef uchar byte
|
||||||
cdef uchar bit_of_byte
|
cdef uchar bit_of_byte
|
||||||
cdef uint32_t i
|
cdef uint32_t i
|
||||||
|
|
|
@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef class BitArray:
|
cdef class BitArray:
|
||||||
def __init__(self, bytes data=b''):
|
def __init__(self, data=b''):
|
||||||
self.data = data
|
self.data = bytearray(data)
|
||||||
self.byte = 0
|
self.byte = 0
|
||||||
self.bit_of_byte = 0
|
self.bit_of_byte = 0
|
||||||
self.i = 0
|
self.i = 0
|
||||||
|
@ -47,7 +47,7 @@ cdef class BitArray:
|
||||||
start_bit = self.i % 8
|
start_bit = self.i % 8
|
||||||
|
|
||||||
if start_bit != 0 and start_byte < len(self.data):
|
if start_bit != 0 and start_byte < len(self.data):
|
||||||
byte = ord(self.data[start_byte])
|
byte = self.data[start_byte]
|
||||||
for i in range(start_bit, 8):
|
for i in range(start_bit, 8):
|
||||||
self.i += 1
|
self.i += 1
|
||||||
yield 1 if (byte & (one << i)) else 0
|
yield 1 if (byte & (one << i)) else 0
|
||||||
|
@ -70,10 +70,10 @@ cdef class BitArray:
|
||||||
|
|
||||||
# TODO portability
|
# TODO portability
|
||||||
cdef uchar[4] chars
|
cdef uchar[4] chars
|
||||||
chars[0] = <uchar>ord(self.data[start_byte])
|
chars[0] = self.data[start_byte]
|
||||||
chars[1] = <uchar>ord(self.data[start_byte+1])
|
chars[1] = self.data[start_byte+1]
|
||||||
chars[2] = <uchar>ord(self.data[start_byte+2])
|
chars[2] = self.data[start_byte+2]
|
||||||
chars[3] = <uchar>ord(self.data[start_byte+3])
|
chars[3] = self.data[start_byte+3]
|
||||||
cdef uint32_t output
|
cdef uint32_t output
|
||||||
memcpy(&output, chars, 4)
|
memcpy(&output, chars, 4)
|
||||||
self.i += 32
|
self.i += 32
|
||||||
|
@ -85,8 +85,7 @@ cdef class BitArray:
|
||||||
byte = chr(self.byte)
|
byte = chr(self.byte)
|
||||||
# Jump through some hoops for Python3
|
# Jump through some hoops for Python3
|
||||||
if isinstance(byte, unicode):
|
if isinstance(byte, unicode):
|
||||||
byte_char = <unsigned char>byte
|
return self.data + <bytes>(&self.byte)[:1]
|
||||||
return self.data + <bytes>&byte_char
|
|
||||||
else:
|
else:
|
||||||
return self.data + chr(self.byte)
|
return self.data + chr(self.byte)
|
||||||
else:
|
else:
|
||||||
|
@ -101,7 +100,7 @@ cdef class BitArray:
|
||||||
self.bit_of_byte += 1
|
self.bit_of_byte += 1
|
||||||
self.i += 1
|
self.i += 1
|
||||||
if self.bit_of_byte == 8:
|
if self.bit_of_byte == 8:
|
||||||
self.data += chr(self.byte)
|
self.data += bytearray((self.byte,))
|
||||||
self.byte = 0
|
self.byte = 0
|
||||||
self.bit_of_byte = 0
|
self.bit_of_byte = 0
|
||||||
|
|
||||||
|
|
|
@ -110,14 +110,14 @@ cdef class HuffmanCodec:
|
||||||
cdef int branch
|
cdef int branch
|
||||||
|
|
||||||
cdef int n_msg = msg.shape[0]
|
cdef int n_msg = msg.shape[0]
|
||||||
cdef bytes bytes_ = bits.as_bytes()
|
cdef bytearray bytes_ = bits.as_bytes()
|
||||||
cdef unsigned char byte
|
cdef unsigned char byte
|
||||||
cdef int i_msg = 0
|
cdef int i_msg = 0
|
||||||
cdef int i_byte = bits.i // 8
|
cdef int i_byte = bits.i // 8
|
||||||
cdef unsigned char i_bit = 0
|
cdef unsigned char i_bit = 0
|
||||||
cdef unsigned char one = 1
|
cdef unsigned char one = 1
|
||||||
while i_msg < n_msg:
|
while i_msg < n_msg:
|
||||||
byte = ord(bytes_[i_byte])
|
byte = bytes_[i_byte]
|
||||||
i_byte += 1
|
i_byte += 1
|
||||||
for i_bit in range(8):
|
for i_bit in range(8):
|
||||||
branch = node.right if (byte & (one << i_bit)) else node.left
|
branch = node.right if (byte & (one << i_bit)) else node.left
|
||||||
|
@ -138,11 +138,11 @@ cdef class HuffmanCodec:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
output = []
|
output = []
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef bytes string
|
cdef unicode string
|
||||||
cdef Code code
|
cdef Code code
|
||||||
for i in range(self.codes.size()):
|
for i in range(self.codes.size()):
|
||||||
code = self.codes[i]
|
code = self.codes[i]
|
||||||
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
|
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
|
||||||
string = string[::-1]
|
string = string[::-1]
|
||||||
output.append(string)
|
output.append(string)
|
||||||
return output
|
return output
|
||||||
|
|
|
@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
|
||||||
def _gen_chars(Vocab vocab):
|
def _gen_chars(Vocab vocab):
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
char_weights = {chr(i): 1e-20 for i in range(256)}
|
char_weights = {i: 1e-20 for i in range(256)}
|
||||||
cdef unicode string
|
cdef unicode string
|
||||||
cdef bytes char
|
cdef bytes char
|
||||||
cdef bytes utf8_str
|
cdef bytes utf8_str
|
||||||
|
@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
|
||||||
string = vocab.strings[lex.orth]
|
string = vocab.strings[lex.orth]
|
||||||
utf8_str = string.encode('utf8')
|
utf8_str = string.encode('utf8')
|
||||||
for char in utf8_str:
|
for char in utf8_str:
|
||||||
char_weights.setdefault(char, 0.0)
|
char_weights.setdefault(ord(char), 0.0)
|
||||||
char_weights[char] += c_exp(lex.prob)
|
char_weights[ord(char)] += c_exp(lex.prob)
|
||||||
char_weights[b' '] += c_exp(lex.prob)
|
char_weights[ord(' ')] += c_exp(lex.prob)
|
||||||
return char_weights.items()
|
return char_weights.items()
|
||||||
|
|
||||||
|
|
||||||
|
@ -110,12 +110,12 @@ cdef class Packer:
|
||||||
codec.encode(array[:, i], bits)
|
codec.encode(array[:, i], bits)
|
||||||
return bits.as_bytes()
|
return bits.as_bytes()
|
||||||
|
|
||||||
def unpack(self, bytes data):
|
def unpack(self, data):
|
||||||
doc = Doc(self.vocab)
|
doc = Doc(self.vocab)
|
||||||
self.unpack_into(data, doc)
|
self.unpack_into(data, doc)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def unpack_into(self, bytes byte_string, Doc doc):
|
def unpack_into(self, byte_string, Doc doc):
|
||||||
bits = BitArray(byte_string)
|
bits = BitArray(byte_string)
|
||||||
bits.seek(0)
|
bits.seek(0)
|
||||||
cdef int32_t length = bits.read32()
|
cdef int32_t length = bits.read32()
|
||||||
|
@ -149,7 +149,7 @@ cdef class Packer:
|
||||||
cdef int32_t length = len(utf8_str)
|
cdef int32_t length = len(utf8_str)
|
||||||
# Signal chars with negative length
|
# Signal chars with negative length
|
||||||
bits.extend(-length, 32)
|
bits.extend(-length, 32)
|
||||||
self.char_codec.encode(utf8_str, bits)
|
self.char_codec.encode(bytearray(utf8_str), bits)
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
for i in range(doc.length):
|
for i in range(doc.length):
|
||||||
for j in range(doc.data[i].lex.length-1):
|
for j in range(doc.data[i].lex.length-1):
|
||||||
|
@ -167,7 +167,7 @@ cdef class Packer:
|
||||||
spaces = iter(bits)
|
spaces = iter(bits)
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
orth = orths[i]
|
orth = orths[i]
|
||||||
space = spaces.next()
|
space = next(spaces)
|
||||||
lex = self.vocab.get_by_orth(doc.mem, orth)
|
lex = self.vocab.get_by_orth(doc.mem, orth)
|
||||||
doc.push_back(lex, space)
|
doc.push_back(lex, space)
|
||||||
return doc
|
return doc
|
||||||
|
|
Loading…
Reference in New Issue