* Fix bytes problems for Python3

This commit is contained in:
Matthew Honnibal 2015-07-24 03:48:23 +02:00
parent ce984f471c
commit c4ff410fdb
4 changed files with 22 additions and 23 deletions

View File

@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray:
cdef bytes data
cdef bytearray data
cdef uchar byte
cdef uchar bit_of_byte
cdef uint32_t i

View File

@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray:
def __init__(self, bytes data=b''):
self.data = data
def __init__(self, data=b''):
self.data = bytearray(data)
self.byte = 0
self.bit_of_byte = 0
self.i = 0
@ -47,7 +47,7 @@ cdef class BitArray:
start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte])
byte = self.data[start_byte]
for i in range(start_bit, 8):
self.i += 1
yield 1 if (byte & (one << i)) else 0
@ -70,10 +70,10 @@ cdef class BitArray:
# TODO portability: memcpy of these 4 bytes into a uint32_t depends on the host's byte order
cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte])
chars[1] = <uchar>ord(self.data[start_byte+1])
chars[2] = <uchar>ord(self.data[start_byte+2])
chars[3] = <uchar>ord(self.data[start_byte+3])
chars[0] = self.data[start_byte]
chars[1] = self.data[start_byte+1]
chars[2] = self.data[start_byte+2]
chars[3] = self.data[start_byte+3]
cdef uint32_t output
memcpy(&output, chars, 4)
self.i += 32
@ -85,8 +85,7 @@ cdef class BitArray:
byte = chr(self.byte)
# On Python 3, chr() returns a unicode string rather than a byte, so build the trailing byte from the C value instead
if isinstance(byte, unicode):
byte_char = <unsigned char>byte
return self.data + <bytes>&byte_char
return self.data + <bytes>(&self.byte)[:1]
else:
return self.data + chr(self.byte)
else:
@ -101,7 +100,7 @@ cdef class BitArray:
self.bit_of_byte += 1
self.i += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.data += bytearray((self.byte,))
self.byte = 0
self.bit_of_byte = 0

View File

@ -110,14 +110,14 @@ cdef class HuffmanCodec:
cdef int branch
cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes()
cdef bytearray bytes_ = bits.as_bytes()
cdef unsigned char byte
cdef int i_msg = 0
cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0
cdef unsigned char one = 1
while i_msg < n_msg:
byte = ord(bytes_[i_byte])
byte = bytes_[i_byte]
i_byte += 1
for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left
@ -138,11 +138,11 @@ cdef class HuffmanCodec:
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef unicode string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output

View File

@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
def _gen_chars(Vocab vocab):
cdef attr_t orth
cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)}
char_weights = {i: 1e-20 for i in range(256)}
cdef unicode string
cdef bytes char
cdef bytes utf8_str
@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8')
for char in utf8_str:
char_weights.setdefault(char, 0.0)
char_weights[char] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob)
char_weights.setdefault(ord(char), 0.0)
char_weights[ord(char)] += c_exp(lex.prob)
char_weights[ord(' ')] += c_exp(lex.prob)
return char_weights.items()
@ -110,12 +110,12 @@ cdef class Packer:
codec.encode(array[:, i], bits)
return bits.as_bytes()
def unpack(self, bytes data):
def unpack(self, data):
doc = Doc(self.vocab)
self.unpack_into(data, doc)
return doc
def unpack_into(self, bytes byte_string, Doc doc):
def unpack_into(self, byte_string, Doc doc):
bits = BitArray(byte_string)
bits.seek(0)
cdef int32_t length = bits.read32()
@ -149,7 +149,7 @@ cdef class Packer:
cdef int32_t length = len(utf8_str)
# Signal chars with negative length
bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits)
self.char_codec.encode(bytearray(utf8_str), bits)
cdef int i, j
for i in range(doc.length):
for j in range(doc.data[i].lex.length-1):
@ -167,7 +167,7 @@ cdef class Packer:
spaces = iter(bits)
for i in range(n):
orth = orths[i]
space = spaces.next()
space = next(spaces)
lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space)
return doc