* Fix bytes problems for Python3

This commit is contained in:
Matthew Honnibal 2015-07-24 03:48:23 +02:00
parent ce984f471c
commit c4ff410fdb
4 changed files with 22 additions and 23 deletions

View File

@@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray: cdef class BitArray:
cdef bytes data cdef bytearray data
cdef uchar byte cdef uchar byte
cdef uchar bit_of_byte cdef uchar bit_of_byte
cdef uint32_t i cdef uint32_t i

View File

@@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray: cdef class BitArray:
def __init__(self, bytes data=b''): def __init__(self, data=b''):
self.data = data self.data = bytearray(data)
self.byte = 0 self.byte = 0
self.bit_of_byte = 0 self.bit_of_byte = 0
self.i = 0 self.i = 0
@@ -47,7 +47,7 @@ cdef class BitArray:
start_bit = self.i % 8 start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data): if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte]) byte = self.data[start_byte]
for i in range(start_bit, 8): for i in range(start_bit, 8):
self.i += 1 self.i += 1
yield 1 if (byte & (one << i)) else 0 yield 1 if (byte & (one << i)) else 0
@@ -70,10 +70,10 @@ cdef class BitArray:
# TODO portability # TODO portability
cdef uchar[4] chars cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte]) chars[0] = self.data[start_byte]
chars[1] = <uchar>ord(self.data[start_byte+1]) chars[1] = self.data[start_byte+1]
chars[2] = <uchar>ord(self.data[start_byte+2]) chars[2] = self.data[start_byte+2]
chars[3] = <uchar>ord(self.data[start_byte+3]) chars[3] = self.data[start_byte+3]
cdef uint32_t output cdef uint32_t output
memcpy(&output, chars, 4) memcpy(&output, chars, 4)
self.i += 32 self.i += 32
@@ -85,8 +85,7 @@ cdef class BitArray:
byte = chr(self.byte) byte = chr(self.byte)
# Jump through some hoops for Python3 # Jump through some hoops for Python3
if isinstance(byte, unicode): if isinstance(byte, unicode):
byte_char = <unsigned char>byte return self.data + <bytes>(&self.byte)[:1]
return self.data + <bytes>&byte_char
else: else:
return self.data + chr(self.byte) return self.data + chr(self.byte)
else: else:
@@ -101,7 +100,7 @@ cdef class BitArray:
self.bit_of_byte += 1 self.bit_of_byte += 1
self.i += 1 self.i += 1
if self.bit_of_byte == 8: if self.bit_of_byte == 8:
self.data += chr(self.byte) self.data += bytearray((self.byte,))
self.byte = 0 self.byte = 0
self.bit_of_byte = 0 self.bit_of_byte = 0

View File

@@ -110,14 +110,14 @@ cdef class HuffmanCodec:
cdef int branch cdef int branch
cdef int n_msg = msg.shape[0] cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes() cdef bytearray bytes_ = bits.as_bytes()
cdef unsigned char byte cdef unsigned char byte
cdef int i_msg = 0 cdef int i_msg = 0
cdef int i_byte = bits.i // 8 cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0 cdef unsigned char i_bit = 0
cdef unsigned char one = 1 cdef unsigned char one = 1
while i_msg < n_msg: while i_msg < n_msg:
byte = ord(bytes_[i_byte]) byte = bytes_[i_byte]
i_byte += 1 i_byte += 1
for i_bit in range(8): for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left branch = node.right if (byte & (one << i_bit)) else node.left
@@ -138,11 +138,11 @@ cdef class HuffmanCodec:
def __get__(self): def __get__(self):
output = [] output = []
cdef int i, j cdef int i, j
cdef bytes string cdef unicode string
cdef Code code cdef Code code
for i in range(self.codes.size()): for i in range(self.codes.size()):
code = self.codes[i] code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0') string = '{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1] string = string[::-1]
output.append(string) output.append(string)
return output return output

View File

@@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
def _gen_chars(Vocab vocab): def _gen_chars(Vocab vocab):
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)} char_weights = {i: 1e-20 for i in range(256)}
cdef unicode string cdef unicode string
cdef bytes char cdef bytes char
cdef bytes utf8_str cdef bytes utf8_str
@@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
string = vocab.strings[lex.orth] string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8') utf8_str = string.encode('utf8')
for char in utf8_str: for char in utf8_str:
char_weights.setdefault(char, 0.0) char_weights.setdefault(ord(char), 0.0)
char_weights[char] += c_exp(lex.prob) char_weights[ord(char)] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob) char_weights[ord(' ')] += c_exp(lex.prob)
return char_weights.items() return char_weights.items()
@@ -110,12 +110,12 @@ cdef class Packer:
codec.encode(array[:, i], bits) codec.encode(array[:, i], bits)
return bits.as_bytes() return bits.as_bytes()
def unpack(self, bytes data): def unpack(self, data):
doc = Doc(self.vocab) doc = Doc(self.vocab)
self.unpack_into(data, doc) self.unpack_into(data, doc)
return doc return doc
def unpack_into(self, bytes byte_string, Doc doc): def unpack_into(self, byte_string, Doc doc):
bits = BitArray(byte_string) bits = BitArray(byte_string)
bits.seek(0) bits.seek(0)
cdef int32_t length = bits.read32() cdef int32_t length = bits.read32()
@@ -149,7 +149,7 @@ cdef class Packer:
cdef int32_t length = len(utf8_str) cdef int32_t length = len(utf8_str)
# Signal chars with negative length # Signal chars with negative length
bits.extend(-length, 32) bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits) self.char_codec.encode(bytearray(utf8_str), bits)
cdef int i, j cdef int i, j
for i in range(doc.length): for i in range(doc.length):
for j in range(doc.data[i].lex.length-1): for j in range(doc.data[i].lex.length-1):
@@ -167,7 +167,7 @@ cdef class Packer:
spaces = iter(bits) spaces = iter(bits)
for i in range(n): for i in range(n):
orth = orths[i] orth = orths[i]
space = spaces.next() space = next(spaces)
lex = self.vocab.get_by_orth(doc.mem, orth) lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space) doc.push_back(lex, space)
return doc return doc