diff --git a/spacy/serialize/bits.pxd b/spacy/serialize/bits.pxd index fea5ad786..9c7593a92 100644 --- a/spacy/serialize/bits.pxd +++ b/spacy/serialize/bits.pxd @@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil cdef class BitArray: - cdef bytes data + cdef bytearray data cdef uchar byte cdef uchar bit_of_byte cdef uint32_t i diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx index b062104ad..2f0fb30f6 100644 --- a/spacy/serialize/bits.pyx +++ b/spacy/serialize/bits.pyx @@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil: cdef class BitArray: - def __init__(self, bytes data=b''): - self.data = data + def __init__(self, data=b''): + self.data = bytearray(data) self.byte = 0 self.bit_of_byte = 0 self.i = 0 @@ -47,7 +47,7 @@ cdef class BitArray: start_bit = self.i % 8 if start_bit != 0 and start_byte < len(self.data): - byte = ord(self.data[start_byte]) + byte = self.data[start_byte] for i in range(start_bit, 8): self.i += 1 yield 1 if (byte & (one << i)) else 0 @@ -70,10 +70,10 @@ cdef class BitArray: # TODO portability cdef uchar[4] chars - chars[0] = ord(self.data[start_byte]) - chars[1] = ord(self.data[start_byte+1]) - chars[2] = ord(self.data[start_byte+2]) - chars[3] = ord(self.data[start_byte+3]) + chars[0] = self.data[start_byte] + chars[1] = self.data[start_byte+1] + chars[2] = self.data[start_byte+2] + chars[3] = self.data[start_byte+3] cdef uint32_t output memcpy(&output, chars, 4) self.i += 32 @@ -85,8 +85,7 @@ cdef class BitArray: byte = chr(self.byte) # Jump through some hoops for Python3 if isinstance(byte, unicode): - byte_char = byte - return self.data + &byte_char + return self.data + (&self.byte)[:1] else: return self.data + chr(self.byte) else: @@ -101,7 +100,7 @@ cdef class BitArray: self.bit_of_byte += 1 self.i += 1 if self.bit_of_byte == 8: - self.data += chr(self.byte) + self.data += bytearray((self.byte,)) self.byte = 0 self.bit_of_byte = 0 diff --git a/spacy/serialize/huffman.pyx b/spacy/serialize/huffman.pyx index 54895d03e..1bed83d74 100644 --- a/spacy/serialize/huffman.pyx +++ b/spacy/serialize/huffman.pyx @@ -110,14 +110,14 @@ cdef class HuffmanCodec: cdef int branch cdef int n_msg = msg.shape[0] - cdef bytes bytes_ = bits.as_bytes() + cdef bytearray bytes_ = bits.as_bytes() cdef unsigned char byte cdef int i_msg = 0 cdef int i_byte = bits.i // 8 cdef unsigned char i_bit = 0 cdef unsigned char one = 1 while i_msg < n_msg: - byte = ord(bytes_[i_byte]) + byte = bytes_[i_byte] i_byte += 1 for i_bit in range(8): branch = node.right if (byte & (one << i_bit)) else node.left @@ -138,11 +138,11 @@ cdef class HuffmanCodec: def __get__(self): output = [] cdef int i, j - cdef bytes string + cdef unicode string cdef Code code for i in range(self.codes.size()): code = self.codes[i] - string = b'{0:b}'.format(code.bits).rjust(code.length, '0') + string = '{0:b}'.format(code.bits).rjust(code.length, '0') string = string[::-1] output.append(string) return output diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index bc0cad217..8acf478e0 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab): def _gen_chars(Vocab vocab): cdef attr_t orth cdef size_t addr - char_weights = {chr(i): 1e-20 for i in range(256)} + char_weights = {i: 1e-20 for i in range(256)} cdef unicode string cdef bytes char cdef bytes utf8_str @@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab): string = vocab.strings[lex.orth] utf8_str = string.encode('utf8') for char in utf8_str: - char_weights.setdefault(char, 0.0) - char_weights[char] += c_exp(lex.prob) - char_weights[b' '] += c_exp(lex.prob) + char_weights.setdefault(ord(char), 0.0) + char_weights[ord(char)] += c_exp(lex.prob) + char_weights[ord(' ')] += c_exp(lex.prob) return char_weights.items() @@ -110,12 +110,12 @@ cdef class Packer: codec.encode(array[:, i], bits) return bits.as_bytes() - def unpack(self, bytes data): + def unpack(self, data): doc = Doc(self.vocab) self.unpack_into(data, doc) return doc - def unpack_into(self, bytes byte_string, Doc doc): + def unpack_into(self, byte_string, Doc doc): bits = BitArray(byte_string) bits.seek(0) cdef int32_t length = bits.read32() @@ -149,7 +149,7 @@ cdef class Packer: cdef int32_t length = len(utf8_str) # Signal chars with negative length bits.extend(-length, 32) - self.char_codec.encode(utf8_str, bits) + self.char_codec.encode(bytearray(utf8_str), bits) cdef int i, j for i in range(doc.length): for j in range(doc.data[i].lex.length-1): @@ -167,7 +167,7 @@ cdef class Packer: spaces = iter(bits) for i in range(n): orth = orths[i] - space = spaces.next() + space = next(spaces) lex = self.vocab.get_by_orth(doc.mem, orth) doc.push_back(lex, space) return doc