* Fix bytes handling problems under Python 3

2015-07-24 03:48:23 +02:00 · 2015-07-24 03:48:23 +02:00 · c4ff410fdb
parent ce984f471c
commit c4ff410fdb
4 changed files with 22 additions and 23 deletions
--- a/spacy/serialize/bits.pxd
+++ b/spacy/serialize/bits.pxd
@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil


 cdef class BitArray:
-    cdef bytes data
+    cdef bytearray data
    cdef uchar byte
    cdef uchar bit_of_byte
    cdef uint32_t i
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:


 cdef class BitArray:
-    def __init__(self, bytes data=b''):
-        self.data = data
+    def __init__(self, data=b''):
+        self.data = bytearray(data)
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0
@ -47,7 +47,7 @@ cdef class BitArray:
        start_bit = self.i % 8

        if start_bit != 0 and start_byte < len(self.data):
-            byte = ord(self.data[start_byte])
+            byte = self.data[start_byte]
            for i in range(start_bit, 8):
                self.i += 1
                yield 1 if (byte & (one << i)) else 0
@ -70,10 +70,10 @@ cdef class BitArray:

        # TODO portability
        cdef uchar[4] chars
-        chars[0] = <uchar>ord(self.data[start_byte])
-        chars[1] = <uchar>ord(self.data[start_byte+1])
-        chars[2] = <uchar>ord(self.data[start_byte+2])
-        chars[3] = <uchar>ord(self.data[start_byte+3])
+        chars[0] = self.data[start_byte]
+        chars[1] = self.data[start_byte+1]
+        chars[2] = self.data[start_byte+2]
+        chars[3] = self.data[start_byte+3]
        cdef uint32_t output
        memcpy(&output, chars, 4)
        self.i += 32
@ -85,8 +85,7 @@ cdef class BitArray:
            byte = chr(self.byte)
            # Jump through some hoops for Python3
            if isinstance(byte, unicode):
-                byte_char = <unsigned char>byte
-                return self.data + <bytes>&byte_char
+                return self.data + <bytes>(&self.byte)[:1]
            else:
                return self.data + chr(self.byte)
        else:
@ -101,7 +100,7 @@ cdef class BitArray:
        self.bit_of_byte += 1
        self.i += 1
        if self.bit_of_byte == 8:
-            self.data += chr(self.byte)
+            self.data += bytearray((self.byte,))
            self.byte = 0
            self.bit_of_byte = 0

--- a/spacy/serialize/huffman.pyx
+++ b/spacy/serialize/huffman.pyx
@ -110,14 +110,14 @@ cdef class HuffmanCodec:
        cdef int branch

        cdef int n_msg = msg.shape[0]
-        cdef bytes bytes_ = bits.as_bytes()
+        cdef bytearray bytes_ = bits.as_bytes()
        cdef unsigned char byte
        cdef int i_msg = 0
        cdef int i_byte = bits.i // 8
        cdef unsigned char i_bit = 0
        cdef unsigned char one = 1
        while i_msg < n_msg:
-            byte = ord(bytes_[i_byte])
+            byte = bytes_[i_byte]
            i_byte += 1
            for i_bit in range(8):
                branch = node.right if (byte & (one << i_bit)) else node.left
@ -138,11 +138,11 @@ cdef class HuffmanCodec:
        def __get__(self):
            output = []
            cdef int i, j
-            cdef bytes string
+            cdef unicode string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
-                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
+                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                string = string[::-1]
                output.append(string)
            return output
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
 def _gen_chars(Vocab vocab):
    cdef attr_t orth
    cdef size_t addr
-    char_weights = {chr(i): 1e-20 for i in range(256)}
+    char_weights = {i: 1e-20 for i in range(256)}
    cdef unicode string
    cdef bytes char
    cdef bytes utf8_str
@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
        string = vocab.strings[lex.orth]
        utf8_str = string.encode('utf8')
        for char in utf8_str:
-            char_weights.setdefault(char, 0.0)
-            char_weights[char] += c_exp(lex.prob)
-        char_weights[b' '] += c_exp(lex.prob)
+            char_weights.setdefault(ord(char), 0.0)
+            char_weights[ord(char)] += c_exp(lex.prob)
+        char_weights[ord(' ')] += c_exp(lex.prob)
    return char_weights.items()


@ -110,12 +110,12 @@ cdef class Packer:
                codec.encode(array[:, i], bits)
        return bits.as_bytes()

-    def unpack(self, bytes data):
+    def unpack(self, data):
        doc = Doc(self.vocab)
        self.unpack_into(data, doc)
        return doc

-    def unpack_into(self, bytes byte_string, Doc doc):
+    def unpack_into(self, byte_string, Doc doc):
        bits = BitArray(byte_string)
        bits.seek(0)
        cdef int32_t length = bits.read32()
@ -149,7 +149,7 @@ cdef class Packer:
        cdef int32_t length = len(utf8_str)
        # Signal chars with negative length
        bits.extend(-length, 32)
-        self.char_codec.encode(utf8_str, bits)
+        self.char_codec.encode(bytearray(utf8_str), bits)
        cdef int i, j
        for i in range(doc.length):
            for j in range(doc.data[i].lex.length-1):
@ -167,7 +167,7 @@ cdef class Packer:
        spaces = iter(bits)
        for i in range(n):
            orth = orths[i]
-            space = spaces.next()
+            space = next(spaces)
            lex = self.vocab.get_by_orth(doc.mem, orth)
            doc.push_back(lex, space)
        return doc