From c4ff410fdb90acafa3c1e58936fefbcb03619614 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 24 Jul 2015 03:48:23 +0200
Subject: [PATCH] * Fix bytes handling in spacy.serialize for Python 3

---
 spacy/serialize/bits.pxd    |  2 +-
 spacy/serialize/bits.pyx    | 19 +++++++++----------
 spacy/serialize/huffman.pyx |  8 ++++----
 spacy/serialize/packer.pyx  | 16 ++++++++--------
 4 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/spacy/serialize/bits.pxd b/spacy/serialize/bits.pxd
index fea5ad786..9c7593a92 100644
--- a/spacy/serialize/bits.pxd
+++ b/spacy/serialize/bits.pxd
@@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
 
 
 cdef class BitArray:
-    cdef bytes data
+    cdef bytearray data
     cdef uchar byte
     cdef uchar bit_of_byte
     cdef uint32_t i
diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx
index b062104ad..2f0fb30f6 100644
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
 
 
 cdef class BitArray:
-    def __init__(self, bytes data=b''):
-        self.data = data
+    def __init__(self, data=b''):
+        self.data = bytearray(data)
         self.byte = 0
         self.bit_of_byte = 0
         self.i = 0
@@ -47,7 +47,7 @@ cdef class BitArray:
         start_bit = self.i % 8
 
         if start_bit != 0 and start_byte < len(self.data):
-            byte = ord(self.data[start_byte])
+            byte = self.data[start_byte]
             for i in range(start_bit, 8):
                 self.i += 1
                 yield 1 if (byte & (one << i)) else 0
@@ -70,10 +70,10 @@ cdef class BitArray:
 
         # TODO portability
         cdef uchar[4] chars
-        chars[0] = <uchar>ord(self.data[start_byte])
-        chars[1] = <uchar>ord(self.data[start_byte+1])
-        chars[2] = <uchar>ord(self.data[start_byte+2])
-        chars[3] = <uchar>ord(self.data[start_byte+3])
+        chars[0] = self.data[start_byte]
+        chars[1] = self.data[start_byte+1]
+        chars[2] = self.data[start_byte+2]
+        chars[3] = self.data[start_byte+3]
         cdef uint32_t output
         memcpy(&output, chars, 4)
         self.i += 32
@@ -85,8 +85,7 @@ cdef class BitArray:
             byte = chr(self.byte)
             # Jump through some hoops for Python3
             if isinstance(byte, unicode):
-                byte_char = <unsigned char>byte
-                return self.data + <bytes>&byte_char
+                return self.data + <bytes>(&self.byte)[:1]
             else:
                 return self.data + chr(self.byte)
         else:
@@ -101,7 +100,7 @@ cdef class BitArray:
         self.bit_of_byte += 1
         self.i += 1
         if self.bit_of_byte == 8:
-            self.data += chr(self.byte)
+            self.data += bytearray((self.byte,))
             self.byte = 0
             self.bit_of_byte = 0
 
diff --git a/spacy/serialize/huffman.pyx b/spacy/serialize/huffman.pyx
index 54895d03e..1bed83d74 100644
--- a/spacy/serialize/huffman.pyx
+++ b/spacy/serialize/huffman.pyx
@@ -110,14 +110,14 @@ cdef class HuffmanCodec:
         cdef int branch
 
         cdef int n_msg = msg.shape[0]
-        cdef bytes bytes_ = bits.as_bytes()
+        cdef bytearray bytes_ = bits.as_bytes()
         cdef unsigned char byte
         cdef int i_msg = 0
         cdef int i_byte = bits.i // 8
         cdef unsigned char i_bit = 0
         cdef unsigned char one = 1
         while i_msg < n_msg:
-            byte = ord(bytes_[i_byte])
+            byte = bytes_[i_byte]
             i_byte += 1
             for i_bit in range(8):
                 branch = node.right if (byte & (one << i_bit)) else node.left
@@ -138,11 +138,11 @@ cdef class HuffmanCodec:
         def __get__(self):
             output = []
             cdef int i, j
-            cdef bytes string
+            cdef unicode string
             cdef Code code
             for i in range(self.codes.size()):
                 code = self.codes[i]
-                string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
+                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                 string = string[::-1]
                 output.append(string)
             return output
diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx
index bc0cad217..8acf478e0 100644
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
 def _gen_chars(Vocab vocab):
     cdef attr_t orth
     cdef size_t addr
-    char_weights = {chr(i): 1e-20 for i in range(256)}
+    char_weights = {i: 1e-20 for i in range(256)}
     cdef unicode string
     cdef bytes char
     cdef bytes utf8_str
@@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
         string = vocab.strings[lex.orth]
         utf8_str = string.encode('utf8')
         for char in utf8_str:
-            char_weights.setdefault(char, 0.0)
-            char_weights[char] += c_exp(lex.prob)
-        char_weights[b' '] += c_exp(lex.prob)
+            char_weights.setdefault(ord(char), 0.0)
+            char_weights[ord(char)] += c_exp(lex.prob)
+        char_weights[ord(' ')] += c_exp(lex.prob)
     return char_weights.items()
 
 
@@ -110,12 +110,12 @@ cdef class Packer:
                 codec.encode(array[:, i], bits)
         return bits.as_bytes()
 
-    def unpack(self, bytes data):
+    def unpack(self, data):
         doc = Doc(self.vocab)
         self.unpack_into(data, doc)
         return doc
 
-    def unpack_into(self, bytes byte_string, Doc doc):
+    def unpack_into(self, byte_string, Doc doc):
         bits = BitArray(byte_string)
         bits.seek(0)
         cdef int32_t length = bits.read32()
@@ -149,7 +149,7 @@ cdef class Packer:
         cdef int32_t length = len(utf8_str)
         # Signal chars with negative length
         bits.extend(-length, 32)
-        self.char_codec.encode(utf8_str, bits)
+        self.char_codec.encode(bytearray(utf8_str), bits)
         cdef int i, j
         for i in range(doc.length):
             for j in range(doc.data[i].lex.length-1):
@@ -167,7 +167,7 @@ cdef class Packer:
         spaces = iter(bits)
         for i in range(n):
             orth = orths[i]
-            space = spaces.next()
+            space = next(spaces)
             lex = self.vocab.get_by_orth(doc.mem, orth)
             doc.push_back(lex, space)
         return doc