From 1601e488eee1c7ce87a1081936cb0ca990753233 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 27 Jul 2015 21:43:58 +0200
Subject: [PATCH] * Fix bug in decoding non-ascii characters

---
 spacy/serialize/packer.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx
index c3a0bfe3d..976bc1c35 100644
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -152,7 +152,7 @@ cdef class Packer:
         cdef int32_t length = len(utf8_str)
         # Signal chars with negative length
         bits.extend(-length, 32)
-        self.char_codec.encode(bytearray(utf8_str), bits)
+        self.char_codec.encode(utf8_str, bits)
         cdef int i, j
         for i in range(doc.length):
             for j in range(doc.data[i].lex.length-1):
@@ -175,24 +175,24 @@ cdef class Packer:
             doc.push_back(lex, space)
         return doc
 
-    def _char_decode(self, BitArray bits, int32_t n, Doc doc):
-        cdef bytearray utf8_str = bytearray(n)
+    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
+        cdef bytearray utf8_str = bytearray(n_bytes)
         self.char_codec.decode(bits, utf8_str)
         cdef unicode string = utf8_str.decode('utf8')
         cdef int start = 0
         cdef bint is_spacy
-        cdef int length = len(string)
+        cdef int n_unicode_chars = len(string)
         cdef int i = 0
         cdef bint is_end_token
         for is_end_token in bits:
             if is_end_token:
                 span = string[start:i+1]
                 lex = self.vocab.get(doc.mem, span)
-                is_spacy = (i+1) < length and string[i+1] == u' '
+                is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
                 doc.push_back(lex, is_spacy)
                 start = i + 1 + is_spacy
             i += 1
-            if i >= n:
+            if i >= n_unicode_chars:
                 break
         return doc
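
Note on the fix: _char_decode is handed the UTF-8 byte count (now named n_bytes), but its loop index i walks the decoded unicode string, so the old check `if i >= n: break` compared a character index against a byte count and broke too late whenever the text contained non-ASCII characters. A minimal plain-Python sketch of the mismatch (the example string is hypothetical, not taken from the spaCy code or tests):

# -*- coding: utf-8 -*-
# Illustrative only: a UTF-8 byte count is not a valid bound for an index
# into the decoded unicode string once non-ASCII characters appear.
text = u"naïve"                   # hypothetical example string
utf8_str = text.encode('utf8')

n_bytes = len(utf8_str)           # 6 -- 'ï' takes two bytes in UTF-8
n_unicode_chars = len(text)       # 5 -- five characters after decoding

assert n_bytes == 6 and n_unicode_chars == 5
# Breaking the per-character loop on `i >= n_bytes` therefore runs one step
# too far here; breaking on `i >= n_unicode_chars`, as the patch does,
# stops exactly at the end of the decoded string.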