From 4d61239eac78077430f7c37e2ea71c831238e15e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Wed, 22 Jul 2015 04:53:01 +0200
Subject: [PATCH] * Reorganize the serialization functions on Doc

---
 spacy/tokens/doc.pyx | 123 ++++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 59 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8f9e698b4..d67afeed7 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -5,7 +5,6 @@ import numpy
 import struct
 
 from ..lexeme cimport EMPTY_LEXEME
-from ..strings cimport slice_unicode
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@@ -15,7 +14,6 @@ from ..parts_of_speech cimport CONJ, PUNCT
 from ..lexeme cimport check_flag
 from ..lexeme cimport get_attr as get_lex_attr
 from .spans import Span
-from ..structs cimport UniStr
 from .token cimport Token
 from ..serialize.bits cimport BitArray
 
@@ -222,16 +220,16 @@ cdef class Doc:
         """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
         by the values of the given attribute ID.
 
-          >>> from spacy.en import English, attrs
-          >>> nlp = English()
-          >>> tokens = nlp(u'apple apple orange banana')
-          >>> tokens.count_by(attrs.ORTH)
-          {12800L: 1, 11880L: 2, 7561L: 1}
-          >>> tokens.to_array([attrs.ORTH])
-          array([[11880],
-                 [11880],
-                 [ 7561],
-                 [12800]])
+        >>> from spacy.en import English, attrs
+        >>> nlp = English()
+        >>> tokens = nlp(u'apple apple orange banana')
+        >>> tokens.count_by(attrs.ORTH)
+        {12800L: 1, 11880L: 2, 7561L: 1}
+        >>> tokens.to_array([attrs.ORTH])
+        array([[11880],
+               [11880],
+               [ 7561],
+               [12800]])
         """
         cdef int i
         cdef attr_t attr
@@ -273,10 +271,62 @@ cdef class Doc:
     cdef int set_parse(self, const TokenC* parsed) except -1:
         # TODO: This method is fairly misleading atm. It's used by Parser
         # to actually apply the parse calculated. Need to rethink this.
+
+        # Probably we should use from_array?
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
 
+    def from_array(self, attrs, array):
+        cdef int i, col
+        cdef attr_id_t attr_id
+        cdef TokenC* tokens = self.data
+        cdef int length = len(array)
+        for col, attr_id in enumerate(attrs): 
+            values = array[:, col]
+            if attr_id == HEAD:
+                # TODO: Set left and right children
+                for i in range(length):
+                    tokens[i].head = values[i]
+            elif attr_id == TAG:
+                for i in range(length):
+                    tokens[i].tag = values[i]
+            elif attr_id == DEP:
+                for i in range(length):
+                    tokens[i].dep = values[i]
+            elif attr_id == ENT_IOB:
+                for i in range(length):
+                    tokens[i].ent_iob = values[i]
+            elif attr_id == ENT_TYPE:
+                for i in range(length):
+                    tokens[i].ent_type = values[i]
+        return self
+
+    def to_bytes(self):
+        bits = self.vocab.packer.pack(self)
+        return struct.pack('I', len(bits)) + bits.as_bytes()
+
+    def from_bytes(self, data):
+        bits = BitArray(data)
+        self.vocab.packer.unpack_into(bits, self)
+        return self
+    
+    @staticmethod
+    def read_bytes(file_):
+        keep_reading = True
+        while keep_reading:
+            try:
+                n_bits_str = file_.read(4)
+                if len(n_bits_str) < 4:
+                    break
+                n_bits = struct.unpack('I', n_bits_str)[0]
+                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                data = file_.read(n_bytes)
+            except StopIteration:
+                keep_reading = False
+            yield data
+
+    # This function is terrible --- need to fix this.
     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
               unicode ent_type):
         """Merge a multi-word expression into a single token.  Currently
@@ -296,9 +346,8 @@ cdef class Doc:
             return None
         cdef unicode string = self.string
         # Get LexemeC for newly merged token
-        cdef UniStr new_orth_c
-        slice_unicode(&new_orth_c, string, start_idx, end_idx)
-        cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
+        new_orth = string[start_idx:end_idx]
+        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.data[start]
         # Update fields
@@ -361,47 +410,3 @@ cdef class Doc:
 
         # Return the merged Python object
         return self[start]
-
-    def from_array(self, attrs, array):
-        cdef int i, col
-        cdef attr_id_t attr_id
-        cdef TokenC* tokens = self.data
-        cdef int length = len(array)
-        for col, attr_id in enumerate(attrs): 
-            values = array[:, col]
-            if attr_id == HEAD:
-                for i in range(length):
-                    tokens[i].head = values[i]
-            elif attr_id == TAG:
-                for i in range(length):
-                    tokens[i].tag = values[i]
-            elif attr_id == DEP:
-                for i in range(length):
-                    tokens[i].dep = values[i]
-            elif attr_id == ENT_IOB:
-                for i in range(length):
-                    tokens[i].ent_iob = values[i]
-            elif attr_id == ENT_TYPE:
-                for i in range(length):
-                    tokens[i].ent_type = values[i]
-
-    def to_bytes(self):
-        bits = self.vocab.packer.pack(self)
-        return struct.pack('I', len(bits)) + bits.as_bytes()
-
-    @staticmethod
-    def from_bytes(Vocab vocab, file_):
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bits_str = file_.read(4)
-                if len(n_bits_str) < 4:
-                    break
-                n_bits = struct.unpack('I', n_bits_str)[0]
-                n_bytes = n_bits // 8 + bool(n_bits % 8)
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            bits = BitArray(data)
-            doc = vocab.packer.unpack(bits)
-            yield doc