* Add working to/from bytes API to Doc

2015-07-23 01:14:45 +02:00 · 2015-07-23 01:14:45 +02:00 · a0e36e8efc
parent 1f31d96bf9
commit a0e36e8efc
1 changed file with 8 additions and 21 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -71,17 +71,6 @@ cdef class Doc:
        self.is_tagged = False
        self.is_parsed = False
        self._py_tokens = []
-        cdef const LexemeC* lex
-        cdef attr_t orth
-        cdef bint space
-        if orths_and_spaces is not None:
-            for orth, space in orths_and_spaces:
-                lex = <LexemeC*>self.vocab._by_orth.get(orth)
-                if lex != NULL:
-                    assert lex.orth == orth
-                    self.push_back(lex, space)
-                else:
-                    raise Exception('Lexeme not found: %d' % orth)

    def __getitem__(self, object i):
        """Get a token.
@@ -303,12 +292,11 @@ cdef class Doc:
        return self

    def to_bytes(self):
-        bits = self.vocab.packer.pack(self)
-        return struct.pack('I', len(bits)) + bits.as_bytes()
+        byte_string = self.vocab.serializer.pack(self)
+        return struct.pack('I', len(byte_string)) + byte_string

-    def from_bytes(self, data):
-        bits = BitArray(data)
-        self.vocab.packer.unpack_into(bits, self)
+    def from_bytes(self, bytes data):
+        self.vocab.serializer.unpack_into(data[4:], self)
        return self
    
    @staticmethod
@@ -316,15 +304,14 @@ cdef class Doc:
        keep_reading = True
        while keep_reading:
            try:
-                n_bits_str = file_.read(4)
-                if len(n_bits_str) < 4:
+                n_bytes_str = file_.read(4)
+                if len(n_bytes_str) < 4:
                    break
-                n_bits = struct.unpack('I', n_bits_str)[0]
-                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                n_bytes = struct.unpack('I', n_bytes_str)[0]
                data = file_.read(n_bytes)
            except StopIteration:
                keep_reading = False
-            yield data
+            yield n_bytes_str + data

    # This function is terrible --- need to fix this.
    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,