Implement Doc.to_bytes and Doc.from_bytes methods

This commit is contained in:
Matthew Honnibal 2017-05-09 18:11:34 +02:00
parent 9e167b7bb6
commit 1166b0c491
1 changed file with 35 additions and 3 deletions

View File

@ -6,6 +6,7 @@ cimport numpy as np
import numpy import numpy
import numpy.linalg import numpy.linalg
import struct import struct
import dill
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
@ -18,7 +19,7 @@ from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
@ -609,13 +610,44 @@ cdef class Doc:
""" """
Serialize, producing a byte string. Serialize, producing a byte string.
""" """
raise NotImplementedError return dill.dumps(
(self.text,
self.to_array([LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]),
self.sentiment,
self.tensor,
self.noun_chunks_iterator,
self.user_data,
(self.user_hooks, self.user_token_hooks, self.user_span_hooks)),
protocol=-1)
def from_bytes(self, data):
    """
    Deserialize, loading from bytes.

    The payload is expected to be the tuple written by `to_bytes`:
    `(text, attr_array, sentiment, tensor, noun_chunks_iterator,
    user_data, (user_hooks, user_token_hooks, user_span_hooks))`, where
    column 0 of `attr_array` is LENGTH, column 1 is SPACY and the remaining
    columns are TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE.

    WARNING: the payload is decoded with `dill.loads`, which (like pickle)
    can execute arbitrary code while loading. Only call this on bytes from
    a trusted source.

    data: The byte string to load from.
    Raises: ValueError if this Doc already contains tokens.
    """
    cdef int[:, :] attrs
    cdef int i, start, end, has_space
    cdef const LexemeC* lex
    cdef unicode orth_
    if self.length != 0:
        raise ValueError("Cannot load into non-empty Doc")
    # SECURITY: dill.loads is unsafe on untrusted input (see docstring).
    fields = dill.loads(data)
    # Unpack the whole payload into locals before touching `self`, so a
    # malformed payload fails here instead of leaving the Doc half-updated.
    text, attrs = fields[:2]
    sentiment, tensor = fields[2:4]
    noun_chunks_iterator, user_data = fields[4:6]
    user_hooks, user_token_hooks, user_span_hooks = fields[6]
    self.sentiment = sentiment
    self.tensor = tensor
    self.noun_chunks_iterator = noun_chunks_iterator
    self.user_data = user_data
    self.user_hooks = user_hooks
    self.user_token_hooks = user_token_hooks
    self.user_span_hooks = user_span_hooks
    # Rebuild the tokens by slicing the text with each row's LENGTH
    # (column 0) and SPACY flag (column 1).
    start = 0
    for i in range(attrs.shape[0]):
        end = start + attrs[i, 0]
        has_space = attrs[i, 1]
        orth_ = text[start:end]
        lex = self.vocab.get(self.mem, orth_)
        self.push_back(lex, has_space)
        # Step past the token and, when has_space is set, one trailing
        # character (presumably has_space is 0 or 1 -- TODO confirm).
        start = end + has_space
    # NOTE(review): the array is passed first and the attribute IDs second;
    # confirm this matches the parameter order of Doc.from_array.
    self.from_array(attrs[:, 2:],
                    [TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE])
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
""" """