mirror of https://github.com/explosion/spaCy.git
* draft de/serialization functions in doc.pyx
This commit is contained in:
parent 9d956b07e9
commit 0e07c1ed2a
|
@@ -278,7 +278,7 @@ cdef class Doc:
|
||||||
self.data[i].lex = &EMPTY_LEXEME
|
self.data[i].lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
cdef int set_parse(self, const TokenC* parsed) except -1:
|
cdef int set_parse(self, const TokenC* parsed) except -1:
|
||||||
# TODO: This method is fairly misleading atm. It's used by GreedyParser
|
# TODO: This method is fairly misleading atm. It's used by Parser
|
||||||
# to actually apply the parse calculated. Need to rethink this.
|
# to actually apply the parse calculated. Need to rethink this.
|
||||||
self.is_parsed = True
|
self.is_parsed = True
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
@@ -369,40 +369,40 @@ cdef class Doc:
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
def serialize(self, bits=None):
|
def serialize(self, codecs, bits=None):
|
||||||
if bits is None:
|
if bits is None:
|
||||||
bits = BitArray()
|
bits = BitArray()
|
||||||
codec = self.vocab.codec
|
array = self.to_array([codec.attr_id for codec in codecs])
|
||||||
ids = numpy.zeros(shape=(len(self),), dtype=numpy.uint32)
|
for i, codec in enumerate(codecs):
|
||||||
cdef int i
|
codec.encode(array[i,], bits)
|
||||||
for i in range(self.length):
|
|
||||||
ids[i] = self.data[i].lex.id
|
|
||||||
bits = codec.encode(ids, bits=bits)
|
|
||||||
for i in range(self.length):
|
|
||||||
bits.append(self.data[i].spacy)
|
|
||||||
return bits
|
return bits
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def deserialize(Vocab vocab, bits):
|
def deserialize(Vocab vocab, bits):
|
||||||
biterator = iter(bits)
|
biterator = iter(bits)
|
||||||
ids = vocab.codec.decode(biterator)
|
ids = vocab.lex_codec.decode(bits)
|
||||||
spaces = []
|
|
||||||
for bit in biterator:
|
|
||||||
spaces.append(bit)
|
|
||||||
if len(spaces) == len(ids):
|
|
||||||
break
|
|
||||||
string = u''
|
|
||||||
cdef const LexemeC* lex
|
|
||||||
for id_, space in zip(ids, spaces):
|
|
||||||
lex = vocab.lexemes[id_]
|
|
||||||
string += vocab.strings[lex.orth]
|
|
||||||
if space:
|
|
||||||
string += u' '
|
|
||||||
cdef Doc doc = Doc(vocab)
|
cdef Doc doc = Doc(vocab)
|
||||||
cdef bint has_space = False
|
cdef int id_
|
||||||
cdef int idx = 0
|
for id_ in ids:
|
||||||
for i, id_ in enumerate(ids):
|
is_spacy = biterator.next()
|
||||||
lex = vocab.lexemes[id_]
|
doc.push_back(vocab.lexemes.at(id_), is_spacy)
|
||||||
has_space = spaces[i]
|
|
||||||
doc.push_back(lex, has_space)
|
cdef int i
|
||||||
|
for codec in vocab.annotation_codecs:
|
||||||
|
values = codec.decode(biterator)
|
||||||
|
if codec.attr_id == HEAD:
|
||||||
|
for i, head in enumerate(values):
|
||||||
|
doc.data[i].head = head
|
||||||
|
elif codec.attr_id == TAG:
|
||||||
|
for i, tag in enumerate(values):
|
||||||
|
doc.data[i].tag = tag
|
||||||
|
elif codec.attr_id == DEP:
|
||||||
|
for i, dep in enumerate(values):
|
||||||
|
doc.data[i].dep = dep
|
||||||
|
elif codec.attr_id == ENT_IOB:
|
||||||
|
for i, ent_iob in enumerate(values):
|
||||||
|
doc.data[i].ent_iob = ent_iob
|
||||||
|
elif codec.attr_id == ENT_TYPE:
|
||||||
|
for i, ent_type in enumerate(values):
|
||||||
|
doc.data[i].ent_type = ent_type
|
||||||
return doc
|
return doc
|
||||||
|
|
Loading…
Reference in New Issue