fix sent_start in serialization

This commit is contained in:
Thomas Opsomer 2018-01-28 19:50:42 +01:00
parent 45d62561f7
commit 515e25910e
2 changed files with 18 additions and 2 deletions

View File

@@ -701,9 +701,12 @@ cdef class Doc:
for i in range(length):
if array[i, col] != 0:
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
set_children_from_heads(self.c, self.length)
# set flags
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
self.is_tagged = bool(TAG in attrs or POS in attrs)
# if document is parsed, set children
if self.is_parsed:
set_children_from_heads(self.c, self.length)
return self
def get_lca_matrix(self):
@@ -779,7 +782,16 @@ cdef class Doc:
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
if self.is_tagged:
array_head.append(TAG)
# if doc parsed add head and dep attribute
if self.is_parsed:
array_head.extend([HEAD, DEP])
# otherwise add sent_start
else:
array_head.append(SENT_START)
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope

View File

@@ -48,6 +48,8 @@ cdef class Token:
return token.ent_iob
elif feat_name == ENT_TYPE:
return token.ent_type
elif feat_name == SENT_START:
return token.sent_start
else:
return Lexeme.get_struct_attr(token.lex, feat_name)
@@ -70,3 +72,5 @@ cdef class Token:
token.ent_iob = value
elif feat_name == ENT_TYPE:
token.ent_type = value
elif feat_name == SENT_START:
token.sent_start = value