mirror of https://github.com/explosion/spaCy.git
fix sent_start in serialization
This commit is contained in:
parent
45d62561f7
commit
515e25910e
|
@ -701,9 +701,12 @@ cdef class Doc:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
if array[i, col] != 0:
|
if array[i, col] != 0:
|
||||||
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
|
||||||
set_children_from_heads(self.c, self.length)
|
# set flags
|
||||||
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
|
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
|
||||||
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
self.is_tagged = bool(TAG in attrs or POS in attrs)
|
||||||
|
# if document is parsed, set children
|
||||||
|
if self.is_parsed:
|
||||||
|
set_children_from_heads(self.c, self.length)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get_lca_matrix(self):
|
def get_lca_matrix(self):
|
||||||
|
@ -779,7 +782,16 @@ cdef class Doc:
|
||||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||||
all annotations.
|
all annotations.
|
||||||
"""
|
"""
|
||||||
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
|
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
|
||||||
|
|
||||||
|
if self.is_tagged:
|
||||||
|
array_head.append(TAG)
|
||||||
|
# if doc parsed add head and dep attribute
|
||||||
|
if self.is_parsed:
|
||||||
|
array_head.extend([HEAD, DEP])
|
||||||
|
# otherwise add sent_start
|
||||||
|
else:
|
||||||
|
array_head.append(SENT_START)
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
|
|
@ -48,6 +48,8 @@ cdef class Token:
|
||||||
return token.ent_iob
|
return token.ent_iob
|
||||||
elif feat_name == ENT_TYPE:
|
elif feat_name == ENT_TYPE:
|
||||||
return token.ent_type
|
return token.ent_type
|
||||||
|
elif feat_name == SENT_START:
|
||||||
|
return token.sent_start
|
||||||
else:
|
else:
|
||||||
return Lexeme.get_struct_attr(token.lex, feat_name)
|
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||||
|
|
||||||
|
@ -70,3 +72,5 @@ cdef class Token:
|
||||||
token.ent_iob = value
|
token.ent_iob = value
|
||||||
elif feat_name == ENT_TYPE:
|
elif feat_name == ENT_TYPE:
|
||||||
token.ent_type = value
|
token.ent_type = value
|
||||||
|
elif feat_name == SENT_START:
|
||||||
|
token.sent_start = value
|
||||||
|
|
Loading…
Reference in New Issue