diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 5873b23ac..a1550b1ef 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -9,6 +9,7 @@ cdef struct GoldParseC: int* tags int* heads int* has_dep + int* sent_start attr_t* labels int** brackets Transition* ner diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 096f265a9..f00d04109 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -426,6 +426,7 @@ cdef class GoldParse: self.c.heads = self.mem.alloc(len(doc), sizeof(int)) self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) + self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) self.cats = list(cats) @@ -482,6 +483,10 @@ cdef class GoldParse: """ return not nonproj.is_nonproj_tree(self.heads) + @property + def sent_starts(self): + return [self.c.sent_start[i] for i in range(self.length)] + def biluo_tags_from_offsets(doc, entities, missing='O'): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out