Add SENT_START attribute, for custom sentence boundary detection

This commit is contained in:
Matthew Honnibal 2016-05-05 12:11:57 +02:00
parent fb0ff0272f
commit d68dd1f251
5 changed files with 20 additions and 0 deletions

View File

@ -83,6 +83,7 @@ cpdef enum attr_id_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -85,6 +85,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,

View File

@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,

View File

@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..attrs cimport SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..syntax.iterators import CHUNKERS
from ..util import normalize_slice
@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
return token.dep
elif feat_name == HEAD:
return token.head
elif feat_name == SENT_START:
return token.sent_start
elif feat_name == SPACY:
return token.spacy
elif feat_name == ENT_IOB:
@ -559,6 +562,7 @@ cdef class Doc:
for i in range(self.length):
self.c[i] = parsed[i]
<<<<<<< HEAD
def from_array(self, attrs, int[:, :] array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.
@ -567,6 +571,18 @@ cdef class Doc:
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
"""
=======
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
>>>>>>> 45ad8684... * Add SENT_START attribute
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c