Add SENT_START attribute, for custom sentence boundary detection

This commit is contained in:
Matthew Honnibal 2016-05-05 12:11:57 +02:00
parent fb0ff0272f
commit d68dd1f251
5 changed files with 20 additions and 0 deletions

View File

@ -83,6 +83,7 @@ cpdef enum attr_id_t:
ENT_IOB ENT_IOB
ENT_TYPE ENT_TYPE
HEAD HEAD
SENT_START
SPACY SPACY
PROB PROB

View File

@ -85,6 +85,7 @@ IDS = {
"ENT_IOB": ENT_IOB, "ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE, "ENT_TYPE": ENT_TYPE,
"HEAD": HEAD, "HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY, "SPACY": SPACY,
"PROB": PROB, "PROB": PROB,
"LANG": LANG, "LANG": LANG,

View File

@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB ENT_IOB
ENT_TYPE ENT_TYPE
HEAD HEAD
SENT_START
SPACY SPACY
PROB PROB

View File

@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB, "ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE, "ENT_TYPE": ENT_TYPE,
"HEAD": HEAD, "HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY, "SPACY": SPACY,
"PROB": PROB, "PROB": PROB,

View File

@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..attrs cimport SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..syntax.iterators import CHUNKERS from ..syntax.iterators import CHUNKERS
from ..util import normalize_slice from ..util import normalize_slice
@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
return token.dep return token.dep
elif feat_name == HEAD: elif feat_name == HEAD:
return token.head return token.head
elif feat_name == SENT_START:
return token.sent_start
elif feat_name == SPACY: elif feat_name == SPACY:
return token.spacy return token.spacy
elif feat_name == ENT_IOB: elif feat_name == ENT_IOB:
@ -559,6 +562,7 @@ cdef class Doc:
for i in range(self.length): for i in range(self.length):
self.c[i] = parsed[i] self.c[i] = parsed[i]
<<<<<<< HEAD
def from_array(self, attrs, int[:, :] array): def from_array(self, attrs, int[:, :] array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an """Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes. `(M, N)` array of attributes.
@ -567,6 +571,18 @@ cdef class Doc:
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load. array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
RETURNS (Doc): Itself. RETURNS (Doc): Itself.
""" """
=======
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
>>>>>>> 45ad8684... * Add SENT_START attribute
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
cdef TokenC* tokens = self.c cdef TokenC* tokens = self.c