mirror of https://github.com/explosion/spaCy.git
Add SENT_START attribute, for custom sentence boundary detection
This commit is contained in:
parent
fb0ff0272f
commit
d68dd1f251
|
@ -83,6 +83,7 @@ cpdef enum attr_id_t:
|
||||||
ENT_IOB
|
ENT_IOB
|
||||||
ENT_TYPE
|
ENT_TYPE
|
||||||
HEAD
|
HEAD
|
||||||
|
SENT_START
|
||||||
SPACY
|
SPACY
|
||||||
PROB
|
PROB
|
||||||
|
|
||||||
|
|
|
@ -85,6 +85,7 @@ IDS = {
|
||||||
"ENT_IOB": ENT_IOB,
|
"ENT_IOB": ENT_IOB,
|
||||||
"ENT_TYPE": ENT_TYPE,
|
"ENT_TYPE": ENT_TYPE,
|
||||||
"HEAD": HEAD,
|
"HEAD": HEAD,
|
||||||
|
"SENT_START": SENT_START,
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"PROB": PROB,
|
"PROB": PROB,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
|
|
|
@ -82,6 +82,7 @@ cpdef enum symbol_t:
|
||||||
ENT_IOB
|
ENT_IOB
|
||||||
ENT_TYPE
|
ENT_TYPE
|
||||||
HEAD
|
HEAD
|
||||||
|
SENT_START
|
||||||
SPACY
|
SPACY
|
||||||
PROB
|
PROB
|
||||||
|
|
||||||
|
|
|
@ -84,6 +84,7 @@ IDS = {
|
||||||
"ENT_IOB": ENT_IOB,
|
"ENT_IOB": ENT_IOB,
|
||||||
"ENT_TYPE": ENT_TYPE,
|
"ENT_TYPE": ENT_TYPE,
|
||||||
"HEAD": HEAD,
|
"HEAD": HEAD,
|
||||||
|
"SENT_START": SENT_START,
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"PROB": PROB,
|
"PROB": PROB,
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
|
from ..attrs cimport SENT_START
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||||
from ..syntax.iterators import CHUNKERS
|
from ..syntax.iterators import CHUNKERS
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
|
@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||||
return token.dep
|
return token.dep
|
||||||
elif feat_name == HEAD:
|
elif feat_name == HEAD:
|
||||||
return token.head
|
return token.head
|
||||||
|
elif feat_name == SENT_START:
|
||||||
|
return token.sent_start
|
||||||
elif feat_name == SPACY:
|
elif feat_name == SPACY:
|
||||||
return token.spacy
|
return token.spacy
|
||||||
elif feat_name == ENT_IOB:
|
elif feat_name == ENT_IOB:
|
||||||
|
@ -559,6 +562,7 @@ cdef class Doc:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
self.c[i] = parsed[i]
|
self.c[i] = parsed[i]
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
def from_array(self, attrs, int[:, :] array):
|
def from_array(self, attrs, int[:, :] array):
|
||||||
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
||||||
`(M, N)` array of attributes.
|
`(M, N)` array of attributes.
|
||||||
|
@ -567,6 +571,18 @@ cdef class Doc:
|
||||||
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
|
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
|
||||||
RETURNS (Doc): Itself.
|
RETURNS (Doc): Itself.
|
||||||
"""
|
"""
|
||||||
|
=======
|
||||||
|
def from_array(self, attrs, array):
|
||||||
|
if SENT_START in attrs and HEAD in attrs:
|
||||||
|
raise ValueError(
|
||||||
|
"Conflicting attributes specified in doc.from_array():\n"
|
||||||
|
"(HEAD, SENT_START)\n"
|
||||||
|
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
|
||||||
|
"based on the tree structure. This means the HEAD attribute would "
|
||||||
|
"potentially override the sentence boundaries set by SENT_START.\n"
|
||||||
|
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
|
||||||
|
"workarounds, and to propose solutions.")
|
||||||
|
>>>>>>> 45ad8684... * Add SENT_START attribute
|
||||||
cdef int i, col
|
cdef int i, col
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
cdef TokenC* tokens = self.c
|
||||||
|
|
Loading…
Reference in New Issue