Add SENT_START attribute, for custom sentence boundary detection

2016-05-05 12:11:57 +02:00 · 2016-05-05 12:11:57 +02:00 · d68dd1f251
parent fb0ff0272f
commit d68dd1f251
5 changed files with 20 additions and 0 deletions
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -83,6 +83,7 @@ cpdef enum attr_id_t:
    ENT_IOB
    ENT_TYPE
    HEAD
+    SENT_START
    SPACY
    PROB

--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -85,6 +85,7 @@ IDS = {
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
+    "SENT_START": SENT_START,
    "SPACY": SPACY,
    "PROB": PROB,
    "LANG": LANG,
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -82,6 +82,7 @@ cpdef enum symbol_t:
    ENT_IOB
    ENT_TYPE
    HEAD
+    SENT_START
    SPACY
    PROB

--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -84,6 +84,7 @@ IDS = {
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
+    "SENT_START": SENT_START,
    "SPACY": SPACY,
    "PROB": PROB,

--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
+from ..attrs cimport SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..syntax.iterators import CHUNKERS
 from ..util import normalize_slice
@@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
        return token.dep
    elif feat_name == HEAD:
        return token.head
+    elif feat_name == SENT_START:
+        return token.sent_start
    elif feat_name == SPACY:
        return token.spacy
    elif feat_name == ENT_IOB:
@@ -559,6 +562,7 @@ cdef class Doc:
        for i in range(self.length):
            self.c[i] = parsed[i]

+<<<<<<< HEAD
    def from_array(self, attrs, int[:, :] array):
        """Load attributes from a numpy array. Write to a `Doc` object, from an
        `(M, N)` array of attributes.
@@ -567,6 +571,18 @@ cdef class Doc:
        array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
        RETURNS (Doc): Itself.
        """
+=======
+    def from_array(self, attrs, array):
+        if SENT_START in attrs and HEAD in attrs:
+            raise ValueError(
+                "Conflicting attributes specified in doc.from_array():\n"
+                "(HEAD, SENT_START)\n"
+                "The HEAD attribute currently sets sentence boundaries implicitly,\n"
+                "based on the tree structure. This means the HEAD attribute would "
+                "potentially override the sentence boundaries set by SENT_START.\n"
+                "See https://github.com/spacy-io/spaCy/issues/235 for details and "
+                "workarounds, and to propose solutions.")
+>>>>>>> 45ad8684... * Add SENT_START attribute
        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c