mirror of https://github.com/explosion/spaCy.git

commit cf0e320f2b
parent 1e5aeb4eec

Add doc.is_sentenced attribute, re #1959
spacy/tokens/doc.pyx

@@ -186,6 +186,20 @@ cdef class Doc:
     def _(self):
         return Underscore(Underscore.doc_extensions, self)
 
+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries,
+        # i.e at least one tok has the sent_start in (-1, 1)
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
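A rough usage sketch of the new property (not part of the commit; assumes the spaCy v2.0-era API, where the built-in 'sentencizer' component supplies sentence boundaries for a pipeline without a parser):

    >>> import spacy
    >>> nlp = spacy.blank('en')                         # blank pipeline, no parser
    >>> nlp.add_pipe(nlp.create_pipe('sentencizer'))
    >>> doc = nlp(u"This is one sentence. This is another.")
    >>> assert doc.is_sentenced                         # boundaries available without a parse
    >>> assert len(list(doc.sents)) == 2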
@@ -515,29 +529,23 @@ cdef class Doc:
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
+            if not self.is_sentenced:
+                raise ValueError(
+                    "Sentence boundaries unset. You can add the 'sentencizer' "
+                    "component to the pipeline with: "
+                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+                    "Alternatively, add the dependency parser, or set "
+                    "sentence boundaries by setting doc[i].sent_start")
            if 'sents' in self.user_hooks:
                yield from self.user_hooks['sents'](self)
-                return
-
-            cdef int i
-            if not self.is_parsed:
-                for i in range(1, self.length):
-                    if self.c[i].sent_start != 0:
-                        break
-                else:
-                    raise ValueError(
-                        "Sentence boundaries unset. You can add the 'sentencizer' "
-                        "component to the pipeline with: "
-                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                        "Alternatively, add the dependency parser, or set "
-                        "sentence boundaries by setting doc[i].sent_start")
-            start = 0
-            for i in range(1, self.length):
-                if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
-                    start = i
-            if start != self.length:
-                yield Span(self, start, self.length)
+            else:
+                start = 0
+                for i in range(1, self.length):
+                    if self.c[i].sent_start == 1:
+                        yield Span(self, start, i)
+                        start = i
+                if start != self.length:
+                    yield Span(self, start, self.length)
 
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
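For comparison, a minimal sketch (illustrative only, not from the commit) of the manually-set-boundary path the rewritten __get__ iterates over; it assumes a plain Doc with no parse, where sent_start on all other tokens is left at its unset value of 0:

    >>> from spacy.vocab import Vocab
    >>> from spacy.tokens import Doc
    >>> doc = Doc(Vocab(), words=[u'Hello', u'world', u'.', u'Bye', u'now', u'.'])
    >>> doc[3].is_sent_start = True          # sets c[3].sent_start = 1
    >>> assert doc.is_sentenced              # at least one token has sent_start == 1
    >>> assert len(list(doc.sents)) == 2     # "Hello world ." and "Bye now ."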
spacy/tokens/span.pyx

@@ -300,17 +300,7 @@ cdef class Span:
                    if n >= self.doc.length:
                        raise RuntimeError
                return self.doc[root.l_edge:root.r_edge + 1]
-            else:
-                # Check if the document has sentence boundaries,
-                # i.e at least one tok has the sent_start == 1
-                for i in range(self.doc.length):
-                    if self.doc.c[i].sent_start == 1:
-                        break
-                else:
-                    raise ValueError(
-                        "Access to sentence requires either the dependency parse "
-                        "or sentence boundaries to be set by setting " +
-                        "doc[i].is_sent_start = True")
+            elif self.doc.is_sentenced:
                # find start of the sentence
                start = self.start
                while self.doc.c[start].sent_start != 1 and start > 0:
@@ -323,7 +313,11 @@ cdef class Span:
                        break
                #
                return self.doc[start:end]
+            else:
+                raise ValueError(
+                    "Access to sentence requires either the dependency parse "
+                    "or sentence boundaries to be set by setting " +
+                    "doc[i].is_sent_start = True")
 
    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
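And a sketch of how the changed Span.sent path is expected to behave once boundaries exist (again illustrative, not part of the commit; assumes a blank English pipeline so the dependency-parse branch is skipped and the new elif branch is taken):

    >>> import spacy
    >>> nlp = spacy.blank('en')
    >>> doc = nlp(u"Sentence one. Sentence two.")
    >>> doc[3].is_sent_start = True          # mark the second sentence manually
    >>> span = doc[4:6]                      # "two."
    >>> assert span.sent.text == u"Sentence two."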