From cf0e320f2b99c98dea72961b23c100e57dd3ac12 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 18 Feb 2018 14:16:55 +0100
Subject: [PATCH] Add doc.is_sentenced attribute, re #1959

---
Reviewer notes (this region, between the three-dash line and the first
"diff --git", is not part of the commit message or of the change):

* Doc.is_sentenced (new read-only property) returns True when any of the
  following holds, and False otherwise:
    - a 'sents' entry is present in doc.user_hooks;
    - doc.is_parsed is true;
    - some token i in range(doc.length) has sent_start == -1 or == 1.
  The False result comes from the for/else branch, i.e. it is reached only
  when the loop finishes without returning.

* Doc.sents now performs the "Sentence boundaries unset" check up front via
  is_sentenced and raises ValueError before yielding anything. The old
  inline scan (tokens 1..length-1 with sent_start != 0, only when the doc
  was not parsed) is removed, as are the `cdef int i` declaration and the
  early `return` after the user hook; the hook and the sent_start == 1
  splitting are now the two arms of an if/else.

* Span.sent: the fallback branch is now `elif self.doc.is_sentenced`, and
  the ValueError moves to a trailing `else`. The removed inline check only
  looked for sent_start == 1; the shared property additionally accepts
  sent_start == -1, a 'sents' user hook, or is_parsed.

* NOTE(review): with doc.length == 0 (and no hook / parse flag) the loop in
  is_sentenced does not execute, so the property is False and doc.sents
  raises ValueError -- confirm this is intended for empty documents.

* NOTE(review): line breaks and indentation of the hunks below were restored
  from a whitespace-collapsed copy of this patch. Line counts agree with the
  @@ headers and the diffstat, but leading whitespace (and the removed
  "Sentence boundaries unset. " line, which was split in that copy) should
  be verified against the blobs named in the index lines before applying.

 spacy/tokens/doc.pyx  | 48 +++++++++++++++++++++++++------------------
 spacy/tokens/span.pyx | 18 ++++++----------
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index daab22434..b06c7433c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -186,6 +186,20 @@ cdef class Doc:
     def _(self):
         return Underscore(Underscore.doc_extensions, self)
 
+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries,
+        # i.e at least one tok has the sent_start in (-1, 1)
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
 
@@ -515,29 +529,23 @@ cdef class Doc:
             >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
         """
         def __get__(self):
+            if not self.is_sentenced:
+                raise ValueError(
+                    "Sentence boundaries unset. You can add the 'sentencizer' "
+                    "component to the pipeline with: "
+                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+                    "Alternatively, add the dependency parser, or set "
+                    "sentence boundaries by setting doc[i].sent_start")
             if 'sents' in self.user_hooks:
                 yield from self.user_hooks['sents'](self)
-                return
-
-            cdef int i
-            if not self.is_parsed:
+            else:
+                start = 0
                 for i in range(1, self.length):
-                    if self.c[i].sent_start != 0:
-                        break
-                else:
-                    raise ValueError(
-                        "Sentence boundaries unset. You can add the 'sentencizer' "
-                        "component to the pipeline with: "
-                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                        "Alternatively, add the dependency parser, or set "
-                        "sentence boundaries by setting doc[i].sent_start")
-            start = 0
-            for i in range(1, self.length):
-                if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
-                    start = i
-            if start != self.length:
-                yield Span(self, start, self.length)
+                    if self.c[i].sent_start == 1:
+                        yield Span(self, start, i)
+                        start = i
+                if start != self.length:
+                    yield Span(self, start, self.length)
 
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == 0:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index cc4b0a26a..f794e1d3f 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -300,17 +300,7 @@ cdef class Span:
                     if n >= self.doc.length:
                         raise RuntimeError
                 return self.doc[root.l_edge:root.r_edge + 1]
-            else:
-                # Check if the document has sentence boundaries,
-                # i.e at least one tok has the sent_start == 1
-                for i in range(self.doc.length):
-                    if self.doc.c[i].sent_start == 1:
-                        break
-                else:
-                    raise ValueError(
-                        "Access to sentence requires either the dependency parse "
-                        "or sentence boundaries to be set by setting " +
-                        "doc[i].is_sent_start = True")
+            elif self.doc.is_sentenced:
                 # find start of the sentence
                 start = self.start
                 while self.doc.c[start].sent_start != 1 and start > 0:
@@ -323,7 +313,11 @@ cdef class Span:
                         break
                 #
                 return self.doc[start:end]
-
+            else:
+                raise ValueError(
+                    "Access to sentence requires either the dependency parse "
+                    "or sentence boundaries to be set by setting " +
+                    "doc[i].is_sent_start = True")
 
     property has_vector:
         """RETURNS (bool): Whether a word vector is associated with the object.