Add doc.is_sentenced attribute, re #1959

This commit is contained in:
Matthew Honnibal 2018-02-18 14:16:55 +01:00
parent 1e5aeb4eec
commit cf0e320f2b
2 changed files with 34 additions and 32 deletions

View File

@ -186,6 +186,20 @@ cdef class Doc:
def _(self): def _(self):
return Underscore(Underscore.doc_extensions, self) return Underscore(Underscore.doc_extensions, self)
@property
def is_sentenced(self):
    """Report whether sentence boundaries are available for this document.

    RETURNS (bool): True if a 'sents' user hook is registered, if the
        document is parsed, or if at least one token has `sent_start`
        equal to -1 or 1. False otherwise, which includes an empty,
        unparsed document with no 'sents' hook.
    """
    # A custom 'sents' hook or an existing parse is sufficient on its own,
    # so the per-token scan below is skipped in those cases.
    if 'sents' in self.user_hooks or self.is_parsed:
        return True
    # Otherwise scan the tokens for one whose sent_start is -1 or 1.
    for idx in range(self.length):
        if self.c[idx].sent_start in (-1, 1):
            return True
    return False
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
@ -515,29 +529,23 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"] >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self) yield from self.user_hooks['sents'](self)
return else:
start = 0
cdef int i
if not self.is_parsed:
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start != 0: if self.c[i].sent_start == 1:
break yield Span(self, start, i)
else: start = i
raise ValueError( if start != self.length:
"Sentence boundaries unset. You can add the 'sentencizer' " yield Span(self, start, self.length)
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
start = 0
for i in range(1, self.length):
if self.c[i].sent_start == 1:
yield Span(self, start, i)
start = i
if start != self.length:
yield Span(self, start, self.length)
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0: if self.length == 0:

View File

@ -300,17 +300,7 @@ cdef class Span:
if n >= self.doc.length: if n >= self.doc.length:
raise RuntimeError raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1] return self.doc[root.l_edge:root.r_edge + 1]
else: elif self.doc.is_sentenced:
# Check if the document has sentence boundaries,
# i.e at least one tok has the sent_start == 1
for i in range(self.doc.length):
if self.doc.c[i].sent_start == 1:
break
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
# find start of the sentence # find start of the sentence
start = self.start start = self.start
while self.doc.c[start].sent_start != 1 and start > 0: while self.doc.c[start].sent_start != 1 and start > 0:
@ -323,7 +313,11 @@ cdef class Span:
break break
# #
return self.doc[start:end] return self.doc[start:end]
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector: property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object. """RETURNS (bool): Whether a word vector is associated with the object.