mirror of https://github.com/explosion/spaCy.git

commit cf0e320f2b
parent 1e5aeb4eec

Add doc.is_sentenced attribute, re #1959
spacy/tokens/doc.pyx

@@ -186,6 +186,20 @@ cdef class Doc:
     def _(self):
         return Underscore(Underscore.doc_extensions, self)
 
+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries,
+        # i.e at least one tok has the sent_start in (-1, 1)
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
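A rough usage sketch of the new property (not part of the commit; assumes the spaCy v2.0-era API, where the built-in 'sentencizer' component supplies sentence boundaries for a pipeline without a parser):

    >>> import spacy
    >>> nlp = spacy.blank('en')                         # blank pipeline, no parser
    >>> nlp.add_pipe(nlp.create_pipe('sentencizer'))
    >>> doc = nlp(u"This is one sentence. This is another.")
    >>> assert doc.is_sentenced                         # boundaries available without a parse
    >>> assert len(list(doc.sents)) == 2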
@@ -515,29 +529,23 @@ cdef class Doc:
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
+            if not self.is_sentenced:
+                raise ValueError(
+                    "Sentence boundaries unset. You can add the 'sentencizer' "
+                    "component to the pipeline with: "
+                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+                    "Alternatively, add the dependency parser, or set "
+                    "sentence boundaries by setting doc[i].sent_start")
            if 'sents' in self.user_hooks:
                yield from self.user_hooks['sents'](self)
-                return
-
-            cdef int i
-            if not self.is_parsed:
-                for i in range(1, self.length):
-                    if self.c[i].sent_start != 0:
-                        break
-                else:
-                    raise ValueError(
-                        "Sentence boundaries unset. You can add the 'sentencizer' "
-                        "component to the pipeline with: "
-                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                        "Alternatively, add the dependency parser, or set "
-                        "sentence boundaries by setting doc[i].sent_start")
-            start = 0
-            for i in range(1, self.length):
-                if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
-                    start = i
-            if start != self.length:
-                yield Span(self, start, self.length)
+            else:
+                start = 0
+                for i in range(1, self.length):
+                    if self.c[i].sent_start == 1:
+                        yield Span(self, start, i)
+                        start = i
+                if start != self.length:
+                    yield Span(self, start, self.length)
 
    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
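For comparison, a minimal sketch (illustrative only, not from the commit) of the manually-set-boundary path the rewritten __get__ iterates over; it assumes a plain Doc with no parse, where sent_start on all other tokens is left at its unset value of 0:

    >>> from spacy.vocab import Vocab
    >>> from spacy.tokens import Doc
    >>> doc = Doc(Vocab(), words=[u'Hello', u'world', u'.', u'Bye', u'now', u'.'])
    >>> doc[3].is_sent_start = True          # sets c[3].sent_start = 1
    >>> assert doc.is_sentenced              # at least one token has sent_start == 1
    >>> assert len(list(doc.sents)) == 2     # "Hello world ." and "Bye now ."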
spacy/tokens/span.pyx

@@ -300,17 +300,7 @@ cdef class Span:
                    if n >= self.doc.length:
                        raise RuntimeError
                return self.doc[root.l_edge:root.r_edge + 1]
-            else:
-                # Check if the document has sentence boundaries,
-                # i.e at least one tok has the sent_start == 1
-                for i in range(self.doc.length):
-                    if self.doc.c[i].sent_start == 1:
-                        break
-                else:
-                    raise ValueError(
-                        "Access to sentence requires either the dependency parse "
-                        "or sentence boundaries to be set by setting " +
-                        "doc[i].is_sent_start = True")
+            elif self.doc.is_sentenced:
                # find start of the sentence
                start = self.start
                while self.doc.c[start].sent_start != 1 and start > 0:
@@ -323,7 +313,11 @@ cdef class Span:
                        break
                #
                return self.doc[start:end]
+            else:
+                raise ValueError(
+                    "Access to sentence requires either the dependency parse "
+                    "or sentence boundaries to be set by setting " +
+                    "doc[i].is_sent_start = True")
 
    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
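And a sketch of how the changed Span.sent path is expected to behave once boundaries exist (again illustrative, not part of the commit; assumes a blank English pipeline so the dependency-parse branch is skipped and the new elif branch is taken):

    >>> import spacy
    >>> nlp = spacy.blank('en')
    >>> doc = nlp(u"Sentence one. Sentence two.")
    >>> doc[3].is_sent_start = True          # mark the second sentence manually
    >>> span = doc[4:6]                      # "two."
    >>> assert span.sent.text == u"Sentence two."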