Use Token.sent_start for Span.sent (#5439)

Use `Token.sent_start` for sentence boundaries in `Span.sent` so that
`Doc.sents` and `Span.sent` return the same sentence boundaries.
This commit is contained in:
adrianeboyd 2020-05-14 18:22:51 +02:00 committed by GitHub
parent 780b869345
commit e63880e081
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 2 additions and 12 deletions

View File

@@ -389,19 +389,9 @@ cdef class Span:
return self.doc.user_span_hooks["sent"](self)
# This should raise if not parsed / no custom sentence boundaries
self.doc.sents
# If doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
# Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError(Errors.E038)
return self.doc[root.l_edge:root.r_edge + 1]
elif self.doc.is_sentenced:
if self.doc.is_sentenced:
# Find start of the sentence
start = self.start
while self.doc.c[start].sent_start != 1 and start > 0: