mirror of https://github.com/explosion/spaCy.git
Fix/span.sent (#6083)
* add fail test
* fix test
* fix span.sent
* Remove incorrect implicit check

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
4cbb954281
commit
3243ddac8f
|
@ -174,19 +174,25 @@ def test_spans_by_character(doc):
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
|
span2 = doc.char_span(
|
||||||
|
span1.start_char, span1.end_char, label="GPE", alignment_mode="strict"
|
||||||
|
)
|
||||||
assert span1.start_char == span2.start_char
|
assert span1.start_char == span2.start_char
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
# alignment mode "contract"
|
# alignment mode "contract"
|
||||||
span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
|
span2 = doc.char_span(
|
||||||
|
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||||
|
)
|
||||||
assert span1.start_char == span2.start_char
|
assert span1.start_char == span2.start_char
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
# alignment mode "expand"
|
# alignment mode "expand"
|
||||||
span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
|
span2 = doc.char_span(
|
||||||
|
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand"
|
||||||
|
)
|
||||||
assert span1.start_char == span2.start_char
|
assert span1.start_char == span2.start_char
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
@ -318,3 +324,11 @@ def test_span_boundaries(doc):
|
||||||
_ = span[-5]
|
_ = span[-5]
|
||||||
with pytest.raises(IndexError):
|
with pytest.raises(IndexError):
|
||||||
_ = span[5]
|
_ = span[5]
|
||||||
|
|
||||||
|
|
||||||
|
def test_sent(en_tokenizer):
|
||||||
|
doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
|
||||||
|
span = doc[1:3]
|
||||||
|
assert not span.doc.is_sentenced
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
span.sent
|
||||||
|
|
|
@ -391,8 +391,6 @@ cdef class Span:
|
||||||
"""RETURNS (Span): The sentence span that the span is a part of."""
|
"""RETURNS (Span): The sentence span that the span is a part of."""
|
||||||
if "sent" in self.doc.user_span_hooks:
|
if "sent" in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks["sent"](self)
|
return self.doc.user_span_hooks["sent"](self)
|
||||||
# This should raise if not parsed / no custom sentence boundaries
|
|
||||||
self.doc.sents
|
|
||||||
# Use `sent_start` token attribute to find sentence boundaries
|
# Use `sent_start` token attribute to find sentence boundaries
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
if self.doc.is_sentenced:
|
if self.doc.is_sentenced:
|
||||||
|
@ -402,13 +400,14 @@ cdef class Span:
|
||||||
start += -1
|
start += -1
|
||||||
# Find end of the sentence
|
# Find end of the sentence
|
||||||
end = self.end
|
end = self.end
|
||||||
n = 0
|
|
||||||
while end < self.doc.length and self.doc.c[end].sent_start != 1:
|
while end < self.doc.length and self.doc.c[end].sent_start != 1:
|
||||||
end += 1
|
end += 1
|
||||||
n += 1
|
n += 1
|
||||||
if n >= self.doc.length:
|
if n >= self.doc.length:
|
||||||
break
|
break
|
||||||
return self.doc[start:end]
|
return self.doc[start:end]
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E030)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ents(self):
|
def ents(self):
|
||||||
|
|
Loading…
Reference in New Issue