From 3243ddac8f699a69ce2e4e39ae80c62cfd30ad12 Mon Sep 17 00:00:00 2001 From: Yohei Tamura Date: Thu, 1 Oct 2020 21:01:52 +0900 Subject: [PATCH] Fix/span.sent (#6083) * add fail test * fix test * fix span.sent * Remove incorrect implicit check Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_span.py | 20 +++++++++++++++++--- spacy/tokens/span.pyx | 5 ++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 107078df9..df41aedf5 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -174,19 +174,25 @@ def test_spans_by_character(doc): assert span1.end_char == span2.end_char assert span2.label_ == "GPE" - span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict") + span2 = doc.char_span( + span1.start_char, span1.end_char, label="GPE", alignment_mode="strict" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" # alignment mode "contract" - span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract") + span2 = doc.char_span( + span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" # alignment mode "expand" - span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand") + span2 = doc.char_span( + span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" @@ -318,3 +324,11 @@ def test_span_boundaries(doc): _ = span[-5] with pytest.raises(IndexError): _ = span[5] + + +def test_sent(en_tokenizer): + doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.") + span = doc[1:3] + assert not span.doc.is_sentenced + with pytest.raises(ValueError): + span.sent diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 29b87fa8d..cf0775bae 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -391,8 +391,6 @@ cdef class Span: """RETURNS (Span): The sentence span that the span is a part of.""" if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) - # This should raise if not parsed / no custom sentence boundaries - self.doc.sents # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 if self.doc.is_sentenced: @@ -402,13 +400,14 @@ cdef class Span: start += -1 # Find end of the sentence end = self.end - n = 0 while end < self.doc.length and self.doc.c[end].sent_start != 1: end += 1 n += 1 if n >= self.doc.length: break return self.doc[start:end] + else: + raise ValueError(Errors.E030) @property def ents(self):