Token sent attributes more consistent (#10164)

* remove duplicate line
* add sent start/end token attributes to the docs
* let has_annotation work with IS_SENT_END
* elif instead of if
* add has_annotation test for sent attributes
* fix typo
* remove duplicate is_sent_start entry in docs
2022-02-08 08:35:37 +01:00 · 2022-02-08 08:35:37 +01:00 · deb143fa70
parent 836f689cc7
commit deb143fa70
6 changed files with 27 additions and 21 deletions
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -310,7 +310,6 @@ GLOSSARY = {
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
-    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)

    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
        assert doc.has_annotation(attr, require_complete=True)


+def test_has_annotation_sents(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
+    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    # The first token (index 0) is always assumed to be a sentence start,
+    # and ignored by the check in doc.has_annotation
+
+    doc[1].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    doc[2].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)
+
+
 def test_is_flags_deprecated(en_tokenizer):
    doc = en_tokenizer("test")
    with pytest.deprecated_call():
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -420,6 +420,8 @@ cdef class Doc:
        cdef int range_start = 0
        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
            attr = SENT_START
+        elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
+            attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -487,8 +487,6 @@ cdef class Token:

        RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.
-
-        DOCS: https://spacy.io/api/token#is_sent_start
        """
        def __get__(self):
            if self.c.sent_start == 0:
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.

 ## Doc.has_annotation {#has_annotation tag="method"}

-Check whether the doc contains annotation on a token attribute.
+Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).

 <Infobox title="Changed in v3.0" variant="warning">

--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
 | ---------- | ------------------------------------------------------------------------------------ |
 | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |

-## Token.is_sent_start {#is_sent_start tag="property" new="2"}
-
-A boolean value indicating whether the token starts a sentence. `None` if
-unknown. Defaults to `True` for the first token in the `Doc`.
-
-> #### Example
->
-> ```python
-> doc = nlp("Give it back! He pleaded.")
-> assert doc[4].is_sent_start
-> assert not doc[5].is_sent_start
-> ```
-
-| Name        | Description                                             |
-| ----------- | ------------------------------------------------------- |
-| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
-
 ## Token.has_vector {#has_vector tag="property" model="vectors"}

 A boolean value indicating whether a word vector is associated with the token.
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
 | `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                   |
 | `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                          |
 | `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                         |
+| `is_sent_start`                              | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`.                                                                                                                                                 |
+| `is_sent_end`                                | Does the token end a sentence? ~~bool~~ or `None` if unknown.                                                                                                                                                                                                        |
 | `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                      |
 | `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                     |
 | `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                              |