mirror of https://github.com/explosion/spaCy.git
Token sent attributes more consistent (#10164)
* remove duplicate line
* add sent start/end token attributes to the docs
* let has_annotation work with IS_SENT_END
* elif instead of if
* add has_annotation test for sent attributes
* fix typo
* remove duplicate is_sent_start entry in docs
This commit is contained in:
parent
836f689cc7
commit
deb143fa70
|
@ -310,7 +310,6 @@ GLOSSARY = {
|
|||
"re": "repeated element",
|
||||
"rs": "reported speech",
|
||||
"sb": "subject",
|
||||
"sb": "subject",
|
||||
"sbp": "passivized subject (PP)",
|
||||
"sp": "subject or predicate",
|
||||
"svp": "separable verb prefix",
|
||||
|
|
|
@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
|
|||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
|
||||
for attr in attrs:
|
||||
assert not doc.has_annotation(attr)
|
||||
assert not doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
doc[0].tag_ = "A"
|
||||
doc[0].pos_ = "X"
|
||||
|
@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
|
|||
assert doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
|
||||
def test_has_annotation_sents(en_vocab):
    """Exercise Doc.has_annotation for the sentence-boundary attributes.

    The same expectations are checked for "SENT_START", "IS_SENT_START" and
    "IS_SENT_END" at three stages: before any sentence start is set, after
    setting it on token 1 only, and after setting it on tokens 1 and 2.
    """
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    sent_attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")

    # Stage 1: nothing has been set, so neither check may pass.
    for name in sent_attrs:
        assert not doc.has_annotation(name)
        assert not doc.has_annotation(name, require_complete=True)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation

    # Stage 2: only token 1 is set -> partially annotated, not complete.
    doc[1].is_sent_start = False
    for name in sent_attrs:
        assert doc.has_annotation(name)
        assert not doc.has_annotation(name, require_complete=True)

    # Stage 3: token 2 is set as well -> both checks pass.
    doc[2].is_sent_start = False
    for name in sent_attrs:
        assert doc.has_annotation(name)
        assert doc.has_annotation(name, require_complete=True)
|
||||
|
||||
|
||||
def test_is_flags_deprecated(en_tokenizer):
|
||||
doc = en_tokenizer("test")
|
||||
with pytest.deprecated_call():
|
||||
|
|
|
@ -420,6 +420,8 @@ cdef class Doc:
|
|||
cdef int range_start = 0
|
||||
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
|
||||
attr = SENT_START
|
||||
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
|
||||
attr = SENT_START
|
||||
attr = intify_attr(attr)
|
||||
# adjust attributes
|
||||
if attr == HEAD:
|
||||
|
|
|
@ -487,8 +487,6 @@ cdef class Token:
|
|||
|
||||
RETURNS (bool / None): Whether the token starts a sentence.
|
||||
None if unknown.
|
||||
|
||||
DOCS: https://spacy.io/api/token#is_sent_start
|
||||
"""
|
||||
def __get__(self):
|
||||
if self.c.sent_start == 0:
|
||||
|
|
|
@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
|||
|
||||
## Doc.has_annotation {#has_annotation tag="method"}
|
||||
|
||||
Check whether the doc contains annotation on a token attribute.
|
||||
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
|
|
|
@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
|
|||
| ---------- | ------------------------------------------------------------------------------------ |
|
||||
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
|
||||
|
||||
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
||||
|
||||
A boolean value indicating whether the token starts a sentence. `None` if
|
||||
unknown. Defaults to `True` for the first token in the `Doc`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Give it back! He pleaded.")
|
||||
> assert doc[4].is_sent_start
|
||||
> assert not doc[5].is_sent_start
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
|
||||
|
||||
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
A boolean value indicating whether a word vector is associated with the token.
|
||||
|
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
|
|||
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
||||
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
||||
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
||||
| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. |
|
||||
| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. |
|
||||
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
||||
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
||||
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
||||
|
|
Loading…
Reference in New Issue