additional information if doc is empty

This commit is contained in:
svlandeg 2020-03-09 18:08:18 +01:00
parent 1d6aec805d
commit 1724a4f75b
3 changed files with 13 additions and 6 deletions

View File

@ -5,7 +5,7 @@ import pytest
import re
from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token
from spacy.tokens import Doc, Token, Span
from ..doc.test_underscore import clean_underscore
@ -458,3 +458,10 @@ def test_matcher_callback(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
matches = matcher(doc)
mock.assert_called_once_with(matcher, doc, 0, matches)
def test_matcher_span(matcher):
    """Run the matcher on a Doc created from a Span of a longer Doc.

    A Span is taken from the start of the sentence, converted into a
    standalone Doc via ``as_doc()``, and the matcher is expected to report
    exactly one match on it.
    """
    words = "JavaScript is good but Java is better".split()
    full_doc = Doc(matcher.vocab, words=words)
    # Span with start index 0 and end index 3 of the full document.
    leading_span = Span(full_doc, 0, 3)
    found = matcher(leading_span.as_doc())
    assert len(found) == 1

View File

@ -260,7 +260,7 @@ cdef class Doc:
def is_nered(self):
"""Check if the document has named entities set. Will return True if
*any* of the tokens has a named entity tag set (even if the others are
unknown values).
unknown values), or if the document is empty.
"""
if len(self) == 0:
return True

View File

@ -657,10 +657,10 @@ The L2 norm of the document's vector representation.
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |