diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 1c3c948c3..8eed2c267 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -4,9 +4,10 @@ from __future__ import unicode_literals
 
 import pytest
 import numpy
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.errors import ModelsWarning
+from spacy.attrs import ENT_TYPE, ENT_IOB
 
 from ..util import get_doc
 
@@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
     assert lca[1, 1] == 1
     assert lca[0, 1] == 2
     assert lca[1, 2] == 2
+
+
+def test_doc_is_nered(en_vocab):
+    words = ["I", "live", "in", "New", "York"]
+    doc = Doc(en_vocab, words=words)
+    assert not doc.is_nered
+    doc.ents = [Span(doc, 3, 5, label="GPE")]
+    assert doc.is_nered
+    # Test creating a doc from an array with unknown values
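+    # Columns follow the attrs list: col 0 = ENT_TYPE, col 1 = ENT_IOB
+    # (3 = B, 1 = I, 0 = no tag set); 384 stands in for an entity type ID.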
+    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
+    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
+    assert doc.is_nered
+    # Test serialization
+    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
+    assert new_doc.is_nered
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 1dfcd1687..ff38d825f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -240,8 +240,18 @@ cdef class Doc:
         for i in range(1, self.length):
             if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                 return True
-        else:
-            return False
+        return False
+
+    @property
+    def is_nered(self):
+        """Check if the document has named entities set. Will return True if
+        *any* of the tokens has a named entity tag set (even if the others are
+        unknown values).
+        """
+        for i in range(self.length):
+            if self.c[i].ent_iob != 0:
+                return True
+        return False
 
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
@@ -990,11 +1000,11 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_json
         """
         data = {"text": self.text}
-        if self.ents:
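+        # Unlike a truthiness check on self.ents, is_nered is also True
+        # when NER ran but found nothing, so "ents" may be an empty list.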
+        if self.is_nered:
             data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                              "label": ent.label_} for ent in self.ents]
-        sents = list(self.sents)
-        if sents:
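+        # Guard with is_sentenced before touching .sents: iterating
+        # doc.sents raises a ValueError when no boundaries are set.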
+        if self.is_sentenced:
+            sents = list(self.sents)
             data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                              for sent in sents]
         if self.cats:
@@ -1002,13 +1012,11 @@ cdef class Doc:
data["tokens"] = []
for token in self:
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
- if token.pos_:
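+            # Doc-level flags keep the token JSON uniform; the old
+            # per-token truthiness checks could drop keys for some tokens.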
+            if self.is_tagged:
                 token_data["pos"] = token.pos_
-            if token.tag_:
                 token_data["tag"] = token.tag_
-            if token.dep_:
+            if self.is_parsed:
                 token_data["dep"] = token.dep_
-            if token.head:
                 token_data["head"] = token.head.i
             data["tokens"].append(token_data)
         if underscore:
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 267d8f711..e53619cff 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -237,7 +237,7 @@ attribute ID.
 > from spacy.attrs import ORTH
 > doc = nlp(u"apple apple orange banana")
 > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
-> doc.to_array([attrs.ORTH])
+> doc.to_array([ORTH])
 > # array([[11880], [11880], [7561], [12800]])
 > ```
@@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
-| Name | Type | Description |
-| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text` | unicode | A unicode representation of the document text. |
-| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
-| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
-| `vocab` | `Vocab` | The store of lexical types. |
-| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
-| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to the whole document, or `(start_char, end_char, label)` to a score for categories applied to spans. `start_char` and `end_char` should be character offsets, the label can be either a string or an integer ID, and the score should be a float. |
-| `user_data` | - | A generic storage area, for user custom data. |
-| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
-| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
-| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
-| `sentiment` | float | The document's positivity/negativity score, if available. |
-| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
-| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
-| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
-| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
+| Name | Type | Description |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `text` | unicode | A unicode representation of the document text. |
+| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
+| `vocab` | `Vocab` | The store of lexical types. |
+| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
+| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to the whole document, or `(start_char, end_char, label)` to a score for categories applied to spans. `start_char` and `end_char` should be character offsets, the label can be either a string or an integer ID, and the score should be a float. |
+| `user_data` | - | A generic storage area, for user custom data. |
+| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
+| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
+| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
+| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
+| `sentiment` | float | The document's positivity/negativity score, if available. |
+| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
+| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
+| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
+| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
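+
+> #### Example
+>
+> ```python
+> # A minimal sketch mirroring the new test; assumes
+> # from spacy.tokens import Doc, Span and from spacy.vocab import Vocab.
+> doc = Doc(Vocab(), words=["I", "live", "in", "New", "York"])
+> assert not doc.is_nered
+> doc.ents = [Span(doc, 3, 5, label="GPE")]
+> assert doc.is_nered
+> ```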