💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else * Add Doc.is_nered to indicate if entities have been set * Add properties in Doc.to_json if they were set, not if they're available This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
2019-03-10 15:24:34 +01:00 · 2019-03-10 15:24:34 +01:00 · 0426689db8
parent 7984543953
commit 0426689db8
3 changed files with 53 additions and 28 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -4,9 +4,10 @@ from __future__ import unicode_literals

 import pytest
 import numpy
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.errors import ModelsWarning
+from spacy.attrs import ENT_TYPE, ENT_IOB

 from ..util import get_doc

@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    assert lca[1, 1] == 1
    assert lca[0, 1] == 2
    assert lca[1, 2] == 2
+
+
+def test_doc_is_nered(en_vocab):
+    words = ["I", "live", "in", "New", "York"]
+    doc = Doc(en_vocab, words=words)
+    assert not doc.is_nered
+    doc.ents = [Span(doc, 3, 5, label="GPE")]
+    assert doc.is_nered
+    # Test creating doc from array with unknown values
+    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
+    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
+    assert doc.is_nered
+    # Test serialization
+    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
+    assert new_doc.is_nered
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -240,8 +240,18 @@ cdef class Doc:
        for i in range(1, self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
-        else:
-            return False
+        return False
+
+    @property
+    def is_nered(self):
+        """Check if the document has named entities set. Will return True if
+        *any* of the tokens has a named entity tag set (even if the others are
+        uknown values).
+        """
+        for i in range(self.length):
+            if self.c[i].ent_iob != 0:
+                return True
+        return False

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
@ -990,11 +1000,11 @@ cdef class Doc:
        DOCS: https://spacy.io/api/doc#to_json
        """
        data = {"text": self.text}
-        if self.ents:
+        if self.is_nered:
            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                            "label": ent.label_} for ent in self.ents]
-        sents = list(self.sents)
-        if sents:
+        if self.is_sentenced:
+            sents = list(self.sents)
            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                             for sent in sents]
        if self.cats:
@ -1002,13 +1012,11 @@ cdef class Doc:
        data["tokens"] = []
        for token in self:
            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-            if token.pos_:
+            if self.is_tagged:
                token_data["pos"] = token.pos_
-            if token.tag_:
                token_data["tag"] = token.tag_
-            if token.dep_:
+            if self.is_parsed:
                token_data["dep"] = token.dep_
-            if token.head:
                token_data["head"] = token.head.i
            data["tokens"].append(token_data)
        if underscore:
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -237,7 +237,7 @@ attribute ID.
 > from spacy.attrs import ORTH
 > doc = nlp(u"apple apple orange banana")
 > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
-> doc.to_array([attrs.ORTH])
+> doc.to_array([ORTH])
 > # array([[11880], [11880], [7561], [12800]])
 > ```

@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.

 ## Attributes {#attributes}

-| Name                                | Type         | Description                                                                                                                                                                                                                                                                                |
-| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text`                              | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
-| `text_with_ws`                      | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
-| `mem`                               | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
-| `vocab`                             | `Vocab`      | The store of lexical types.                                                                                                                                                                                                                                                                |
-| `tensor` <Tag variant="new">2</Tag> | object       | Container for dense vector representations.                                                                                                                                                                                                                                                |
-| `cats` <Tag variant="new">2</Tag>   | dictionary   | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
-| `user_data`                         | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
-| `is_tagged`                         | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
-| `is_parsed`                         | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
-| `is_sentenced`                      | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |
-| `sentiment`                         | float        | The document's positivity/negativity score, if available.                                                                                                                                                                                                                                  |
-| `user_hooks`                        | dict         | A dictionary that allows customization of the `Doc`'s properties.                                                                                                                                                                                                                          |
-| `user_token_hooks`                  | dict         | A dictionary that allows customization of properties of `Token` children.                                                                                                                                                                                                                  |
-| `user_span_hooks`                   | dict         | A dictionary that allows customization of properties of `Span` children.                                                                                                                                                                                                                   |
-| `_`                                 | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes).                                                                                                                                                                             |
+| Name                                    | Type         | Description                                                                                                                                                                                                                                                                                |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `text`                                  | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
+| `text_with_ws`                          | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
+| `mem`                                   | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
+| `vocab`                                 | `Vocab`      | The store of lexical types.                                                                                                                                                                                                                                                                |
+| `tensor` <Tag variant="new">2</Tag>     | object       | Container for dense vector representations.                                                                                                                                                                                                                                                |
+| `cats` <Tag variant="new">2</Tag>       | dictionary   | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
+| `user_data`                             | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
+| `is_tagged`                             | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
+| `is_parsed`                             | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
+| `is_sentenced`                          | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |
+| `is_nered` <Tag variant="new">2.1</Tag> | bool         | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown.                                                                                                                                      |
+| `sentiment`                             | float        | The document's positivity/negativity score, if available.                                                                                                                                                                                                                                  |
+| `user_hooks`                            | dict         | A dictionary that allows customization of the `Doc`'s properties.                                                                                                                                                                                                                          |
+| `user_token_hooks`                      | dict         | A dictionary that allows customization of properties of `Token` children.                                                                                                                                                                                                                  |
+| `user_span_hooks`                       | dict         | A dictionary that allows customization of properties of `Span` children.                                                                                                                                                                                                                   |
+| `_`                                     | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes).                                                                                                                                                                             |