From 0426689db872ff6be74bd8250bef2197b07c8b2b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 10 Mar 2019 15:24:34 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Improve=20Doc.to=5Fjson=20and=20?=
 =?UTF-8?q?add=20Doc.is=5Fnered=20(#3381)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Previously, an empty annotation was omitted from the output and so could not be told apart from a missing one, which was unclear and problematic for training.
---
 spacy/tests/doc/test_doc_api.py | 18 +++++++++++++++-
 spacy/tokens/doc.pyx            | 26 +++++++++++++++--------
 website/docs/api/doc.md         | 37 +++++++++++++++++----------------
 3 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 1c3c948c3..8eed2c267 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -4,9 +4,10 @@ from __future__ import unicode_literals
 
 import pytest
 import numpy
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.errors import ModelsWarning
+from spacy.attrs import ENT_TYPE, ENT_IOB
 
 from ..util import get_doc
 
@@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
     assert lca[1, 1] == 1
     assert lca[0, 1] == 2
     assert lca[1, 2] == 2
+
+
+def test_doc_is_nered(en_vocab):
+    words = ["I", "live", "in", "New", "York"]
+    doc = Doc(en_vocab, words=words)
+    assert not doc.is_nered
+    doc.ents = [Span(doc, 3, 5, label="GPE")]
+    assert doc.is_nered
+    # Test creating doc from array with unknown values
+    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
+    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
+    assert doc.is_nered
+    # Test serialization
+    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
+    assert new_doc.is_nered
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 1dfcd1687..ff38d825f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -240,8 +240,18 @@ cdef class Doc:
         for i in range(1, self.length):
             if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                 return True
-        else:
-            return False
+        return False
+
+    @property
+    def is_nered(self):
+        """Check if the document has named entities set. Will return True if
+        *any* of the tokens has a named entity tag set (even if the others are
+        unknown values).
+        """
+        for i in range(self.length):
+            if self.c[i].ent_iob != 0:
+                return True
+        return False
 
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
@@ -990,11 +1000,11 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_json
         """
         data = {"text": self.text}
-        if self.ents:
+        if self.is_nered:
             data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                             "label": ent.label_} for ent in self.ents]
-        sents = list(self.sents)
-        if sents:
+        if self.is_sentenced:
+            sents = list(self.sents)
             data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                              for sent in sents]
         if self.cats:
@@ -1002,13 +1012,11 @@ cdef class Doc:
         data["tokens"] = []
         for token in self:
             token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-            if token.pos_:
+            if self.is_tagged:
                 token_data["pos"] = token.pos_
-            if token.tag_:
                 token_data["tag"] = token.tag_
-            if token.dep_:
+            if self.is_parsed:
                 token_data["dep"] = token.dep_
-            if token.head:
                 token_data["head"] = token.head.i
             data["tokens"].append(token_data)
         if underscore:
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 267d8f711..e53619cff 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -237,7 +237,7 @@ attribute ID.
 > from spacy.attrs import ORTH
 > doc = nlp(u"apple apple orange banana")
 > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
-> doc.to_array([attrs.ORTH])
+> doc.to_array([ORTH])
 > # array([[11880], [11880], [7561], [12800]])
 > ```
 
@@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
-| Name                                | Type         | Description                                                                                                                                                                                                                                                                                |
-| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text`                              | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
-| `text_with_ws`                      | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
-| `mem`                               | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
-| `vocab`                             | `Vocab`      | The store of lexical types.                                                                                                                                                                                                                                                                |
-| `tensor` <Tag variant="new">2</Tag> | object       | Container for dense vector representations.                                                                                                                                                                                                                                                |
-| `cats` <Tag variant="new">2</Tag>   | dictionary   | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
-| `user_data`                         | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
-| `is_tagged`                         | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
-| `is_parsed`                         | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
-| `is_sentenced`                      | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |
-| `sentiment`                         | float        | The document's positivity/negativity score, if available.                                                                                                                                                                                                                                  |
-| `user_hooks`                        | dict         | A dictionary that allows customization of the `Doc`'s properties.                                                                                                                                                                                                                          |
-| `user_token_hooks`                  | dict         | A dictionary that allows customization of properties of `Token` children.                                                                                                                                                                                                                  |
-| `user_span_hooks`                   | dict         | A dictionary that allows customization of properties of `Span` children.                                                                                                                                                                                                                   |
-| `_`                                 | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes).                                                                                                                                                                             |
+| Name                                    | Type         | Description                                                                                                                                                                                                                                                                                |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `text`                                  | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
+| `text_with_ws`                          | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
+| `mem`                                   | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
+| `vocab`                                 | `Vocab`      | The store of lexical types.                                                                                                                                                                                                                                                                |
+| `tensor` <Tag variant="new">2</Tag>     | object       | Container for dense vector representations.                                                                                                                                                                                                                                                |
+| `cats` <Tag variant="new">2</Tag>       | dictionary   | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
+| `user_data`                             | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
+| `is_tagged`                             | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
+| `is_parsed`                             | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
+| `is_sentenced`                          | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |
+| `is_nered` <Tag variant="new">2.1</Tag> | bool         | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown.                                                                                                                                      |
+| `sentiment`                             | float        | The document's positivity/negativity score, if available.                                                                                                                                                                                                                                  |
+| `user_hooks`                            | dict         | A dictionary that allows customization of the `Doc`'s properties.                                                                                                                                                                                                                          |
+| `user_token_hooks`                      | dict         | A dictionary that allows customization of properties of `Token` children.                                                                                                                                                                                                                  |
+| `user_span_hooks`                       | dict         | A dictionary that allows customization of properties of `Span` children.                                                                                                                                                                                                                   |
+| `_`                                     | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes).                                                                                                                                                                             |