💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else
* Add Doc.is_nered to indicate if entities have been set
* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
2019-03-10 15:24:34 +01:00 · 2019-03-10 15:24:34 +01:00 · 0426689db8
parent 7984543953
commit 0426689db8
3 changed files with 53 additions and 28 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -4,9 +4,10 @@ from __future__ import unicode_literals
 import pytest
 import numpy
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.errors import ModelsWarning
 from spacy.attrs import ENT_TYPE, ENT_IOB
 from ..util import get_doc
@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    assert lca[1, 1] == 1
    assert lca[0, 1] == 2
    assert lca[1, 2] == 2
 def test_doc_is_nered(en_vocab):
    """Check that Doc.is_nered tracks whether entity annotations are set."""
    tokens = ["I", "live", "in", "New", "York"]
    # No entity annotations have been assigned yet.
    fresh_doc = Doc(en_vocab, words=tokens)
    assert not fresh_doc.is_nered
    # Assigning a span to doc.ents must flip the flag.
    fresh_doc.ents = [Span(fresh_doc, 3, 5, label="GPE")]
    assert fresh_doc.is_nered
    # Build a doc from an attribute array whose first three rows are all
    # zeros; only the last two rows carry non-zero ENT_TYPE / ENT_IOB values.
    ent_rows = [[0, 0]] * 3 + [[384, 3], [384, 1]]
    ent_array = numpy.array(ent_rows, dtype="uint64")
    array_doc = Doc(en_vocab, words=tokens)
    array_doc = array_doc.from_array([ENT_TYPE, ENT_IOB], ent_array)
    assert array_doc.is_nered
    # The flag must survive a to_bytes / from_bytes round trip.
    restored = Doc(en_vocab).from_bytes(array_doc.to_bytes())
    assert restored.is_nered
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -240,7 +240,17 @@ cdef class Doc:
        for i in range(1, self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
-        else:
+        return False
    @property
    def is_nered(self):
        """Report whether any named entity annotation is set on the document.

        Scans the tokens in order and returns True as soon as one of them has
        a non-zero `ent_iob` value, even if every other token still has the
        unknown value 0. Returns False when no token has a non-zero `ent_iob`,
        which includes the case of a document with no tokens.
        """
        for i in range(self.length):
            if self.c[i].ent_iob == 0:
                # Entity tag unknown for this token; keep scanning.
                continue
            return True
        return False
    def __getitem__(self, object i):
@ -990,11 +1000,11 @@ cdef class Doc:
        DOCS: https://spacy.io/api/doc#to_json
        """
        data = {"text": self.text}
-        if self.ents:
+        if self.is_nered:
            data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                            "label": ent.label_} for ent in self.ents]
        if self.is_sentenced:
            sents = list(self.sents)
        if sents:
            data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                             for sent in sents]
        if self.cats:
@ -1002,13 +1012,11 @@ cdef class Doc:
        data["tokens"] = []
        for token in self:
            token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-            if token.pos_:
+            if self.is_tagged:
                token_data["pos"] = token.pos_
            if token.tag_:
                token_data["tag"] = token.tag_
-            if token.dep_:
+            if self.is_parsed:
                token_data["dep"] = token.dep_
            if token.head:
                token_data["head"] = token.head.i
            data["tokens"].append(token_data)
        if underscore:
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -237,7 +237,7 @@ attribute ID.
 > from spacy.attrs import ORTH
 > doc = nlp(u"apple apple orange banana")
 > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
-> doc.to_array([attrs.ORTH])
+> doc.to_array([ORTH])
 > # array([[11880], [11880], [7561], [12800]])
 > ```
@ -641,7 +641,7 @@ The L2 norm of the document's vector representation.
 ## Attributes {#attributes}
 | Name                                    | Type         | Description                                                                                                                                                                                                                                                                                |
-| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `text`                                  | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
 | `text_with_ws`                          | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
 | `mem`                                   | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
@ -652,6 +652,7 @@ The L2 norm of the document's vector representation.
 | `is_tagged`                             | bool         | A flag indicating that the document has been part-of-speech tagged.                                                                                                                                                                                                                        |
 | `is_parsed`                             | bool         | A flag indicating that the document has been syntactically parsed.                                                                                                                                                                                                                         |
 | `is_sentenced`                          | bool         | A flag indicating that sentence boundaries have been applied to the document.                                                                                                                                                                                                              |
 | `is_nered` <Tag variant="new">2.1</Tag> | bool         | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown.                                                                                                                                      |
 | `sentiment`                             | float        | The document's positivity/negativity score, if available.                                                                                                                                                                                                                                  |
 | `user_hooks`                            | dict         | A dictionary that allows customization of the `Doc`'s properties.                                                                                                                                                                                                                          |
 | `user_token_hooks`                      | dict         | A dictionary that allows customization of properties of `Token` children.                                                                                                                                                                                                                  |