💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
This commit is contained in:
Ines Montani 2019-03-10 15:24:34 +01:00 committed by Matthew Honnibal
parent 7984543953
commit 0426689db8
3 changed files with 53 additions and 28 deletions

View File

@ -4,9 +4,10 @@ from __future__ import unicode_literals
import pytest import pytest
import numpy import numpy
from spacy.tokens import Doc from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.errors import ModelsWarning from spacy.errors import ModelsWarning
from spacy.attrs import ENT_TYPE, ENT_IOB
from ..util import get_doc from ..util import get_doc
@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
assert lca[1, 1] == 1 assert lca[1, 1] == 1
assert lca[0, 1] == 2 assert lca[0, 1] == 2
assert lca[1, 2] == 2 assert lca[1, 2] == 2
def test_doc_is_nered(en_vocab):
    """Check that Doc.is_nered reports whether entity annotations are set."""
    tokens = ["I", "live", "in", "New", "York"]
    # A Doc built from words alone is expected to report no entities set.
    plain_doc = Doc(en_vocab, words=tokens)
    assert not plain_doc.is_nered
    # Assigning entity spans flips the flag.
    plain_doc.ents = [Span(plain_doc, 3, 5, label="GPE")]
    assert plain_doc.is_nered
    # Test creating doc from array with unknown values: the first three rows
    # are all zeros, only the last two carry ENT_TYPE / ENT_IOB values.
    ent_rows = [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]]
    ent_array = numpy.array(ent_rows, dtype="uint64")
    array_doc = Doc(en_vocab, words=tokens).from_array(
        [ENT_TYPE, ENT_IOB], ent_array
    )
    assert array_doc.is_nered
    # Test serialization: the flag must survive a to_bytes/from_bytes round trip.
    restored_doc = Doc(en_vocab).from_bytes(array_doc.to_bytes())
    assert restored_doc.is_nered

View File

@ -240,7 +240,17 @@ cdef class Doc:
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True return True
else: return False
@property
def is_nered(self):
    """Report whether named entity annotations have been set on the document.

    Returns True as soon as *any* token carries an entity IOB value other
    than 0, even if the remaining tokens still hold unknown values.
    Returns False otherwise, including for a document with no tokens.
    """
    for token_index in range(self.length):
        if self.c[token_index].ent_iob == 0:
            # 0 is treated as "no entity tag set" for this token; keep looking.
            continue
        return True
    return False
def __getitem__(self, object i): def __getitem__(self, object i):
@ -990,11 +1000,11 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#to_json DOCS: https://spacy.io/api/doc#to_json
""" """
data = {"text": self.text} data = {"text": self.text}
if self.ents: if self.is_nered:
data["ents"] = [{"start": ent.start_char, "end": ent.end_char, data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
"label": ent.label_} for ent in self.ents] "label": ent.label_} for ent in self.ents]
if self.is_sentenced:
sents = list(self.sents) sents = list(self.sents)
if sents:
data["sents"] = [{"start": sent.start_char, "end": sent.end_char} data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
for sent in sents] for sent in sents]
if self.cats: if self.cats:
@ -1002,13 +1012,11 @@ cdef class Doc:
data["tokens"] = [] data["tokens"] = []
for token in self: for token in self:
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
if token.pos_: if self.is_tagged:
token_data["pos"] = token.pos_ token_data["pos"] = token.pos_
if token.tag_:
token_data["tag"] = token.tag_ token_data["tag"] = token.tag_
if token.dep_: if self.is_parsed:
token_data["dep"] = token.dep_ token_data["dep"] = token.dep_
if token.head:
token_data["head"] = token.head.i token_data["head"] = token.head.i
data["tokens"].append(token_data) data["tokens"].append(token_data)
if underscore: if underscore:

View File

@ -237,7 +237,7 @@ attribute ID.
> from spacy.attrs import ORTH > from spacy.attrs import ORTH
> doc = nlp(u"apple apple orange banana") > doc = nlp(u"apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
> doc.to_array([attrs.ORTH]) > doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]]) > # array([[11880], [11880], [7561], [12800]])
> ``` > ```
@ -641,7 +641,7 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | unicode | A unicode representation of the document text. | | `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
@ -652,6 +652,7 @@ The L2 norm of the document's vector representation.
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. | | `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | | `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | | `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |