💫 Improve Doc.to_json and add Doc.is_nered (#3381)

* Use default return instead of else

* Add Doc.is_nered to indicate if entities have been set

* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
This commit is contained in:
Ines Montani 2019-03-10 15:24:34 +01:00 committed by Matthew Honnibal
parent 7984543953
commit 0426689db8
3 changed files with 53 additions and 28 deletions

View File

@ -4,9 +4,10 @@ from __future__ import unicode_literals
import pytest
import numpy
from spacy.tokens import Doc
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.errors import ModelsWarning
from spacy.attrs import ENT_TYPE, ENT_IOB
from ..util import get_doc
@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
assert lca[1, 1] == 1
assert lca[0, 1] == 2
assert lca[1, 2] == 2
def test_doc_is_nered(en_vocab):
words = ["I", "live", "in", "New", "York"]
doc = Doc(en_vocab, words=words)
assert not doc.is_nered
doc.ents = [Span(doc, 3, 5, label="GPE")]
assert doc.is_nered
# Test creating doc from array with unknown values
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
assert doc.is_nered
# Test serialization
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
assert new_doc.is_nered

View File

@ -240,8 +240,18 @@ cdef class Doc:
for i in range(1, self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
return False
return False
@property
def is_nered(self):
"""Check if the document has named entities set. Will return True if
*any* of the tokens has a named entity tag set (even if the others are
uknown values).
"""
for i in range(self.length):
if self.c[i].ent_iob != 0:
return True
return False
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
@ -990,11 +1000,11 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#to_json
"""
data = {"text": self.text}
if self.ents:
if self.is_nered:
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
"label": ent.label_} for ent in self.ents]
sents = list(self.sents)
if sents:
if self.is_sentenced:
sents = list(self.sents)
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
for sent in sents]
if self.cats:
@ -1002,13 +1012,11 @@ cdef class Doc:
data["tokens"] = []
for token in self:
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
if token.pos_:
if self.is_tagged:
token_data["pos"] = token.pos_
if token.tag_:
token_data["tag"] = token.tag_
if token.dep_:
if self.is_parsed:
token_data["dep"] = token.dep_
if token.head:
token_data["head"] = token.head.i
data["tokens"].append(token_data)
if underscore:

View File

@ -237,7 +237,7 @@ attribute ID.
> from spacy.attrs import ORTH
> doc = nlp(u"apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
> doc.to_array([attrs.ORTH])
> doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]])
> ```
@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes}
| Name | Type | Description |
| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
| Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | unicode | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
| `sentiment` | float | The document's positivity/negativity score, if available. |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |