mirror of https://github.com/explosion/spaCy.git
💫 Improve Doc.to_json and add Doc.is_nered (#3381)
* Use default return instead of else
* Add Doc.is_nered to indicate if entities have been set
* Add properties in Doc.to_json if they were set, not if they're available

This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
This commit is contained in:
parent
7984543953
commit
0426689db8
|
@ -4,9 +4,10 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.errors import ModelsWarning
|
from spacy.errors import ModelsWarning
|
||||||
|
from spacy.attrs import ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
|
@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
||||||
assert lca[1, 1] == 1
|
assert lca[1, 1] == 1
|
||||||
assert lca[0, 1] == 2
|
assert lca[0, 1] == 2
|
||||||
assert lca[1, 2] == 2
|
assert lca[1, 2] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_is_nered(en_vocab):
|
||||||
|
words = ["I", "live", "in", "New", "York"]
|
||||||
|
doc = Doc(en_vocab, words=words)
|
||||||
|
assert not doc.is_nered
|
||||||
|
doc.ents = [Span(doc, 3, 5, label="GPE")]
|
||||||
|
assert doc.is_nered
|
||||||
|
# Test creating doc from array with unknown values
|
||||||
|
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
|
||||||
|
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
|
||||||
|
assert doc.is_nered
|
||||||
|
# Test serialization
|
||||||
|
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||||
|
assert new_doc.is_nered
|
||||||
|
|
|
@ -240,8 +240,18 @@ cdef class Doc:
|
||||||
for i in range(1, self.length):
|
for i in range(1, self.length):
|
||||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
||||||
return True
|
return True
|
||||||
else:
|
return False
|
||||||
return False
|
|
||||||
|
@property
|
||||||
|
def is_nered(self):
|
||||||
|
"""Check if the document has named entities set. Will return True if
|
||||||
|
*any* of the tokens has a named entity tag set (even if the others are
|
||||||
|
unknown values).
|
||||||
|
"""
|
||||||
|
for i in range(self.length):
|
||||||
|
if self.c[i].ent_iob != 0:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a `Token` or `Span` object.
|
"""Get a `Token` or `Span` object.
|
||||||
|
@ -990,11 +1000,11 @@ cdef class Doc:
|
||||||
DOCS: https://spacy.io/api/doc#to_json
|
DOCS: https://spacy.io/api/doc#to_json
|
||||||
"""
|
"""
|
||||||
data = {"text": self.text}
|
data = {"text": self.text}
|
||||||
if self.ents:
|
if self.is_nered:
|
||||||
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
||||||
"label": ent.label_} for ent in self.ents]
|
"label": ent.label_} for ent in self.ents]
|
||||||
sents = list(self.sents)
|
if self.is_sentenced:
|
||||||
if sents:
|
sents = list(self.sents)
|
||||||
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
||||||
for sent in sents]
|
for sent in sents]
|
||||||
if self.cats:
|
if self.cats:
|
||||||
|
@ -1002,13 +1012,11 @@ cdef class Doc:
|
||||||
data["tokens"] = []
|
data["tokens"] = []
|
||||||
for token in self:
|
for token in self:
|
||||||
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
||||||
if token.pos_:
|
if self.is_tagged:
|
||||||
token_data["pos"] = token.pos_
|
token_data["pos"] = token.pos_
|
||||||
if token.tag_:
|
|
||||||
token_data["tag"] = token.tag_
|
token_data["tag"] = token.tag_
|
||||||
if token.dep_:
|
if self.is_parsed:
|
||||||
token_data["dep"] = token.dep_
|
token_data["dep"] = token.dep_
|
||||||
if token.head:
|
|
||||||
token_data["head"] = token.head.i
|
token_data["head"] = token.head.i
|
||||||
data["tokens"].append(token_data)
|
data["tokens"].append(token_data)
|
||||||
if underscore:
|
if underscore:
|
||||||
|
|
|
@ -237,7 +237,7 @@ attribute ID.
|
||||||
> from spacy.attrs import ORTH
|
> from spacy.attrs import ORTH
|
||||||
> doc = nlp(u"apple apple orange banana")
|
> doc = nlp(u"apple apple orange banana")
|
||||||
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
||||||
> doc.to_array([attrs.ORTH])
|
> doc.to_array([ORTH])
|
||||||
> # array([[11880], [11880], [7561], [12800]])
|
> # array([[11880], [11880], [7561], [12800]])
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
@ -640,20 +640,21 @@ The L2 norm of the document's vector representation.
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `text` | unicode | A unicode representation of the document text. |
|
| `text` | unicode | A unicode representation of the document text. |
|
||||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||||
| `vocab` | `Vocab` | The store of lexical types. |
|
| `vocab` | `Vocab` | The store of lexical types. |
|
||||||
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
|
| `tensor` <Tag variant="new">2</Tag> | object | Container for dense vector representations. |
|
||||||
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
|
| `cats` <Tag variant="new">2</Tag> | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. |
|
||||||
| `user_data` | - | A generic storage area, for user custom data. |
|
| `user_data` | - | A generic storage area, for user custom data. |
|
||||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
|
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. |
|
||||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
|
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. |
|
||||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
|
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. |
|
||||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. |
|
||||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||||
|
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||||
|
|
Loading…
Reference in New Issue