From 0426689db872ff6be74bd8250bef2197b07c8b2b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 15:24:34 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Improve=20Doc.to=5Fjson=20and=20?= =?UTF-8?q?add=20Doc.is=5Fnered=20(#3381)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Use default return instead of else * Add Doc.is_nered to indicate if entities have been set * Add properties in Doc.to_json if they were set, not if they're available This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training. --- spacy/tests/doc/test_doc_api.py | 18 +++++++++++++++- spacy/tokens/doc.pyx | 26 +++++++++++++++-------- website/docs/api/doc.md | 37 +++++++++++++++++---------------- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1c3c948c3..8eed2c267 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals import pytest import numpy -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.errors import ModelsWarning +from spacy.attrs import ENT_TYPE, ENT_IOB from ..util import get_doc @@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): assert lca[1, 1] == 1 assert lca[0, 1] == 2 assert lca[1, 2] == 2 + + +def test_doc_is_nered(en_vocab): + words = ["I", "live", "in", "New", "York"] + doc = Doc(en_vocab, words=words) + assert not doc.is_nered + doc.ents = [Span(doc, 3, 5, label="GPE")] + assert doc.is_nered + # Test creating doc from array with unknown values + arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], 
dtype="uint64") + doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) + assert doc.is_nered + # Test serialization + new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) + assert new_doc.is_nered diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1dfcd1687..ff38d825f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -240,8 +240,18 @@ cdef class Doc: for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True - else: - return False + return False + + @property + def is_nered(self): + """Check if the document has named entities set. Will return True if + *any* of the tokens has a named entity tag set (even if the others are + unknown values). + """ + for i in range(self.length): + if self.c[i].ent_iob != 0: + return True + return False def __getitem__(self, object i): """Get a `Token` or `Span` object. @@ -990,11 +1000,11 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_json """ data = {"text": self.text} - if self.ents: + if self.is_nered: data["ents"] = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in self.ents] - sents = list(self.sents) - if sents: + if self.is_sentenced: + sents = list(self.sents) data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: @@ -1002,13 +1012,11 @@ cdef class Doc: data["tokens"] = [] for token in self: token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} - if token.pos_: + if self.is_tagged: token_data["pos"] = token.pos_ - if token.tag_: token_data["tag"] = token.tag_ - if token.dep_: + if self.is_parsed: token_data["dep"] = token.dep_ - if token.head: token_data["head"] = token.head.i data["tokens"].append(token_data) if underscore: diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 267d8f711..e53619cff 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -237,7 +237,7 @@ attribute ID. 
> from spacy.attrs import ORTH > doc = nlp(u"apple apple orange banana") > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} -> doc.to_array([attrs.ORTH]) +> doc.to_array([ORTH]) > # array([[11880], [11880], [7561], [12800]]) > ``` @@ -640,20 +640,21 @@ The L2 norm of the document's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | unicode | A unicode representation of the document text. | -| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | -| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | -| `vocab` | `Vocab` | The store of lexical types. | -| `tensor` 2 | object | Container for dense vector representations. | -| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | -| `user_data` | - | A generic storage area, for user custom data. | -| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | -| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | -| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | -| `sentiment` | float | The document's positivity/negativity score, if available. | -| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. 
| -| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | -| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| Name | Type | Description | +| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `text` | unicode | A unicode representation of the document text. | +| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | +| `vocab` | `Vocab` | The store of lexical types. | +| `tensor` 2 | object | Container for dense vector representations. | +| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | +| `user_data` | - | A generic storage area, for user custom data. | +| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | +| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | +| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | +| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. 
| +| `sentiment` | float | The document's positivity/negativity score, if available. | +| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | +| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | +| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |