mirror of https://github.com/explosion/spaCy.git
Make `docs_to_json` backwards-compatible with v2 (#5714)
* In `spacy convert -t json`, output the JSON docs wrapped in a list
* Add back token-level `ner` alongside the doc-level `entities`
This commit is contained in:
parent
412dbb1f38
commit
c67fc6aa5b
|
@@ -135,7 +135,7 @@ def convert(
|
|||
|
||||
def _print_docs_to_stdout(docs, output_type):
|
||||
if output_type == "json":
|
||||
srsly.write_json("-", docs_to_json(docs))
|
||||
srsly.write_json("-", [docs_to_json(docs)])
|
||||
else:
|
||||
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
|
||||
|
||||
|
@@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type):
|
|||
if not output_file.parent.exists():
|
||||
output_file.parent.mkdir(parents=True)
|
||||
if output_type == "json":
|
||||
srsly.write_json(output_file, docs_to_json(docs))
|
||||
srsly.write_json(output_file, [docs_to_json(docs)])
|
||||
else:
|
||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
|
|
|
@@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
for cat, val in doc.cats.items():
|
||||
json_cat = {"label": cat, "value": val}
|
||||
json_para["cats"].append(json_cat)
|
||||
# warning: entities information is currently duplicated as
|
||||
# doc-level "entities" and token-level "ner"
|
||||
for ent in doc.ents:
|
||||
ent_tuple = (ent.start_char, ent.end_char, ent.label_)
|
||||
json_para["entities"].append(ent_tuple)
|
||||
if ent.kb_id_:
|
||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
|
||||
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
|
@@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
if doc.is_parsed:
|
||||
json_token["head"] = token.head.i-token.i
|
||||
json_token["dep"] = token.dep_
|
||||
json_token["ner"] = biluo_tags[token.i]
|
||||
json_sent["tokens"].append(json_token)
|
||||
json_para["sentences"].append(json_sent)
|
||||
json_doc["paragraphs"].append(json_para)
|
||||
|
|
Loading…
Reference in New Issue