diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 4e3975bda..c8c5a3902 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -135,7 +135,7 @@ def convert( def _print_docs_to_stdout(docs, output_type): if output_type == "json": - srsly.write_json("-", docs_to_json(docs)) + srsly.write_json("-", [docs_to_json(docs)]) else: sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) @@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type): if not output_file.parent.exists(): output_file.parent.mkdir(parents=True) if output_type == "json": - srsly.write_json(output_file, docs_to_json(docs)) + srsly.write_json(output_file, [docs_to_json(docs)]) else: data = DocBin(docs=docs, store_user_data=True).to_bytes() with output_file.open("wb") as file_: diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx index fbf8ebea7..5dc39eb31 100644 --- a/spacy/gold/gold_io.pyx +++ b/spacy/gold/gold_io.pyx @@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): for cat, val in doc.cats.items(): json_cat = {"label": cat, "value": val} json_para["cats"].append(json_cat) + # warning: entities information is currently duplicated as + # doc-level "entities" and token-level "ner" for ent in doc.ents: ent_tuple = (ent.start_char, ent.end_char, ent.label_) json_para["entities"].append(ent_tuple) if ent.kb_id_: link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) + biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: @@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if doc.is_parsed: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ + json_token["ner"] = biluo_tags[token.i] json_sent["tokens"].append(json_token) json_para["sentences"].append(json_sent) json_doc["paragraphs"].append(json_para)