mirror of https://github.com/explosion/spaCy.git

Fix jsonl to json conversion (#3419)

* Fix spacy.gold.docs_to_json function
* Fix jsonl2json converter

This commit is contained in:
parent 0a4b074184
commit 47e110375d
spacy/cli/converters/jsonl2json.py

@@ -3,18 +3,51 @@ from __future__ import unicode_literals
 
 import srsly
 
-from ...util import get_lang_class
+from ...gold import docs_to_json
+from ...util import get_lang_class, minibatch
 
 
 def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
     json_docs = []
-    input_tuples = [srsly.json_loads(line) for line in input_data]
+    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
-    for i, (raw_text, ents) in enumerate(input_tuples):
-        doc = nlp.make_doc(raw_text)
-        doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
-        json_docs.append(doc.to_json())
+    sentencizer = nlp.create_pipe("sentencizer")
+    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
+        docs = []
+        for record in batch:
+            raw_text = record["text"]
+            if "entities" in record:
+                ents = record["entities"]
+            else:
+                ents = record["spans"]
+            ents = [(e["start"], e["end"], e["label"]) for e in ents]
+            doc = nlp.make_doc(raw_text)
+            sentencizer(doc)
+            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
+            doc.ents = _cleanup_spans(spans)
+            docs.append(doc)
+        json_docs.append(docs_to_json(docs, id=i))
     return json_docs
+
+
+def _cleanup_spans(spans):
+    output = []
+    seen = set()
+    for span in spans:
+        if span is not None:
+            # Trim whitespace
+            while len(span) and span[0].is_space:
+                span = span[1:]
+            while len(span) and span[-1].is_space:
+                span = span[:-1]
+            if not len(span):
+                continue
+            for i in range(span.start, span.end):
+                if i in seen:
+                    break
+            else:
+                output.append(span)
+            seen.update(range(span.start, span.end))
+    return output
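Below is a minimal sketch of how the rewritten converter might be driven. The sample records and the "en" language code are illustrative assumptions, not part of the commit; they exercise both offset shapes the converter now accepts ("entities" and Prodigy-style "spans").

# Hypothetical two-record JSONL input covering both supported shapes.
input_data = (
    '{"text": "Flights to Berlin.", "entities": [{"start": 11, "end": 17, "label": "LOC"}]}\n'
    '{"text": "I like London.", "spans": [{"start": 7, "end": 13, "label": "GPE"}]}'
)

# Tokenization needs a language class, hence the --lang requirement.
json_docs = ner_jsonl2json(input_data, lang="en")

# With the default n_sents=10, both records fit in one minibatch, so the
# result is a single docs_to_json() document with two paragraphs; whitespace
# has been trimmed and overlapping spans dropped by _cleanup_spans().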
spacy/gold.pyx

@@ -598,19 +598,35 @@ cdef class GoldParse:
                self.c.sent_start[i] = 0
 
 
-def docs_to_json(docs, underscore=None):
+def docs_to_json(docs, id=0):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    underscore (list): Optional list of string names of custom doc._.
-    attributes. Attribute values need to be JSON-serializable. Values will
-    be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
-    RETURNS (list): The data in spaCy's JSON format.
+    id (int): Id for the JSON.
+    RETURNS (dict): The data in spaCy's JSON format.
     """
     if isinstance(docs, Doc):
         docs = [docs]
-    return [doc.to_json(underscore=underscore) for doc in docs]
+    json_doc = {"id": id, "paragraphs": []}
+    for i, doc in enumerate(docs):
+        json_para = {"raw": doc.text, "sentences": []}
+        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+        for j, sent in enumerate(doc.sents):
+            json_sent = {"tokens": [], "brackets": []}
+            for token in sent:
+                json_token = {"id": token.i, "orth": token.text}
+                if doc.is_tagged:
+                    json_token["tag"] = token.tag_
+                if doc.is_parsed:
+                    json_token["head"] = token.head.i - token.i
+                    json_token["dep"] = token.dep_
+                json_token["ner"] = biluo_tags[token.i]
+                json_sent["tokens"].append(json_token)
+            json_para["sentences"].append(json_sent)
+        json_doc["paragraphs"].append(json_para)
+    return json_doc
 
 
 def biluo_tags_from_offsets(doc, entities, missing="O"):
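And a small sketch of the new return shape of docs_to_json(), which now builds a single training-format dict rather than a list of Doc.to_json() dicts. The pipeline name and sentence are assumptions; the commented structure just mirrors what the rewritten function emits.

import spacy
from spacy.gold import docs_to_json

nlp = spacy.load("en_core_web_sm")  # assumed: any tagged + parsed pipeline
doc = nlp("Apple is looking at buying U.K. startup.")

json_doc = docs_to_json([doc], id=0)
# Roughly:
# {"id": 0,
#  "paragraphs": [{"raw": "Apple is looking at buying U.K. startup.",
#                  "sentences": [{"tokens": [{"id": 0, "orth": "Apple",
#                                             "tag": "NNP", "head": 2,
#                                             "dep": "nsubj", "ner": "U-ORG"},
#                                            ...],
#                                 "brackets": []}]}]}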