mirror of https://github.com/explosion/spaCy.git
Fix jsonl to json conversion (#3419)
* Fix spacy.gold.docs_to_json function * Fix jsonl2json converter
This commit is contained in:
parent
0a4b074184
commit
47e110375d
|
@@ -3,18 +3,51 @@ from __future__ import unicode_literals
|
|||
|
||||
import srsly
|
||||
|
||||
from ...util import get_lang_class
|
||||
from ...gold import docs_to_json
|
||||
from ...util import get_lang_class, minibatch
|
||||
|
||||
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
    """Convert NER annotations in JSONL format to spaCy's JSON training format.

    Every non-blank line of the input is parsed as one JSON record. A record
    needs a "text" key and either an "entities" key (a list of
    (start, end, label) triples) or a "spans" key (a list of dicts with the
    keys "start", "end" and "label"). The offsets are passed to
    doc.char_span. Records are batched with minibatch(size=n_sents) and each
    batch is converted into one JSON document via docs_to_json.

    input_data (unicode): The JSONL data, one record per line.
    lang (unicode): Language code used to look up the language class whose
        tokenizer is applied to the texts. Required.
    n_sents (int): Batch size, i.e. number of records per JSON document.
    use_morphology (bool): Not used by this converter.
    RETURNS (list): One docs_to_json() result per batch.
    RAISES (ValueError): If no lang is given.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    # Split on "\n" only: str.splitlines() would also break on separators such
    # as U+2028, which can legally appear unescaped inside a JSON string.
    # Blank lines (including completely empty input) are skipped rather than
    # being handed to the JSON parser.
    input_examples = [
        srsly.json_loads(line)
        for line in input_data.split("\n")
        if line.strip()
    ]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    json_docs = []
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            if "entities" in record:
                ents = record["entities"]
            else:
                # "spans" holds dicts; normalise them to the triple form
                # used by "entities".
                ents = [
                    (e["start"], e["end"], e["label"]) for e in record["spans"]
                ]
            doc = nlp.make_doc(raw_text)
            # Sets sentence boundaries so docs_to_json can iterate doc.sents.
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=label) for s, e, label in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
|
||||
|
||||
|
||||
def _cleanup_spans(spans):
    """Filter candidate entity spans so they can be assigned to doc.ents.

    Spans are processed in the order given. None entries are skipped,
    whitespace tokens are trimmed from both edges, spans that end up empty
    are dropped, and a span is only kept if none of its token indices is
    already covered by a previously kept span.

    spans (iterable): Span objects or None values.
    RETURNS (list): The trimmed, non-overlapping spans.
    """
    kept = []
    claimed = set()  # token indices already covered by a kept span
    for candidate in spans:
        if candidate is None:
            continue
        # Strip whitespace tokens off the left edge, then the right edge
        while len(candidate) and candidate[0].is_space:
            candidate = candidate[1:]
        while len(candidate) and candidate[-1].is_space:
            candidate = candidate[:-1]
        if len(candidate) == 0:
            continue
        token_ids = range(candidate.start, candidate.end)
        if any(idx in claimed for idx in token_ids):
            # Overlaps an earlier span: first come, first served
            continue
        kept.append(candidate)
        claimed.update(token_ids)
    return kept
|
||||
|
|
|
@@ -598,19 +598,35 @@ cdef class GoldParse:
|
|||
self.c.sent_start[i] = 0
|
||||
|
||||
|
||||
def docs_to_json(docs, id=0):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    Each Doc becomes one entry in "paragraphs", holding the raw text and one
    entry per sentence in doc.sents. Tokens always get "id", "orth" and "ner"
    (a BILUO tag computed from doc.ents); "tag" is added if the Doc is tagged,
    "head" (offset relative to the token) and "dep" if it is parsed.

    docs (iterable / Doc): The Doc object(s) to convert.
    id (int): Id for the JSON.
    RETURNS (dict): The data in spaCy's JSON format, with the keys "id" and
        "paragraphs".
    """
    if isinstance(docs, Doc):
        docs = [docs]
    json_doc = {"id": id, "paragraphs": []}
    for doc in docs:
        json_para = {"raw": doc.text, "sentences": []}
        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        # One tag per token, indexed by the token's position in the Doc
        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
        for sent in doc.sents:
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    json_token["tag"] = token.tag_
                if doc.is_parsed:
                    # Heads are stored as an offset relative to the token
                    json_token["head"] = token.head.i - token.i
                    json_token["dep"] = token.dep_
                json_token["ner"] = biluo_tags[token.i]
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)
    return json_doc
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||
|
|
Loading…
Reference in New Issue