spaCy/spacy/cli/converters/jsonl2json.py

21 lines
671 B
Python

# coding: utf8
from __future__ import unicode_literals
import srsly
from ...util import get_lang_class
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None:
raise ValueError("No --lang specified, but tokenization required")
json_docs = []
input_tuples = [srsly.json_loads(line) for line in input_data]
nlp = get_lang_class(lang)()
for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text)
doc[0].is_sent_start = True
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
json_docs.append(doc.to_json())
return json_docs