diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index 796208c1a..91dd42982 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -3,18 +3,53 @@ from __future__ import unicode_literals
 
 import srsly
 
-from ...util import get_lang_class
+from ...gold import docs_to_json
+from ...util import get_lang_class, minibatch
 
 
 def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
     json_docs = []
-    input_tuples = [srsly.json_loads(line) for line in input_data]
+    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
-    for i, (raw_text, ents) in enumerate(input_tuples):
-        doc = nlp.make_doc(raw_text)
-        doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
-        json_docs.append(doc.to_json())
+    sentencizer = nlp.create_pipe("sentencizer")
+    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
+        docs = []
+        for record in batch:
+            raw_text = record["text"]
+            # Accept both "entities" and "spans" as the annotation key
+            if "entities" in record:
+                ents = record["entities"]
+            else:
+                ents = record["spans"]
+            ents = [(e["start"], e["end"], e["label"]) for e in ents]
+            doc = nlp.make_doc(raw_text)
+            sentencizer(doc)
+            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
+            doc.ents = _cleanup_spans(spans)
+            docs.append(doc)
+        json_docs.append(docs_to_json(docs, id=i))
     return json_docs
+
+
+def _cleanup_spans(spans):
+    output = []
+    seen = set()
+    for span in spans:
+        if span is not None:
+            # Trim leading and trailing whitespace tokens
+            while len(span) and span[0].is_space:
+                span = span[1:]
+            while len(span) and span[-1].is_space:
+                span = span[:-1]
+            if not len(span):
+                continue
+            # Drop spans that overlap an already accepted span
+            for i in range(span.start, span.end):
+                if i in seen:
+                    break
+            else:
+                output.append(span)
+                seen.update(range(span.start, span.end))
+    return output
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 02306c651..66607bbf8 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -598,19 +598,36 @@ cdef class GoldParse:
                 self.c.sent_start[i] = 0
 
 
-def docs_to_json(docs, underscore=None):
+def docs_to_json(docs, id=0):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    underscore (list): Optional list of string names of custom doc._.
-    attributes. Attribute values need to be JSON-serializable. Values will
-    be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
-    RETURNS (list): The data in spaCy's JSON format.
+    id (int): ID to assign to the converted JSON document.
+    RETURNS (dict): The data in spaCy's JSON format.
""" if isinstance(docs, Doc): docs = [docs] - return [doc.to_json(underscore=underscore) for doc in docs] + json_doc = {"id": id, "paragraphs": []} + for i, doc in enumerate(docs): + json_para = {'raw': doc.text, "sentences": []} + ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) + for j, sent in enumerate(doc.sents): + json_sent = {"tokens": [], "brackets": []} + for token in sent: + json_token = {"id": token.i, "orth": token.text} + if doc.is_tagged: + json_token["tag"] = token.tag_ + if doc.is_parsed: + json_token["head"] = token.head.i-token.i + json_token["dep"] = token.dep_ + json_token["ner"] = biluo_tags[token.i] + json_sent["tokens"].append(json_token) + json_para["sentences"].append(json_sent) + json_doc["paragraphs"].append(json_para) + return json_doc def biluo_tags_from_offsets(doc, entities, missing="O"):