def docs_to_json(id, docs):
    """Convert Doc objects into the JSON-serializable format used by the
    spacy train command. Each Doc is interpreted as one paragraph, and each
    of its sentences becomes one entry in that paragraph's 'sentences' list.

    id: Value stored unchanged under the top-level 'id' key.
    docs (Doc or iterable of Doc): The documents to convert. A single Doc is
        treated as a one-element list.
    RETURNS (dict): A dict of the form
        {'id': id, 'paragraphs': [{'raw': text, 'sentences': [
            {'tokens': [...], 'brackets': []}]}]}.
        Each token dict has the keys 'id' (the token's index within its Doc),
        'orth', 'tag', 'head' (offset of the head relative to the token),
        'dep' and 'ner' (a BILUO tag). 'tag' is None if the Doc is not
        tagged; 'head' and 'dep' are None if the Doc is not parsed.
        'brackets' is always emitted as an empty list.
    """
    if isinstance(docs, Doc):
        # Accept a bare Doc as shorthand for a list holding just that Doc.
        docs = [docs]
    json_doc = {'id': id, 'paragraphs': []}
    for doc in docs:
        json_para = {'raw': doc.text, 'sentences': []}
        ent_offsets = [(ent.start_char, ent.end_char, ent.label_)
                       for ent in doc.ents]
        # One tag per token; looked up below by the token's doc-level index.
        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
        # These flags are per-Doc, so read them once rather than per token.
        is_tagged = doc.is_tagged
        is_parsed = doc.is_parsed
        for sent in doc.sents:
            json_tokens = [
                {
                    'id': token.i,
                    'orth': token.text,
                    'tag': token.tag_ if is_tagged else None,
                    # Heads are stored relative to the token, not absolute.
                    'head': (token.head.i - token.i) if is_parsed else None,
                    'dep': token.dep_ if is_parsed else None,
                    'ner': biluo_tags[token.i],
                }
                for token in sent
            ]
            json_para['sentences'].append(
                {'tokens': json_tokens, 'brackets': []})
        json_doc['paragraphs'].append(json_para)
    return json_doc
def test_docs_to_json(en_vocab):
    """Check that docs_to_json turns a list of Doc objects into the training
    JSON structure: one paragraph per Doc, each with a single sentence that
    holds one entry per token.
    """
    first_doc = get_doc(
        en_vocab,
        words=['a', 'b'],
        pos=['VBP', 'NN'],
        heads=[0, -1],
        deps=['ROOT', 'dobj'],
        ents=[])
    second_doc = get_doc(
        en_vocab,
        words=['c', 'd', 'e'],
        pos=['VBP', 'NN', 'NN'],
        heads=[0, -1, -2],
        deps=['ROOT', 'dobj', 'dobj'],
        ents=[(1, 2, 'ORG')])

    json_doc = docs_to_json(0, [first_doc, second_doc])

    assert json_doc['id'] == 0
    paragraphs = json_doc['paragraphs']
    assert len(paragraphs) == 2
    # Each Doc has a single root, so each paragraph holds one sentence.
    first_sents = paragraphs[0]['sentences']
    second_sents = paragraphs[1]['sentences']
    assert len(first_sents) == 1
    assert len(second_sents) == 1
    # Token counts must match the number of words given to get_doc.
    assert len(first_sents[0]['tokens']) == 2
    assert len(second_sents[0]['tokens']) == 3