From 2a5a61683ee4781cd821fe9eed380a14d6525d05 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 14 Aug 2018 13:13:10 +0200
Subject: [PATCH] Add function to get train format from Doc objects

Our JSON training format is annoying to work with, and we've wanted to
retire it for some time. In the meantime, we can at least add some
missing functions to make it easier to live with.

This patch adds a function that generates the JSON format from a list
of Doc objects, one per paragraph. This should be a convenient way to handle
a lot of data conversions: whatever format you have the source
information in, you can use it to set up a Doc object. This approach
should offer better future-proofing as well. Hopefully, we can steadily
rewrite code that is sensitive to the current data format, so that it
instead goes through this function. Then when we change the data format,
we won't have such a problem.
---
 spacy/gold.pyx           | 26 ++++++++++++++++++++++++++
 spacy/tests/test_gold.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 9116275f8..39ff05124 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -563,6 +563,32 @@ cdef class GoldParse:
                         self.c.sent_start[i] = 0
 
 
+def docs_to_json(id, docs):
+    '''Convert one Doc or a list of Docs into the JSON-serializable dict used
+    by the spacy train command. Each Doc is treated as one paragraph. 'tag',
+    'head' and 'dep' are None when the Doc is not tagged/parsed; otherwise
+    'head' is the offset from the token to its head (head.i - token.i).'''
+    if isinstance(docs, Doc):
+        docs = [docs]  # accept a single Doc as a one-paragraph list
+    json_doc = {'id': id, 'paragraphs': []}
+    for i, doc in enumerate(docs):
+        json_para = {'raw': doc.text, 'sentences': []}
+        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+        for j, sent in enumerate(doc.sents):
+            json_sent = {'tokens': [], 'brackets': []}  # 'brackets' is left empty
+            for token in sent:
+                json_token = {"id": token.i, "orth": token.text}
+                json_token['tag'] = token.tag_ if doc.is_tagged else None
+                json_token['head'] = (token.head.i-token.i) if doc.is_parsed else None
+                json_token['dep'] = token.dep_ if doc.is_parsed else None
+                json_token['ner'] = biluo_tags[token.i]  # tag list covers the whole Doc
+                json_sent['tokens'].append(json_token)
+            json_para['sentences'].append(json_sent)
+        json_doc['paragraphs'].append(json_para)
+    return json_doc
+
+
 def biluo_tags_from_offsets(doc, entities, missing='O'):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index e2354c9db..7a9198642 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import docs_to_json
 from spacy.tokens import Doc
+from .util import get_doc
 
 
 def test_gold_biluo_U(en_vocab):
@@ -50,3 +52,31 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     assert biluo_tags_converted == biluo_tags
     offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
     assert offsets_converted == offsets
+
+def test_docs_to_json(en_vocab):
+    '''Test that a list of Doc objects converts to the JSON-serializable
+    training format with one paragraph per Doc and the expected sentence
+    and token counts.'''
+    docs = [
+        get_doc(
+            en_vocab,
+            words=['a', 'b'],
+            pos=['VBP', 'NN'],
+            heads=[0, -1],  # presumably offsets relative to each token -- confirm
+            deps=['ROOT', 'dobj'],
+            ents=[]),
+        get_doc(
+            en_vocab,
+            words=['c', 'd', 'e'],
+            pos=['VBP', 'NN', 'NN'],
+            heads=[0, -1, -2],
+            deps=['ROOT', 'dobj', 'dobj'],
+            ents=[(1, 2, 'ORG')]),  # presumably (start, end, label) -- confirm
+    ]
+    json_doc = docs_to_json(0, docs)
+    assert json_doc['id'] == 0  # the id argument is passed through unchanged
+    assert len(json_doc['paragraphs']) == 2  # one paragraph per Doc
+    assert len(json_doc['paragraphs'][0]['sentences']) == 1  # one sentence per Doc
+    assert len(json_doc['paragraphs'][1]['sentences']) == 1
+    assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2
+    assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3