From 9e39a206dadfb6d396f504ef0b874899143867ef Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 30 May 2015 17:54:52 +0200
Subject: [PATCH] * Fix efficiency of JSON reading by using ujson instead of
 ijson streaming

---
 spacy/gold.pyx | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 7cb9d92ac..52416c06b 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -2,6 +2,7 @@ import numpy
 import codecs
 import json
 import ijson
+import ujson
 import random
 import re
 import os
@@ -96,32 +97,35 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def read_json_file(loc):
+    print loc
     if path.isdir(loc):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            for doc in ijson.items(file_, 'item'):
-                paragraphs = []
-                for paragraph in doc['paragraphs']:
-                    sents = []
-                    for sent in paragraph['sentences']:
-                        words = []
-                        ids = []
-                        tags = []
-                        heads = []
-                        labels = []
-                        ner = []
-                        for i, token in enumerate(sent['tokens']):
-                            words.append(token['orth'])
-                            ids.append(i)
-                            tags.append(token['tag'])
-                            heads.append(token['head'] + i)
-                            labels.append(token['dep'])
-                            ner.append(token.get('ner', '-'))
-                        sents.append((
-                            (ids, words, tags, heads, labels, ner),
-                            sent.get('brackets', [])))
+            docs = ujson.load(file_)
+        for doc in docs:
+            paragraphs = []
+            for paragraph in doc['paragraphs']:
+                sents = []
+                for sent in paragraph['sentences']:
+                    words = []
+                    ids = []
+                    tags = []
+                    heads = []
+                    labels = []
+                    ner = []
+                    for i, token in enumerate(sent['tokens']):
+                        words.append(token['orth'])
+                        ids.append(i)
+                        tags.append(token['tag'])
+                        heads.append(token['head'] + i)
+                        labels.append(token['dep'])
+                        ner.append(token.get('ner', '-'))
+                    sents.append((
+                        (ids, words, tags, heads, labels, ner),
+                        sent.get('brackets', [])))
+                if sents:
                     yield (paragraph.get('raw', None), sents)