* Fix efficiency of JSON reading by using ujson instead of streaming with ijson

This commit is contained in:
Matthew Honnibal 2015-05-30 17:54:52 +02:00
parent 6bbdcc5db5
commit 9e39a206da
1 changed file with 25 additions and 21 deletions

View File

@ -2,6 +2,7 @@ import numpy
import codecs
import json
import ijson
import ujson
import random
import re
import os
@ -96,32 +97,35 @@ def _min_edit_path(cand_words, gold_words):
def read_json_file(loc):
    """Yield (raw_text, sentences) tuples from a JSON annotation file.

    If ``loc`` is a directory, recurse into every file it contains.
    Otherwise parse the file as a JSON list of documents, where each
    document holds ``paragraphs`` -> ``sentences`` -> ``tokens``.

    Each yielded tuple is ``(raw, sents)`` where ``raw`` is the
    paragraph's raw text (or None if absent) and ``sents`` is a list of
    ``((ids, words, tags, heads, labels, ner), brackets)`` tuples.
    Paragraphs with no sentences are skipped.
    """
    # Prefer ujson for speed (the point of this change), but fall back
    # to the stdlib json module so the function works without it.
    try:
        import ujson as json_parser
    except ImportError:
        import json as json_parser
    if path.isdir(loc):
        for filename in os.listdir(loc):
            yield from read_json_file(path.join(loc, filename))
    else:
        # Load the whole file eagerly (faster than ijson streaming),
        # and close the handle before iterating the parsed documents.
        with open(loc) as file_:
            docs = json_parser.load(file_)
        for doc in docs:
            for paragraph in doc['paragraphs']:
                sents = []
                for sent in paragraph['sentences']:
                    words = []
                    ids = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for i, token in enumerate(sent['tokens']):
                        words.append(token['orth'])
                        ids.append(i)
                        tags.append(token['tag'])
                        # Heads are stored as offsets relative to the
                        # token; convert to absolute sentence indices.
                        heads.append(token['head'] + i)
                        labels.append(token['dep'])
                        # NER tag is optional; '-' marks "no annotation".
                        ner.append(token.get('ner', '-'))
                    sents.append((
                        (ids, words, tags, heads, labels, ner),
                        sent.get('brackets', [])))
                if sents:
                    yield (paragraph.get('raw', None), sents)