From b643cb3d5c777a02343bc115b2cfc2e74fe2f68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jun 2015 02:42:08 +0200 Subject: [PATCH] * Allow training documents to be filtered in gold.pyx --- spacy/gold.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index fe53fdb8a..f3ed33d10 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -121,7 +121,7 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] -def read_json_file(loc): +def read_json_file(loc, docs_filter=None): print loc if path.isdir(loc): for filename in os.listdir(loc): @@ -130,6 +130,8 @@ def read_json_file(loc): with open(loc) as file_: docs = ujson.load(file_) for doc in docs: + if docs_filter is not None and not docs_filter(doc): + continue paragraphs = [] for paragraph in doc['paragraphs']: sents = []