From 4d8d490547ce6ceee558e398fa349f36914a9d53 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 31 May 2015 01:12:46 +0200
Subject: [PATCH] * Exclude empty sentences in prepare_treebank

---
 bin/prepare_treebank.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py
index 95cb29f5c..d13ef7130 100644
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -60,12 +60,13 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
     else:
         doc['paragraphs'] = []
         for raw_sents in raw_paras:
-            doc['paragraphs'].append(
-                format_para(
-                    ' '.join(raw_sents).replace('', ''),
-                    ptb_sents[i:i+len(raw_sents)],
-                    dep_sents[i:i+len(raw_sents)],
-                    ner_sents[i:i+len(raw_sents)]))
+            para = format_para(
+                        ' '.join(raw_sents).replace('', ''),
+                        ptb_sents[i:i+len(raw_sents)],
+                        dep_sents[i:i+len(raw_sents)],
+                        ner_sents[i:i+len(raw_sents)])
+            if para['sentences']:
+                doc['paragraphs'].append(para)
             i += len(raw_sents)
     return doc
 