mirror of https://github.com/explosion/spaCy.git
* Exclude empty sentences in prepare_treebank
This commit is contained in:
parent
87d6551d19
commit
4d8d490547
|
@@ -60,12 +60,13 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
|
||||||
else:
|
else:
|
||||||
doc['paragraphs'] = []
|
doc['paragraphs'] = []
|
||||||
for raw_sents in raw_paras:
|
for raw_sents in raw_paras:
|
||||||
doc['paragraphs'].append(
|
para = format_para(
|
||||||
format_para(
|
' '.join(raw_sents).replace('<SEP>', ''),
|
||||||
' '.join(raw_sents).replace('<SEP>', ''),
|
ptb_sents[i:i+len(raw_sents)],
|
||||||
ptb_sents[i:i+len(raw_sents)],
|
dep_sents[i:i+len(raw_sents)],
|
||||||
dep_sents[i:i+len(raw_sents)],
|
ner_sents[i:i+len(raw_sents)])
|
||||||
ner_sents[i:i+len(raw_sents)]))
|
if para['sentences']:
|
||||||
|
doc['paragraphs'].append(para)
|
||||||
i += len(raw_sents)
|
i += len(raw_sents)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue