Improve max length check in corpus

This commit is contained in:
Matthw Honnibal 2020-07-01 15:16:43 +02:00
parent 2fa56484b2
commit 1f7709e9a6
1 changed file with 13 additions and 13 deletions

View File

@@ -45,22 +45,22 @@ class Corpus:
def make_examples(self, nlp, reference_docs, max_length=0): def make_examples(self, nlp, reference_docs, max_length=0):
for reference in reference_docs: for reference in reference_docs:
if len(reference) >= max_length >= 1: if len(reference) == 0:
if reference.is_sentenced: continue
for ref_sent in reference.sents: elif max_length == 0 or len(reference) < max_length:
eg = Example( yield Example(
nlp.make_doc(ref_sent.text),
ref_sent.as_doc()
)
if len(eg.x):
yield eg
else:
eg = Example(
nlp.make_doc(reference.text), nlp.make_doc(reference.text),
reference reference
) )
if len(eg.x): elif reference.is_sentenced:
yield eg for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
elif max_length == 0 or len(ref_sent) < max_length:
yield Example(
nlp.make_doc(ref_sent.text),
ref_sent.as_doc()
)
def make_examples_gold_preproc(self, nlp, reference_docs): def make_examples_gold_preproc(self, nlp, reference_docs):
for reference in reference_docs: for reference in reference_docs: