Check for docs with 2+ sentences in debug-data (#4467)

This commit is contained in:
adrianeboyd 2019-10-18 10:59:16 +02:00 committed by Matthew Honnibal
parent 258eb9e064
commit 135e3de531
1 changed file with 10 additions and 0 deletions

View File

@@ -360,6 +360,16 @@ def debug_data(
)
)
# check for documents with multiple sentences
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
if sents_per_doc < 1.1:
msg.warn(
"The training data contains {:.2f} sentences per "
"document. When there are very few documents containing more "
"than one sentence, the parser will not learn how to segment "
"longer texts into sentences.".format(sents_per_doc)
)
# profile labels
labels_train = [label for label in gold_train_data["deps"]]
labels_train_unpreprocessed = [