Reload train corpus in debug data after initialize (#8776)

This commit is contained in:
Adriane Boyd 2021-07-21 22:38:40 +02:00 committed by GitHub
parent d48c01a6f7
commit 6bbc2b1956
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 3 deletions

View File

@@ -101,13 +101,14 @@ def debug_data(
# Create the gold corpus to be able to better analyze data
dot_names = [T["train_corpus"], T["dev_corpus"]]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+ nlp.initialize(lambda: train_corpus(nlp))
+ msg.good("Pipeline can be initialized with data")
train_dataset = list(train_corpus(nlp))
dev_dataset = list(dev_corpus(nlp))
msg.good("Corpus is loadable")
- nlp.initialize(lambda: train_dataset)
- msg.good("Pipeline can be initialized with data")
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
gold_train_unpreprocessed_data = _compile_gold(