From 4dc0fc9954ccac2998f2ebd117817566f5584132 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 21 Feb 2018 15:59:22 +0100 Subject: [PATCH] Replace labels that didn't make freq cutoff --- examples/training/conllu.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 2a25a5863..867501844 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -208,6 +208,13 @@ def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev if tag is not None: nlp.tagger.add_label(tag) optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) + # Replace labels that didn't make the frequency cutoff + actions = set(nlp.parser.labels) + label_set = set([act.split('-')[1] for act in actions if '-' in act]) + for gold in golds: + for i, label in enumerate(gold.labels): + if label is not None and label not in label_set: + gold.labels[i] = label.split('||')[0] n_train_words = sum(len(doc) for doc in docs) print(n_train_words) print("Begin training")