From ea2fc5d45f41b33f6d85af9db3fd025caadd12d2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 21 Feb 2018 16:00:38 +0100
Subject: [PATCH] Improve length and freq cutoffs in parser

---
 spacy/syntax/nn_parser.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 6c7f5354d..35ff02692 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -659,7 +659,7 @@ cdef class Parser:
             _cleanup(beam)
 
 
-    def _init_gold_batch(self, whole_docs, whole_golds):
+    def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=2000):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
@@ -668,7 +668,7 @@ cdef class Parser:
             StateClass state
             Transition action
         whole_states = self.moves.init_batch(whole_docs)
-        max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
+        max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
         max_moves = 0
         states = []
         golds = []
@@ -830,7 +830,7 @@ cdef class Parser:
         if 'model' in cfg:
             self.model = cfg['model']
         gold_tuples = nonproj.preprocess_training_data(gold_tuples,
-                                                       label_freq_cutoff=100)
+                                                       label_freq_cutoff=30)
         actions = self.moves.get_actions(gold_parses=gold_tuples)
         for action, labels in actions.items():
             for label in labels: