From a1b6add4c8cb2ac6224bd690a6411bf792e6913c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 1 Jul 2020 01:02:58 +0200
Subject: [PATCH] Fix parser gold cutting and gradient normalization

---
 spacy/syntax/nn_parser.pyx | 68 ++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 9949c0ef3..ceaea3c9c 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -265,11 +265,15 @@ cdef class Parser:
         free(is_valid)
 
     def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
+        cdef StateClass state
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
+        if n_examples == 0:
+            return losses
         set_dropout_rate(self.model, drop)
         # Prepare the stepwise model, and get the callback for finishing the batch
         model, backprop_tok2vec = self.model.begin_update(
@@ -280,10 +284,13 @@
             cut_size = self.cfg["update_with_oracle_cut_size"]
             states, golds, max_steps = self._init_gold_batch(
                 examples,
-                max_length=numpy.random.choice(range(20, cut_size))
+                max_length=numpy.random.choice(range(5, cut_size))
             )
         else:
-            states, golds, max_steps = self.moves.init_gold_batch(examples)
+            states, golds, _ = self.moves.init_gold_batch(examples)
+            max_steps = max([len(eg.x) for eg in examples])
+        if not states:
+            return losses
         all_states = list(states)
         states_golds = zip(states, golds)
         for _ in range(max_steps):
@@ -292,6 +299,17 @@
             states, golds = zip(*states_golds)
             scores, backprop = model.begin_update(states)
             d_scores = self.get_batch_loss(states, golds, scores, losses)
+            if self.cfg["normalize_gradients_with_batch_size"]:
+                # We have to be very careful how we do this, because of the way
+                # we cut up the batch: we subdivide long sequences. If we
+                # normalize naively, we end up normalizing by sequence length,
+                # which is bad: states in long sequences would consistently get
+                # smaller gradients. Imagine we have two sequences, one of
+                # length 1000 and one of length 20. If we cut up the 1k
+                # sequence so that we have a "batch" of 50 subsequences, we
+                # don't want the gradients to get 50 times smaller!
+                d_scores /= n_examples
+
             backprop(d_scores)
             # Follow the predicted action
             self.transition_states(states, scores)
@@ -389,8 +407,6 @@
             cpu_log_loss(c_d_scores,
                 costs, is_valid, &scores[i, 0], d_scores.shape[1])
             c_d_scores += d_scores.shape[1]
-        if len(states) and self.cfg["normalize_gradients_with_batch_size"]:
-            d_scores /= len(states)
         if losses is not None:
             losses.setdefault(self.name, 0.)
             losses[self.name] += (d_scores**2).sum()
@@ -503,41 +519,49 @@
         return self
 
     def _init_gold_batch(self, examples, min_length=5, max_length=500):
-        """Make a square batch, of length equal to the shortest doc. A long
+        """Make a square batch, of length equal to the shortest transition
+        sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
         long_doc[:N], and another representing long_doc[N:]."""
         cdef:
+            StateClass start_state
             StateClass state
             Transition action
         all_states = self.moves.init_batch([eg.predicted for eg in examples])
         kept = []
+        max_length_seen = 0
         for state, eg in zip(all_states, examples):
             if self.moves.has_gold(eg) and not state.is_final():
                 gold = self.moves.init_gold(state, eg)
-                kept.append((eg, state, gold))
-        max_length = max(min_length, min(max_length, min([len(eg.x) for eg in examples])))
-        max_moves = 0
+                oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    state.copy(), gold)
+                kept.append((eg, state, gold, oracle_actions))
+                min_length = min(min_length, len(oracle_actions))
+                max_length_seen = max(max_length_seen, len(oracle_actions))
+        if not kept:
+            return [], [], 0
+        max_length = max(min_length, min(max_length, max_length_seen))
         states = []
         golds = []
-        for eg, state, gold in kept:
-            oracle_actions = self.moves.get_oracle_sequence_from_state(
-                state, gold)
-            start = 0
-            while start < len(eg.predicted):
-                state = state.copy()
+        cdef int clas
+        max_moves = 0
+        for eg, state, gold, oracle_actions in kept:
+            for i in range(0, len(oracle_actions), max_length):
+                start_state = state.copy()
                 n_moves = 0
-                while state.B(0) < start and not state.is_final():
-                    action = self.moves.c[oracle_actions.pop(0)]
+                for clas in oracle_actions[i:i+max_length]:
+                    action = self.moves.c[clas]
                     action.do(state.c, action.label)
                     state.c.push_hist(action.clas)
                     n_moves += 1
-                has_gold = self.moves.has_gold(eg, start=start,
-                    end=start+max_length)
-                if not state.is_final() and has_gold:
-                    states.append(state)
+                    if state.is_final():
+                        break
+                max_moves = max(max_moves, n_moves)
+                if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+                    states.append(start_state)
                     golds.append(gold)
                     max_moves = max(max_moves, n_moves)
-                start += min(max_length, len(eg.x)-start)
-            max_moves = max(max_moves, len(oracle_actions))
+                if state.is_final():
+                    break
         return states, golds, max_moves
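
The normalization the patch settles on (divide once by the number of gold examples, in update) can be sanity-checked with the numbers from the comment. The sketch below is illustrative Python, not spaCy code; the sequence lengths and cut size are the values assumed in the comment, not values read from the library.

    # Why `d_scores /= n_examples` instead of dividing by the number of
    # states: cutting long sequences multiplies the state count, so
    # normalizing by len(states) would shrink gradients whenever a batch
    # happens to contain a long document.
    seq_lens = [1000, 20]   # the two sequences assumed in the comment
    cut_size = 20           # assumed cut length: the 1k doc becomes 50 chunks

    n_states = sum(-(-n // cut_size) for n in seq_lens)  # ceil division: 50 + 1 = 51
    n_examples = len(seq_lens)                           # 2

    # Dividing by n_states rather than n_examples would make every gradient
    # about 25x smaller, purely because one document was long.
    print(n_states, n_examples, n_states / n_examples)   # 51 2 25.5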
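In _init_gold_batch, the sampled cut size is then clamped to the oracle-sequence lengths actually observed, via max_length = max(min_length, min(max_length, max_length_seen)). A minimal sketch of that clamp follows; clamp_cut, shortest_seen and longest_seen are hypothetical names, not spaCy identifiers.

    # Keep the sampled cut length within the range of oracle-sequence
    # lengths seen in the batch, with a small floor (default min_length=5).
    def clamp_cut(sampled, shortest_seen, longest_seen, floor=5):
        min_length = min(floor, shortest_seen)  # mirrors min(min_length, len(oracle_actions))
        return max(min_length, min(sampled, longest_seen))

    print(clamp_cut(sampled=80, shortest_seen=3, longest_seen=40))  # 40: capped at the longest sequence
    print(clamp_cut(sampled=10, shortest_seen=3, longest_seen=40))  # 10: already in range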
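The cutting itself replays each oracle action sequence in windows of max_length, snapshotting the state at every window start so that one long document yields several training states. A toy stand-in for that loop is sketched below; cut_sequence is a hypothetical name, and the real code applies each action to a StateClass and keeps start_state copies rather than returning index windows.

    # Windows over the oracle sequence: one (start, chunk) pair per training
    # state that _init_gold_batch would snapshot.
    def cut_sequence(oracle_actions, max_length):
        for i in range(0, len(oracle_actions), max_length):
            yield i, oracle_actions[i:i + max_length]

    # A 7-action sequence with max_length=3 yields three states, anchored
    # at actions 0, 3 and 6 -- the analogue of the start_state copies.
    for start, chunk in cut_sequence(list("SLSRSLD"), 3):
        print(start, chunk)
    # 0 ['S', 'L', 'S']
    # 3 ['R', 'S', 'L']
    # 6 ['D']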