From c6395b057a6cd65fe931f5b9b8aece35e94f16d7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 14 Sep 2017 16:18:02 +0200
Subject: [PATCH] Improve parser feature extraction, for missing values

---
 spacy/syntax/_state.pxd    | 13 +++++++++----
 spacy/syntax/nn_parser.pyx | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 3da9e5d4c..9a08691de 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -101,9 +101,10 @@ cdef cppclass StateC:
         elif n == 6:
             if this.B(0) >= 0:
                 ids[0] = this.B(0)
+                ids[1] = this.B(0)-1
             else:
                 ids[0] = -1
-            ids[1] = this.B(0)
+                ids[1] = -1
             ids[2] = this.B(1)
             ids[3] = this.E(0)
             if ids[3] >= 1:
@@ -118,8 +119,12 @@ cdef cppclass StateC:
             # TODO error =/
             pass
         for i in range(n):
+            # Token vectors should be padded, so that there's a vector for
+            # missing values at the start.
             if ids[i] >= 0:
-                ids[i] += this.offset
+                ids[i] += this.offset + 1
+            else:
+                ids[i] = 0
 
     int S(int i) nogil const:
         if i >= this._s_i:
@@ -162,9 +167,9 @@ cdef cppclass StateC:
 
     int E(int i) nogil const:
         if this._e_i <= 0 or this._e_i >= this.length:
-            return 0
+            return -1
         if i < 0 or i >= this._e_i:
-            return 0
+            return -1
         return this._ents[this._e_i - (i+1)].start
 
     int L(int i, int idx) nogil const:
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 552ea4f8f..ad6ed280e 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -394,7 +394,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
-
+        tokvecs = self._pad_tokvecs(tokvecs)
         nr_state = len(docs)
         nr_class = self.moves.n_moves
         nr_dim = tokvecs.shape[1]
@@ -454,6 +454,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+        tokvecs = self._pad_tokvecs(tokvecs)
         cuda_stream = get_cuda_stream()
         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
                                                      cuda_stream, 0.0)
@@ -534,6 +535,8 @@ cdef class Parser:
         tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
         tokvecs = self.model[0].ops.flatten(tokvecs)
 
+        tokvecs = self._pad_tokvecs(tokvecs)
+
         cuda_stream = get_cuda_stream()
 
         states, golds, max_steps = self._init_gold_batch(docs, golds)
@@ -583,6 +586,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
                            backprops, sgd, cuda_stream)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
         if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
@@ -639,10 +643,20 @@ cdef class Parser:
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
        if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs
 
+    def _pad_tokvecs(self, tokvecs):
+        # Add a vector for missing values at the start of tokvecs
+        xp = get_array_module(tokvecs)
+        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
+        return xp.vstack((pad, tokvecs))
+
+    def _unpad_tokvecs(self, d_tokvecs):
+        return d_tokvecs[1:]
+
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
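
A minimal sketch of the idea behind this patch: row 0 of the token-vector matrix is reserved for missing values. E() now reports -1 instead of 0 when there is no entity, so a missing slot can no longer be confused with the first token; valid ids are shifted by this.offset + 1, and ids of -1 are mapped to 0, which points at a prepended zero vector. The backward pass then drops that pad row again (_unpad_tokvecs) so the gradient shapes match the unpadded documents. The NumPy-only sketch below illustrates the mapping under those assumptions; the helper names here are hypothetical and are not spaCy's API.

import numpy

def pad_tokvecs(tokvecs):
    # Prepend one zero row; index 0 now stands for "missing value".
    pad = numpy.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
    return numpy.vstack((pad, tokvecs))

def shift_feature_ids(ids, offset):
    # Mirrors the _state.pxd change: valid ids move up by offset + 1,
    # missing slots (id < 0) are redirected to the zero pad row.
    return numpy.where(ids >= 0, ids + offset + 1, 0)

def unpad_grad(d_tokvecs):
    # Drop the pad row's gradient before unflattening per document.
    return d_tokvecs[1:]

tokvecs = numpy.random.rand(5, 4).astype("float32")
padded = pad_tokvecs(tokvecs)
ids = numpy.array([2, -1, 0])    # -1 marks a missing context token
rows = padded[shift_feature_ids(ids, offset=0)]
assert numpy.all(rows[1] == 0.0) # the missing slot reads the zero vector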