Improve parser feature extraction for missing values

2017-09-14 16:18:02 +02:00 · 2017-09-14 16:18:02 +02:00 · c6395b057a
parent daf869ab3b
commit c6395b057a
2 changed files with 24 additions and 5 deletions
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -101,9 +101,10 @@ cdef cppclass StateC:
        elif n == 6:
            if this.B(0) >= 0:
                ids[0] = this.B(0)
+                ids[1] = this.B(0)-1
            else:
                ids[0] = -1
-            ids[1] = this.B(0)
+                ids[1] = -1
            ids[2] = this.B(1)
            ids[3] = this.E(0)
            if ids[3] >= 1:
@ -118,8 +119,12 @@ cdef cppclass StateC:
            # TODO error =/
            pass
        for i in range(n):
+            # Token vectors should be padded, so that there's a vector for
+            # missing values at the start.
            if ids[i] >= 0:
-                ids[i] += this.offset
+                ids[i] += this.offset + 1
+            else:
+                ids[i] = 0

    int S(int i) nogil const:
        if i >= this._s_i:
@ -162,9 +167,9 @@ cdef cppclass StateC:

    int E(int i) nogil const:
        if this._e_i <= 0 or this._e_i >= this.length:
-            return 0
+            return -1
        if i < 0 or i >= this._e_i:
-            return 0
+            return -1
        return this._ents[this._e_i - (i+1)].start

    int L(int i, int idx) nogil const:
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -394,7 +394,7 @@ cdef class Parser:
        tokvecs = self.model[0].ops.flatten(tokvecses)
        if USE_FINE_TUNE:
            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
-
+        tokvecs = self._pad_tokvecs(tokvecs)
        nr_state = len(docs)
        nr_class = self.moves.n_moves
        nr_dim = tokvecs.shape[1]
@ -454,6 +454,7 @@ cdef class Parser:
        tokvecs = self.model[0].ops.flatten(tokvecses)
        if USE_FINE_TUNE:
            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+        tokvecs = self._pad_tokvecs(tokvecs)
        cuda_stream = get_cuda_stream()
        state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
                                                     cuda_stream, 0.0)
@ -534,6 +535,8 @@ cdef class Parser:
            tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
            tokvecs = self.model[0].ops.flatten(tokvecs)

+        tokvecs = self._pad_tokvecs(tokvecs)
+
        cuda_stream = get_cuda_stream()

        states, golds, max_steps = self._init_gold_batch(docs, golds)
@ -583,6 +586,7 @@ cdef class Parser:
                break
        self._make_updates(d_tokvecs,
            backprops, sgd, cuda_stream)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
        d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
        if USE_FINE_TUNE:
            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
@ -639,10 +643,20 @@ cdef class Parser:
        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
        self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
        d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
        if USE_FINE_TUNE:
            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
        return d_tokvecs

+    def _pad_tokvecs(self, tokvecs):
+        # Add a vector for missing values at the start of tokvecs
+        xp = get_array_module(tokvecs)
+        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
+        return xp.vstack((pad, tokvecs))
+
+    def _unpad_tokvecs(self, d_tokvecs):
+        return d_tokvecs[1:]
+
    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,