From c6395b057a6cd65fe931f5b9b8aece35e94f16d7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 14 Sep 2017 16:18:02 +0200
Subject: [PATCH] Improve parser feature extraction, for missing values

---
 spacy/syntax/_state.pxd    | 13 +++++++++----
 spacy/syntax/nn_parser.pyx | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 3da9e5d4c..9a08691de 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -101,9 +101,10 @@ cdef cppclass StateC:
         elif n == 6:
             if this.B(0) >= 0:
                 ids[0] = this.B(0)
+                ids[1] = this.B(0)-1
             else:
                 ids[0] = -1
-            ids[1] = this.B(0)
+                ids[1] = -1
             ids[2] = this.B(1)
             ids[3] = this.E(0)
             if ids[3] >= 1:
@@ -118,8 +119,12 @@ cdef cppclass StateC:
             # TODO error =/
             pass
         for i in range(n):
+            # Token vectors should be padded, so that there's a vector for
+            # missing values at the start.
             if ids[i] >= 0:
-                ids[i] += this.offset
+                ids[i] += this.offset + 1
+            else:
+                ids[i] = 0
 
     int S(int i) nogil const:
         if i >= this._s_i:
@@ -162,9 +167,9 @@ cdef cppclass StateC:
 
     int E(int i) nogil const:
         if this._e_i <= 0 or this._e_i >= this.length:
-            return 0
+            return -1
         if i < 0 or i >= this._e_i:
-            return 0
+            return -1
         return this._ents[this._e_i - (i+1)].start
 
     int L(int i, int idx) nogil const:
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 552ea4f8f..ad6ed280e 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -394,7 +394,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
-
+        tokvecs = self._pad_tokvecs(tokvecs)
         nr_state = len(docs)
         nr_class = self.moves.n_moves
         nr_dim = tokvecs.shape[1]
@@ -454,6 +454,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+        tokvecs = self._pad_tokvecs(tokvecs)
         cuda_stream = get_cuda_stream()
         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
                                                      cuda_stream, 0.0)
@@ -534,6 +535,8 @@ cdef class Parser:
         tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
         tokvecs = self.model[0].ops.flatten(tokvecs)
 
+        tokvecs = self._pad_tokvecs(tokvecs)
+
         cuda_stream = get_cuda_stream()
 
         states, golds, max_steps = self._init_gold_batch(docs, golds)
@@ -583,6 +586,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
                            backprops, sgd, cuda_stream)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
         if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
@@ -639,10 +643,20 @@ cdef class Parser:
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
        if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs
 
+    def _pad_tokvecs(self, tokvecs):
+        # Add a vector for missing values at the start of tokvecs
+        xp = get_array_module(tokvecs)
+        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
+        return xp.vstack((pad, tokvecs))
+
+    def _unpad_tokvecs(self, d_tokvecs):
+        return d_tokvecs[1:]
+
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
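
A minimal sketch of the idea behind this patch: row 0 of the token-vector matrix is reserved for missing values. E() now reports -1 instead of 0 when there is no entity, so a missing slot can no longer be confused with the first token; valid ids are shifted by this.offset + 1, and ids of -1 are mapped to 0, which points at a prepended zero vector. The backward pass then drops that pad row again (_unpad_tokvecs) so the gradient shapes match the unpadded documents. The NumPy-only sketch below illustrates the mapping under those assumptions; the helper names here are hypothetical and are not spaCy's API.

import numpy

def pad_tokvecs(tokvecs):
    # Prepend one zero row; index 0 now stands for "missing value".
    pad = numpy.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
    return numpy.vstack((pad, tokvecs))

def shift_feature_ids(ids, offset):
    # Mirrors the _state.pxd change: valid ids move up by offset + 1,
    # missing slots (id < 0) are redirected to the zero pad row.
    return numpy.where(ids >= 0, ids + offset + 1, 0)

def unpad_grad(d_tokvecs):
    # Drop the pad row's gradient before unflattening per document.
    return d_tokvecs[1:]

tokvecs = numpy.random.rand(5, 4).astype("float32")
padded = pad_tokvecs(tokvecs)
ids = numpy.array([2, -1, 0])    # -1 marks a missing context token
rows = padded[shift_feature_ids(ids, offset=0)]
assert numpy.all(rows[1] == 0.0) # the missing slot reads the zero vector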