Improve parser feature extraction for missing values

This commit is contained in:
Matthew Honnibal 2017-09-14 16:18:02 +02:00
parent daf869ab3b
commit c6395b057a
2 changed files with 24 additions and 5 deletions

View File

@ -101,9 +101,10 @@ cdef cppclass StateC:
elif n == 6: elif n == 6:
if this.B(0) >= 0: if this.B(0) >= 0:
ids[0] = this.B(0) ids[0] = this.B(0)
ids[1] = this.B(0)-1
else: else:
ids[0] = -1 ids[0] = -1
ids[1] = this.B(0) ids[1] = -1
ids[2] = this.B(1) ids[2] = this.B(1)
ids[3] = this.E(0) ids[3] = this.E(0)
if ids[3] >= 1: if ids[3] >= 1:
@ -118,8 +119,12 @@ cdef cppclass StateC:
# TODO error =/ # TODO error =/
pass pass
for i in range(n): for i in range(n):
# Token vectors should be padded, so that there's a vector for
# missing values at the start.
if ids[i] >= 0: if ids[i] >= 0:
ids[i] += this.offset ids[i] += this.offset + 1
else:
ids[i] = 0
int S(int i) nogil const: int S(int i) nogil const:
if i >= this._s_i: if i >= this._s_i:
@ -162,9 +167,9 @@ cdef cppclass StateC:
int E(int i) nogil const: int E(int i) nogil const:
if this._e_i <= 0 or this._e_i >= this.length: if this._e_i <= 0 or this._e_i >= this.length:
return 0 return -1
if i < 0 or i >= this._e_i: if i < 0 or i >= this._e_i:
return 0 return -1
return this._ents[this._e_i - (i+1)].start return this._ents[this._e_i - (i+1)].start
int L(int i, int idx) nogil const: int L(int i, int idx) nogil const:

View File

@ -394,7 +394,7 @@ cdef class Parser:
tokvecs = self.model[0].ops.flatten(tokvecses) tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE: if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
tokvecs = self._pad_tokvecs(tokvecs)
nr_state = len(docs) nr_state = len(docs)
nr_class = self.moves.n_moves nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1] nr_dim = tokvecs.shape[1]
@ -454,6 +454,7 @@ cdef class Parser:
tokvecs = self.model[0].ops.flatten(tokvecses) tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE: if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
tokvecs = self._pad_tokvecs(tokvecs)
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0) cuda_stream, 0.0)
@ -534,6 +535,8 @@ cdef class Parser:
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs = self.model[0].ops.flatten(tokvecs) tokvecs = self.model[0].ops.flatten(tokvecs)
tokvecs = self._pad_tokvecs(tokvecs)
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds) states, golds, max_steps = self._init_gold_batch(docs, golds)
@ -583,6 +586,7 @@ cdef class Parser:
break break
self._make_updates(d_tokvecs, self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream) backprops, sgd, cuda_stream)
d_tokvecs = self._unpad_tokvecs(d_tokvecs)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE: if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
@ -639,10 +643,20 @@ cdef class Parser:
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
d_tokvecs = self._unpad_tokvecs(d_tokvecs)
if USE_FINE_TUNE: if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs return d_tokvecs
def _pad_tokvecs(self, tokvecs):
    """Prepend one all-zero row to the token-vector table.

    Row 0 then serves as the embedding for missing values: state
    features emit id 0 for missing slots and shift real token ids
    up by one, so the zero row is what they look up.
    """
    xp = get_array_module(tokvecs)
    nr_dim = tokvecs.shape[1]
    zero_row = xp.zeros((1, nr_dim), dtype=tokvecs.dtype)
    return xp.concatenate((zero_row, tokvecs), axis=0)
def _unpad_tokvecs(self, d_tokvecs):
return d_tokvecs[1:]
def _init_gold_batch(self, whole_docs, whole_golds): def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N, doc will get multiple states. Let's say we have a doc of length 2*N,