mirror of https://github.com/explosion/spaCy.git
Improve parser feature extraction, for missing values
This commit is contained in:
parent
daf869ab3b
commit
c6395b057a
|
@ -101,9 +101,10 @@ cdef cppclass StateC:
|
|||
elif n == 6:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.B(0)-1
|
||||
else:
|
||||
ids[0] = -1
|
||||
ids[1] = this.B(0)
|
||||
ids[1] = -1
|
||||
ids[2] = this.B(1)
|
||||
ids[3] = this.E(0)
|
||||
if ids[3] >= 1:
|
||||
|
@ -118,8 +119,12 @@ cdef cppclass StateC:
|
|||
# TODO error =/
|
||||
pass
|
||||
for i in range(n):
|
||||
# Token vectors should be padded, so that there's a vector for
|
||||
# missing values at the start.
|
||||
if ids[i] >= 0:
|
||||
ids[i] += this.offset
|
||||
ids[i] += this.offset + 1
|
||||
else:
|
||||
ids[i] = 0
|
||||
|
||||
int S(int i) nogil const:
|
||||
if i >= this._s_i:
|
||||
|
@ -162,9 +167,9 @@ cdef cppclass StateC:
|
|||
|
||||
int E(int i) nogil const:
|
||||
if this._e_i <= 0 or this._e_i >= this.length:
|
||||
return 0
|
||||
return -1
|
||||
if i < 0 or i >= this._e_i:
|
||||
return 0
|
||||
return -1
|
||||
return this._ents[this._e_i - (i+1)].start
|
||||
|
||||
int L(int i, int idx) nogil const:
|
||||
|
|
|
@ -394,7 +394,7 @@ cdef class Parser:
|
|||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
|
||||
tokvecs = self._pad_tokvecs(tokvecs)
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
nr_dim = tokvecs.shape[1]
|
||||
|
@ -454,6 +454,7 @@ cdef class Parser:
|
|||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
tokvecs = self._pad_tokvecs(tokvecs)
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
|
@ -534,6 +535,8 @@ cdef class Parser:
|
|||
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||
|
||||
tokvecs = self._pad_tokvecs(tokvecs)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
|
@ -583,6 +586,7 @@ cdef class Parser:
|
|||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
backprops, sgd, cuda_stream)
|
||||
d_tokvecs = self._unpad_tokvecs(d_tokvecs)
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||
if USE_FINE_TUNE:
|
||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
|
@ -639,10 +643,20 @@ cdef class Parser:
|
|||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||
d_tokvecs = self._unpad_tokvecs(d_tokvecs)
|
||||
if USE_FINE_TUNE:
|
||||
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
|
||||
def _pad_tokvecs(self, tokvecs):
    """Return `tokvecs` with a single all-zero row stacked on top.

    The extra row at index 0 is the vector used for missing values; it has
    the same width (`tokvecs.shape[1]`) and dtype as the input rows.
    """
    # Resolve the array library from the input so the zero row is created
    # by the same module that will do the stacking.
    array_lib = get_array_module(tokvecs)
    width = tokvecs.shape[1]
    missing_row = array_lib.zeros((1, width), dtype=tokvecs.dtype)
    return array_lib.vstack((missing_row, tokvecs))
|
||||
|
||||
def _unpad_tokvecs(self, d_tokvecs):
    """Return `d_tokvecs` without its first entry.

    This is the inverse of the row prepended by `_pad_tokvecs`: everything
    from index 1 onwards is kept, index 0 is discarded.
    """
    without_pad_row = d_tokvecs[1:]
    return without_pad_row
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
|
|
Loading…
Reference in New Issue