mirror of https://github.com/explosion/spaCy.git
Improve parser feature extraction, for missing values
This commit is contained in:
parent daf869ab3b
commit c6395b057a
@@ -101,9 +101,10 @@ cdef cppclass StateC:
         elif n == 6:
             if this.B(0) >= 0:
                 ids[0] = this.B(0)
+                ids[1] = this.B(0)-1
             else:
                 ids[0] = -1
-                ids[1] = this.B(0)
+                ids[1] = -1
             ids[2] = this.B(1)
             ids[3] = this.E(0)
             if ids[3] >= 1:
@@ -118,8 +119,12 @@ cdef cppclass StateC:
             # TODO error =/
             pass
         for i in range(n):
+            # Token vectors should be padded, so that there's a vector for
+            # missing values at the start.
             if ids[i] >= 0:
-                ids[i] += this.offset
+                ids[i] += this.offset + 1
+            else:
+                ids[i] = 0

     int S(int i) nogil const:
         if i >= this._s_i:
@@ -162,9 +167,9 @@ cdef cppclass StateC:

     int E(int i) nogil const:
         if this._e_i <= 0 or this._e_i >= this.length:
-            return 0
+            return -1
         if i < 0 or i >= this._e_i:
-            return 0
+            return -1
         return this._ents[this._e_i - (i+1)].start

     int L(int i, int idx) nogil const:
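Commentary on the StateC hunks above (not part of the diff): empty state slots now consistently report -1, and the feature extractor maps -1 to index 0 while shifting real token indices up by one (ids[i] += this.offset + 1), so that index 0 can address the extra missing-value vector the parser prepends to the flattened token-vector table. A minimal NumPy sketch of that indexing convention, using made-up data and an offset of 0 (a single document):

    import numpy as np

    tokvecs = np.random.rand(5, 4)                      # 5 tokens, 4 dims (made-up data)
    padded = np.vstack([np.zeros((1, 4)), tokvecs])     # row 0 = missing-value vector

    state_ids = [2, -1, 0]                              # -1 marks an empty slot, as E() now returns
    rows = [i + 1 if i >= 0 else 0 for i in state_ids]  # the shift applied above
    features = padded[rows]                             # empty slots pick up the zero row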
@@ -394,7 +394,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+        tokvecs = self._pad_tokvecs(tokvecs)
         nr_state = len(docs)
         nr_class = self.moves.n_moves
         nr_dim = tokvecs.shape[1]
@@ -454,6 +454,7 @@ cdef class Parser:
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+        tokvecs = self._pad_tokvecs(tokvecs)
         cuda_stream = get_cuda_stream()
         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
                                                      cuda_stream, 0.0)
@@ -534,6 +535,8 @@ cdef class Parser:
         tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
         tokvecs = self.model[0].ops.flatten(tokvecs)

+        tokvecs = self._pad_tokvecs(tokvecs)
+
         cuda_stream = get_cuda_stream()

         states, golds, max_steps = self._init_gold_batch(docs, golds)
@@ -583,6 +586,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
             backprops, sgd, cuda_stream)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
         if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
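Commentary on the hunk above (not part of the diff): since the forward pass in this method padded tokvecs, the flattened gradient d_tokvecs carries one extra leading row for the pad vector; _unpad_tokvecs drops it before ops.unflatten splits the gradient back into per-document arrays of the original lengths. A small sketch with made-up shapes:

    import numpy as np

    # Gradients for two docs of lengths 3 and 2, plus the leading pad row.
    d_tokvecs = np.random.rand(1 + 3 + 2, 4)
    d_tokvecs = d_tokvecs[1:]                  # what _unpad_tokvecs does
    doc_grads = np.split(d_tokvecs, [3])       # roughly what ops.unflatten does here
    assert [g.shape[0] for g in doc_grads] == [3, 2]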
@@ -639,10 +643,20 @@ cdef class Parser:
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
+        d_tokvecs = self._unpad_tokvecs(d_tokvecs)
         if USE_FINE_TUNE:
             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs

+    def _pad_tokvecs(self, tokvecs):
+        # Add a vector for missing values at the start of tokvecs
+        xp = get_array_module(tokvecs)
+        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
+        return xp.vstack((pad, tokvecs))
+
+    def _unpad_tokvecs(self, d_tokvecs):
+        return d_tokvecs[1:]
+
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
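A note on the two helpers added at the end (not part of the diff): get_array_module picks numpy or cupy based on where the array lives, so the same padding code should work on CPU and GPU, and the pad row is a fixed zero vector whose accumulated gradient _unpad_tokvecs simply discards. An illustrative round trip with NumPy, under those assumptions:

    import numpy as np

    tokvecs = np.random.rand(4, 3).astype("float32")
    pad = np.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
    padded = np.vstack((pad, tokvecs))            # what _pad_tokvecs builds

    assert np.array_equal(padded[1:], tokvecs)    # _unpad_tokvecs drops the pad row again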