mirror of https://github.com/explosion/spaCy.git
Small fixes to parser
This commit is contained in:
parent
188c0f6949
commit
e6d71e1778
|
@ -33,7 +33,7 @@ from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain
|
from thinc.api import layerize, chain
|
||||||
from thinc.neural import Affine, Model, Maxout
|
from thinc.neural import BatchNorm, Model, Affine, ELU, ReLu, Maxout
|
||||||
from thinc.neural.ops import NumpyOps
|
from thinc.neural.ops import NumpyOps
|
||||||
|
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
|
@ -62,7 +62,8 @@ def set_debug(val):
|
||||||
DEBUG = val
|
DEBUG = val
|
||||||
|
|
||||||
|
|
||||||
def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=None):
|
def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||||
|
drop=0.):
|
||||||
'''Allow a model to be "primed" by pre-computing input features in bulk.
|
'''Allow a model to be "primed" by pre-computing input features in bulk.
|
||||||
|
|
||||||
This is used for the parser, where we want to take a batch of documents,
|
This is used for the parser, where we want to take a batch of documents,
|
||||||
|
@ -79,16 +80,17 @@ def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=Non
|
||||||
we can do all our hard maths up front, packed into large multiplications,
|
we can do all our hard maths up front, packed into large multiplications,
|
||||||
and do the hard-to-program parsing on the CPU.
|
and do the hard-to-program parsing on the CPU.
|
||||||
'''
|
'''
|
||||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=0.)
|
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||||
cdef np.ndarray cached
|
cdef np.ndarray cached
|
||||||
if not isinstance(gpu_cached, numpy.ndarray):
|
if not isinstance(gpu_cached, numpy.ndarray):
|
||||||
cached = gpu_cached.get(stream=cuda_stream)
|
cached = gpu_cached.get(stream=cuda_stream)
|
||||||
else:
|
else:
|
||||||
cached = gpu_cached
|
cached = gpu_cached
|
||||||
nF = gpu_cached.shape[1]
|
nF = gpu_cached.shape[1]
|
||||||
|
nO = gpu_cached.shape[2]
|
||||||
nP = gpu_cached.shape[3]
|
nP = gpu_cached.shape[3]
|
||||||
ops = lower_model.ops
|
ops = lower_model.ops
|
||||||
features = numpy.zeros((batch_size, cached.shape[2], nP), dtype='f')
|
features = numpy.zeros((batch_size, nO, nP), dtype='f')
|
||||||
synchronized = False
|
synchronized = False
|
||||||
|
|
||||||
def forward(token_ids, drop=0.):
|
def forward(token_ids, drop=0.):
|
||||||
|
@ -108,7 +110,7 @@ def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=Non
|
||||||
cdef int[:, ::1] ids = token_ids
|
cdef int[:, ::1] ids = token_ids
|
||||||
_sum_features(<float*>&feats[0,0,0],
|
_sum_features(<float*>&feats[0,0,0],
|
||||||
<float*>cached.data, &ids[0,0],
|
<float*>cached.data, &ids[0,0],
|
||||||
token_ids.shape[0], nF, cached.shape[2]*nP)
|
token_ids.shape[0], nF, nO*nP)
|
||||||
|
|
||||||
if nP >= 2:
|
if nP >= 2:
|
||||||
best, which = ops.maxout(features)
|
best, which = ops.maxout(features)
|
||||||
|
@ -155,13 +157,16 @@ def get_batch_loss(TransitionSystem moves, states, golds, float[:, ::1] scores):
|
||||||
cdef int i
|
cdef int i
|
||||||
is_valid = <int*>mem.alloc(moves.n_moves, sizeof(int))
|
is_valid = <int*>mem.alloc(moves.n_moves, sizeof(int))
|
||||||
costs = <float*>mem.alloc(moves.n_moves, sizeof(float))
|
costs = <float*>mem.alloc(moves.n_moves, sizeof(float))
|
||||||
cdef np.ndarray d_scores = numpy.zeros((len(states), moves.n_moves), dtype='f')
|
cdef np.ndarray d_scores = numpy.zeros((len(states), moves.n_moves), dtype='f',
|
||||||
|
order='c')
|
||||||
c_d_scores = <float*>d_scores.data
|
c_d_scores = <float*>d_scores.data
|
||||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||||
memset(is_valid, 0, moves.n_moves * sizeof(int))
|
memset(is_valid, 0, moves.n_moves * sizeof(int))
|
||||||
memset(costs, 0, moves.n_moves * sizeof(float))
|
memset(costs, 0, moves.n_moves * sizeof(float))
|
||||||
moves.set_costs(is_valid, costs, state, gold)
|
moves.set_costs(is_valid, costs, state, gold)
|
||||||
cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
|
#cpu_regression_loss(c_d_scores,
|
||||||
|
# costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||||
c_d_scores += d_scores.shape[1]
|
c_d_scores += d_scores.shape[1]
|
||||||
return d_scores
|
return d_scores
|
||||||
|
|
||||||
|
@ -231,7 +236,7 @@ def init_states(TransitionSystem moves, docs):
|
||||||
def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
|
def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
cdef int n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
|
cdef int n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
|
||||||
ids = numpy.zeros((len(states), n_tokens), dtype='i')
|
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
|
||||||
if offsets is None:
|
if offsets is None:
|
||||||
offsets = [0] * len(states)
|
offsets = [0] * len(states)
|
||||||
for i, (state, offset) in enumerate(zip(states, offsets)):
|
for i, (state, offset) in enumerate(zip(states, offsets)):
|
||||||
|
@ -240,6 +245,24 @@ def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
|
# Counts how many times print_mean_variance has run its forward pass, so the
# forward and backward print-outs of the same call can be matched up.
_n_iter = 0
@layerize
def print_mean_variance(X, drop=0.):
    '''Debugging layer that prints activation and gradient statistics.

    The input is passed through unchanged. On the forward pass the mean and
    variance of X along axis 0 are printed, tagged "M" and "V". On the
    backward pass the same statistics of the incoming gradient are printed,
    tagged "dM" and "dV". Every line is prefixed with the forward-pass
    counter value, so the two passes of one call can be paired.

    X: Array of activations; must support .mean(axis=0) and .var(axis=0).
    drop: Accepted for interface compatibility; not used here.
    RETURNS: A tuple (X, backward), where backward(dX, sgd=None) prints the
        gradient statistics and returns dX unchanged.
    '''
    global _n_iter
    _n_iter += 1
    # Capture the counter now: the closure below may run after later forward
    # passes have already incremented the global.
    fwd_iter = _n_iter
    means = X.mean(axis=0)
    variance = X.var(axis=0)
    print(fwd_iter, "M", ', '.join(('%.2f' % m) for m in means))
    print(fwd_iter, "V", ', '.join(('%.2f' % v) for v in variance))

    def backward(dX, sgd=None):
        d_means = dX.mean(axis=0)
        d_variance = dX.var(axis=0)
        print(fwd_iter, "dM", ', '.join(('%.2f' % m) for m in d_means))
        print(fwd_iter, "dV", ', '.join(('%.2f' % v) for v in d_variance))
        # This layer is an identity function, so the gradient with respect to
        # its input is the incoming gradient. Previously nothing was returned,
        # which handed None to whatever consumes this callback's result.
        # NOTE(review): presumably thinc's chain() stops back-propagating on a
        # None gradient -- confirm against the thinc version in use.
        return dX

    return X, backward
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser:
|
cdef class Parser:
|
||||||
"""
|
"""
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
|
@ -301,13 +324,14 @@ cdef class Parser:
|
||||||
nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
|
nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
upper = chain(
|
upper = chain(
|
||||||
Maxout(token_vector_width),
|
Maxout(hidden_width, hidden_width),
|
||||||
zero_init(Affine(self.moves.n_moves, token_vector_width)))
|
#print_mean_variance,
|
||||||
|
zero_init(Affine(self.moves.n_moves, hidden_width)))
|
||||||
assert isinstance(upper.ops, NumpyOps)
|
assert isinstance(upper.ops, NumpyOps)
|
||||||
lower = PrecomputableMaxouts(token_vector_width, nF=nr_context_tokens, nI=token_vector_width,
|
lower = PrecomputableMaxouts(hidden_width, nF=nr_context_tokens, nI=token_vector_width,
|
||||||
pieces=cfg.get('maxout_pieces', 1))
|
pieces=cfg.get('maxout_pieces', 1))
|
||||||
upper.begin_training(upper.ops.allocate((500, token_vector_width)))
|
|
||||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||||
|
upper.begin_training(upper.ops.allocate((500, hidden_width)))
|
||||||
return upper, lower
|
return upper, lower
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
|
@ -390,13 +414,15 @@ cdef class Parser:
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None):
|
||||||
cdef:
|
cdef:
|
||||||
int nC
|
int nC
|
||||||
int[500] is_valid # Hack for now
|
|
||||||
Doc doc
|
Doc doc
|
||||||
StateClass state
|
StateClass state
|
||||||
np.ndarray scores
|
np.ndarray scores
|
||||||
|
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
cuda_stream = Stream()
|
cuda_stream = Stream()
|
||||||
|
lower_model = get_greedy_model_for_batch(len(docs),
|
||||||
|
tokvecs, self.feature_maps, cuda_stream=cuda_stream,
|
||||||
|
drop=drop)
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
return self.update(([docs], tokvecs), [golds], drop=drop)
|
return self.update(([docs], tokvecs), [golds], drop=drop)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
|
@ -407,33 +433,47 @@ cdef class Parser:
|
||||||
todo = zip(states, offsets, golds)
|
todo = zip(states, offsets, golds)
|
||||||
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
|
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
|
||||||
|
|
||||||
lower_model = get_greedy_model_for_batch(len(todo),
|
cdef Pool mem = Pool()
|
||||||
tokvecs, self.feature_maps, cuda_stream=cuda_stream)
|
is_valid = <int*>mem.alloc(len(states) * self.moves.n_moves, sizeof(int))
|
||||||
|
costs = <float*>mem.alloc(len(states) * self.moves.n_moves, sizeof(float))
|
||||||
|
|
||||||
upper_model = self.model
|
upper_model = self.model
|
||||||
d_tokens = self.feature_maps.ops.allocate(tokvecs.shape)
|
d_tokens = self.feature_maps.ops.allocate(tokvecs.shape)
|
||||||
backprops = []
|
backprops = []
|
||||||
n_tokens = tokvecs.shape[0]
|
n_tokens = tokvecs.shape[0]
|
||||||
nF = self.feature_maps.nF
|
nF = self.feature_maps.nF
|
||||||
while todo:
|
loss = 0.
|
||||||
|
total = 1e-4
|
||||||
|
follow_gold = False
|
||||||
|
while len(todo) >= 4:
|
||||||
states, offsets, golds = zip(*todo)
|
states, offsets, golds = zip(*todo)
|
||||||
|
|
||||||
token_ids = extract_token_ids(states, offsets=offsets)
|
token_ids = extract_token_ids(states, offsets=offsets)
|
||||||
lower, bp_lower = lower_model(token_ids)
|
lower, bp_lower = lower_model(token_ids, drop=drop)
|
||||||
scores, bp_scores = upper_model.begin_update(lower)
|
scores, bp_scores = upper_model.begin_update(lower, drop=drop)
|
||||||
|
|
||||||
d_scores = get_batch_loss(self.moves, states, golds, scores)
|
d_scores = get_batch_loss(self.moves, states, golds, scores)
|
||||||
|
loss += numpy.abs(d_scores).sum()
|
||||||
|
total += d_scores.shape[0]
|
||||||
d_lower = bp_scores(d_scores, sgd=sgd)
|
d_lower = bp_scores(d_scores, sgd=sgd)
|
||||||
|
|
||||||
gpu_tok_ids = cupy.ndarray(token_ids.shape, dtype='i')
|
if isinstance(tokvecs, cupy.ndarray):
|
||||||
gpu_d_lower = cupy.ndarray(d_lower.shape, dtype='f')
|
gpu_tok_ids = cupy.ndarray(token_ids.shape, dtype='i', order='C')
|
||||||
gpu_tok_ids.set(token_ids, stream=cuda_stream)
|
gpu_d_lower = cupy.ndarray(d_lower.shape, dtype='f', order='C')
|
||||||
gpu_d_lower.set(d_lower, stream=cuda_stream)
|
gpu_tok_ids.set(token_ids, stream=cuda_stream)
|
||||||
backprops.append((gpu_tok_ids, gpu_d_lower, bp_lower))
|
gpu_d_lower.set(d_lower, stream=cuda_stream)
|
||||||
|
backprops.append((gpu_tok_ids, gpu_d_lower, bp_lower))
|
||||||
|
else:
|
||||||
|
backprops.append((token_ids, d_lower, bp_lower))
|
||||||
|
|
||||||
c_scores = <float*>scores.data
|
c_scores = <float*>scores.data
|
||||||
for state in states:
|
for state, gold in zip(states, golds):
|
||||||
self.moves.set_valid(is_valid, state.c)
|
if follow_gold:
|
||||||
guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
|
self.moves.set_costs(is_valid, costs, state, gold)
|
||||||
|
guess = arg_max_if_gold(c_scores, costs, is_valid, scores.shape[1])
|
||||||
|
else:
|
||||||
|
self.moves.set_valid(is_valid, state.c)
|
||||||
|
guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
|
||||||
action = self.moves.c[guess]
|
action = self.moves.c[guess]
|
||||||
action.do(state.c, action.label)
|
action.do(state.c, action.label)
|
||||||
c_scores += scores.shape[1]
|
c_scores += scores.shape[1]
|
||||||
|
@ -451,7 +491,7 @@ cdef class Parser:
|
||||||
else:
|
else:
|
||||||
self.model.ops.xp.add.at(d_tokens,
|
self.model.ops.xp.add.at(d_tokens,
|
||||||
token_ids, d_state_features * active_feats)
|
token_ids, d_state_features * active_feats)
|
||||||
return d_tokens
|
return d_tokens, loss / total
|
||||||
|
|
||||||
def step_through(self, Doc doc, GoldParse gold=None):
|
def step_through(self, Doc doc, GoldParse gold=None):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue