Small fixes to parser

This commit is contained in:
Matthew Honnibal 2017-05-13 17:19:04 -05:00
parent 188c0f6949
commit e6d71e1778
1 changed file with 66 additions and 26 deletions

View File

@ -33,7 +33,7 @@ from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.api import layerize, chain from thinc.api import layerize, chain
from thinc.neural import Affine, Model, Maxout from thinc.neural import BatchNorm, Model, Affine, ELU, ReLu, Maxout
from thinc.neural.ops import NumpyOps from thinc.neural.ops import NumpyOps
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
@ -62,7 +62,8 @@ def set_debug(val):
DEBUG = val DEBUG = val
def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=None): def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=None,
drop=0.):
'''Allow a model to be "primed" by pre-computing input features in bulk. '''Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents, This is used for the parser, where we want to take a batch of documents,
@ -79,16 +80,17 @@ def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=Non
we can do all our hard maths up front, packed into large multiplications, we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU. and do the hard-to-program parsing on the CPU.
''' '''
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=0.) gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
cdef np.ndarray cached cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray): if not isinstance(gpu_cached, numpy.ndarray):
cached = gpu_cached.get(stream=cuda_stream) cached = gpu_cached.get(stream=cuda_stream)
else: else:
cached = gpu_cached cached = gpu_cached
nF = gpu_cached.shape[1] nF = gpu_cached.shape[1]
nO = gpu_cached.shape[2]
nP = gpu_cached.shape[3] nP = gpu_cached.shape[3]
ops = lower_model.ops ops = lower_model.ops
features = numpy.zeros((batch_size, cached.shape[2], nP), dtype='f') features = numpy.zeros((batch_size, nO, nP), dtype='f')
synchronized = False synchronized = False
def forward(token_ids, drop=0.): def forward(token_ids, drop=0.):
@ -108,7 +110,7 @@ def get_greedy_model_for_batch(batch_size, tokvecs, lower_model, cuda_stream=Non
cdef int[:, ::1] ids = token_ids cdef int[:, ::1] ids = token_ids
_sum_features(<float*>&feats[0,0,0], _sum_features(<float*>&feats[0,0,0],
<float*>cached.data, &ids[0,0], <float*>cached.data, &ids[0,0],
token_ids.shape[0], nF, cached.shape[2]*nP) token_ids.shape[0], nF, nO*nP)
if nP >= 2: if nP >= 2:
best, which = ops.maxout(features) best, which = ops.maxout(features)
@ -155,13 +157,16 @@ def get_batch_loss(TransitionSystem moves, states, golds, float[:, ::1] scores):
cdef int i cdef int i
is_valid = <int*>mem.alloc(moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), moves.n_moves), dtype='f') cdef np.ndarray d_scores = numpy.zeros((len(states), moves.n_moves), dtype='f',
order='c')
c_d_scores = <float*>d_scores.data c_d_scores = <float*>d_scores.data
for i, (state, gold) in enumerate(zip(states, golds)): for i, (state, gold) in enumerate(zip(states, golds)):
memset(is_valid, 0, moves.n_moves * sizeof(int)) memset(is_valid, 0, moves.n_moves * sizeof(int))
memset(costs, 0, moves.n_moves * sizeof(float)) memset(costs, 0, moves.n_moves * sizeof(float))
moves.set_costs(is_valid, costs, state, gold) moves.set_costs(is_valid, costs, state, gold)
cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]) cpu_log_loss(c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1])
#cpu_regression_loss(c_d_scores,
# costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1] c_d_scores += d_scores.shape[1]
return d_scores return d_scores
@ -231,7 +236,7 @@ def init_states(TransitionSystem moves, docs):
def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0): def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
cdef StateClass state cdef StateClass state
cdef int n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR) cdef int n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
ids = numpy.zeros((len(states), n_tokens), dtype='i') ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
if offsets is None: if offsets is None:
offsets = [0] * len(states) offsets = [0] * len(states)
for i, (state, offset) in enumerate(zip(states, offsets)): for i, (state, offset) in enumerate(zip(states, offsets)):
@ -240,6 +245,24 @@ def extract_token_ids(states, offsets=None, nF=1, nB=0, nS=2, nL=0, nR=0):
return ids return ids
# Module-level counter of forward passes through `print_mean_variance`. Each
# forward pass takes the next number so that the statistics printed during its
# backward pass can be matched with the forward pass they belong to.
_n_iter = 0


@layerize
def print_mean_variance(X, drop=0.):
    '''Debugging layer: print mean/variance statistics and pass data through.

    On the forward pass the mean and variance of `X` along axis 0 are printed,
    tagged "M" and "V". On the backward pass the same statistics are printed
    for the incoming gradient, tagged "dM" and "dV". Both the activations and
    the gradient are passed through unchanged.

    X: Input array. It is reduced along axis 0 and the result is iterated
        over, so it needs at least two dimensions.
    drop: Accepted for interface compatibility with other layers; unused.
    RETURNS: A tuple `(X, backward)`, where `backward(dX, sgd=None)` prints
        the gradient statistics and returns `dX`.
    '''
    global _n_iter
    _n_iter += 1
    # Freeze the counter value for this call: by the time `backward` runs,
    # `_n_iter` may already have been advanced by later forward passes.
    fwd_iter = _n_iter

    def report(mean_tag, var_tag, values):
        # Shared printing logic for the forward and backward statistics.
        means = values.mean(axis=0)
        variances = values.var(axis=0)
        print(fwd_iter, mean_tag, ', '.join(('%.2f' % m) for m in means))
        print(fwd_iter, var_tag, ', '.join(('%.2f' % v) for v in variances))

    report("M", "V", X)

    def backward(dX, sgd=None):
        report("dM", "dV", dX)
        # The gradient must be handed back to the caller; without this the
        # callback returns None and the layers before this one receive no
        # gradient to backpropagate.
        return dX
    return X, backward
cdef class Parser: cdef class Parser:
""" """
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
@ -301,13 +324,14 @@ cdef class Parser:
nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR) nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( upper = chain(
Maxout(token_vector_width), Maxout(hidden_width, hidden_width),
zero_init(Affine(self.moves.n_moves, token_vector_width))) #print_mean_variance,
zero_init(Affine(self.moves.n_moves, hidden_width)))
assert isinstance(upper.ops, NumpyOps) assert isinstance(upper.ops, NumpyOps)
lower = PrecomputableMaxouts(token_vector_width, nF=nr_context_tokens, nI=token_vector_width, lower = PrecomputableMaxouts(hidden_width, nF=nr_context_tokens, nI=token_vector_width,
pieces=cfg.get('maxout_pieces', 1)) pieces=cfg.get('maxout_pieces', 1))
upper.begin_training(upper.ops.allocate((500, token_vector_width)))
lower.begin_training(lower.ops.allocate((500, token_vector_width))) lower.begin_training(lower.ops.allocate((500, token_vector_width)))
upper.begin_training(upper.ops.allocate((500, hidden_width)))
return upper, lower return upper, lower
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
@ -390,13 +414,15 @@ cdef class Parser:
def update(self, docs_tokvecs, golds, drop=0., sgd=None): def update(self, docs_tokvecs, golds, drop=0., sgd=None):
cdef: cdef:
int nC int nC
int[500] is_valid # Hack for now
Doc doc Doc doc
StateClass state StateClass state
np.ndarray scores np.ndarray scores
docs, tokvecs = docs_tokvecs docs, tokvecs = docs_tokvecs
cuda_stream = Stream() cuda_stream = Stream()
lower_model = get_greedy_model_for_batch(len(docs),
tokvecs, self.feature_maps, cuda_stream=cuda_stream,
drop=drop)
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
return self.update(([docs], tokvecs), [golds], drop=drop) return self.update(([docs], tokvecs), [golds], drop=drop)
for gold in golds: for gold in golds:
@ -407,33 +433,47 @@ cdef class Parser:
todo = zip(states, offsets, golds) todo = zip(states, offsets, golds)
todo = filter(lambda sp: not sp[0].py_is_final(), todo) todo = filter(lambda sp: not sp[0].py_is_final(), todo)
lower_model = get_greedy_model_for_batch(len(todo), cdef Pool mem = Pool()
tokvecs, self.feature_maps, cuda_stream=cuda_stream) is_valid = <int*>mem.alloc(len(states) * self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(len(states) * self.moves.n_moves, sizeof(float))
upper_model = self.model upper_model = self.model
d_tokens = self.feature_maps.ops.allocate(tokvecs.shape) d_tokens = self.feature_maps.ops.allocate(tokvecs.shape)
backprops = [] backprops = []
n_tokens = tokvecs.shape[0] n_tokens = tokvecs.shape[0]
nF = self.feature_maps.nF nF = self.feature_maps.nF
while todo: loss = 0.
total = 1e-4
follow_gold = False
while len(todo) >= 4:
states, offsets, golds = zip(*todo) states, offsets, golds = zip(*todo)
token_ids = extract_token_ids(states, offsets=offsets) token_ids = extract_token_ids(states, offsets=offsets)
lower, bp_lower = lower_model(token_ids) lower, bp_lower = lower_model(token_ids, drop=drop)
scores, bp_scores = upper_model.begin_update(lower) scores, bp_scores = upper_model.begin_update(lower, drop=drop)
d_scores = get_batch_loss(self.moves, states, golds, scores) d_scores = get_batch_loss(self.moves, states, golds, scores)
loss += numpy.abs(d_scores).sum()
total += d_scores.shape[0]
d_lower = bp_scores(d_scores, sgd=sgd) d_lower = bp_scores(d_scores, sgd=sgd)
gpu_tok_ids = cupy.ndarray(token_ids.shape, dtype='i') if isinstance(tokvecs, cupy.ndarray):
gpu_d_lower = cupy.ndarray(d_lower.shape, dtype='f') gpu_tok_ids = cupy.ndarray(token_ids.shape, dtype='i', order='C')
gpu_tok_ids.set(token_ids, stream=cuda_stream) gpu_d_lower = cupy.ndarray(d_lower.shape, dtype='f', order='C')
gpu_d_lower.set(d_lower, stream=cuda_stream) gpu_tok_ids.set(token_ids, stream=cuda_stream)
backprops.append((gpu_tok_ids, gpu_d_lower, bp_lower)) gpu_d_lower.set(d_lower, stream=cuda_stream)
backprops.append((gpu_tok_ids, gpu_d_lower, bp_lower))
else:
backprops.append((token_ids, d_lower, bp_lower))
c_scores = <float*>scores.data c_scores = <float*>scores.data
for state in states: for state, gold in zip(states, golds):
self.moves.set_valid(is_valid, state.c) if follow_gold:
guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1]) self.moves.set_costs(is_valid, costs, state, gold)
guess = arg_max_if_gold(c_scores, costs, is_valid, scores.shape[1])
else:
self.moves.set_valid(is_valid, state.c)
guess = arg_max_if_valid(c_scores, is_valid, scores.shape[1])
action = self.moves.c[guess] action = self.moves.c[guess]
action.do(state.c, action.label) action.do(state.c, action.label)
c_scores += scores.shape[1] c_scores += scores.shape[1]
@ -451,7 +491,7 @@ cdef class Parser:
else: else:
self.model.ops.xp.add.at(d_tokens, self.model.ops.xp.add.at(d_tokens,
token_ids, d_state_features * active_feats) token_ids, d_state_features * active_feats)
return d_tokens return d_tokens, loss / total
def step_through(self, Doc doc, GoldParse gold=None): def step_through(self, Doc doc, GoldParse gold=None):
""" """