2017-08-12 19:47:45 +00:00
|
|
|
# cython: infer_types=True
|
2017-08-13 10:37:26 +00:00
|
|
|
# cython: profile=True
|
2017-08-12 19:47:45 +00:00
|
|
|
cimport numpy as np
|
|
|
|
import numpy
|
|
|
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
|
|
|
from thinc.extra.search cimport Beam
|
|
|
|
from thinc.extra.search import MaxViolation
|
|
|
|
from thinc.typedefs cimport hash_t, class_t
|
|
|
|
|
|
|
|
from .transition_system cimport TransitionSystem, Transition
|
|
|
|
from .stateclass cimport StateClass
|
|
|
|
from ..gold cimport GoldParse
|
|
|
|
from ..tokens.doc cimport Doc
|
|
|
|
|
|
|
|
|
|
|
|
# These are passed as callbacks to thinc.search.Beam
|
|
|
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    # Beam callback: overwrite the destination state with a copy of the
    # source state, then apply transition number `clas` to that copy.
    # `_moves` is the table of Transition structs indexed by class id.
    cdef StateClass target = <StateClass>_dest
    cdef StateClass origin = <StateClass>_src
    cdef const Transition* table = <const Transition*>_moves
    target.clone(origin)
    table[clas].do(target.c, table[clas].label)
|
|
|
|
|
|
|
|
|
|
|
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
    # Beam callback: report whether the state behind `_state` is final.
    # `extra_args` belongs to the callback signature and is not read here.
    cdef StateClass candidate = <StateClass>_state
    return candidate.is_final()
|
|
|
|
|
|
|
|
|
|
|
|
def _cleanup(Beam beam):
    """Release one reference on the content object held by every state slot
    and every parent slot of `beam`, for slots 0 .. beam.width - 1.

    Py_XDECREF is used, so slots whose content pointer is NULL are skipped.
    """
    for slot in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[slot].content)
        Py_XDECREF(<PyObject*>beam._parents[slot].content)
|
|
|
|
|
|
|
|
|
|
|
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
    # Beam callback (handed to Beam.advance as the state-hashing function).
    # Non-final states hash to whatever the underlying C state reports;
    # every final state hashes to the constant 1. The second argument is
    # part of the callback signature and is ignored.
    cdef StateClass candidate = <StateClass>_state
    if not candidate.c.is_final():
        return candidate.c.hash()
    return 1
|
|
|
|
|
|
|
|
|
|
|
|
cdef class ParserBeam(object):
    """Hold one search Beam per input state and advance them together.

    The object behaves like a sequence of beams: `len(pb)` is the number of
    beams and `pb[i]` is the beam built for `states[i]`.
    """
    # Transition system supplying the moves table, validity and cost functions.
    cdef public TransitionSystem moves
    # The input states, exactly as passed to the constructor.
    cdef public object states
    # Gold parses indexed in parallel with `states`, or None when unavailable.
    cdef public object golds
    # List of Beam objects, one per input state.
    cdef public object beams

    def __init__(self, TransitionSystem moves, states, golds,
                 int width=4, float density=0.001):
        """Create and initialise one beam for each state.

        moves: transition system used to initialise and advance the beams.
        states: iterable of StateClass objects, one per example.
        golds: gold parses indexable in parallel with `states`, or None.
        width: beam width forwarded to Beam.
        density: beam density forwarded to Beam.
        """
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        cdef Beam beam
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
            # Every slot of the fresh beam inherits the offset of the state
            # it was created from.
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
            self.beams.append(beam)

    def __dealloc__(self):
        # Drop the references held by each beam's slots (see _cleanup).
        if self.beams is not None:
            for beam in self.beams:
                if beam is not None:
                    _cleanup(beam)

    @property
    def is_done(self):
        """True when every beam reports that it is done."""
        return all(b.is_done for b in self.beams)

    def __getitem__(self, i):
        """Return the i-th beam."""
        return self.beams[i]

    def __len__(self):
        """Return the number of beams."""
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        """Advance every unfinished beam by one step.

        scores: sequence indexed like the beams; `scores[i]` is the score
            array for beam i. Beams that are already done, or whose score
            array is empty, are left untouched.
        follow_gold: when true, the beam is advanced without a hashing
            callback and _set_costs masks out every move with cost >= 1.
            Requires gold parses.
        """
        cdef Beam beam
        for i, beam in enumerate(self.beams):
            if beam.is_done or not scores[i].size:
                continue
            self._set_scores(beam, scores[i])
            if self.golds is not None:
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
            if follow_gold:
                assert self.golds is not None
                beam.advance(_transition_state, NULL, <void*>self.moves.c)
            else:
                beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
            beam.check_done(_check_final_state, NULL)
            # Grade the finished candidates only when gold parses exist;
            # indexing self.golds while it is None would raise TypeError.
            if beam.is_done and self.golds is not None:
                for j in range(beam.size):
                    if is_gold(<StateClass>beam.at(j), self.golds[i], self.moves.strings):
                        beam._states[j].loss = 0.0
                    elif beam._states[j].loss == 0.0:
                        # A non-gold analysis must not keep a zero loss.
                        beam._states[j].loss = 1.0

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        """Copy `scores` into the beam and refresh move validity.

        Row i of `scores` is read for candidate i of the beam. Final
        candidates are skipped entirely. `scores` must be non-empty because
        the address of its first element is taken.
        """
        cdef float* c_scores = &scores[0, 0]
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.is_final():
                for j in range(beam.nr_class):
                    beam.scores[i][j] = c_scores[i * beam.nr_class + j]
                self.moves.set_valid(beam.is_valid[i], state.c)

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        """Fill in the per-move costs of every non-final candidate.

        With `follow_gold` set, any move whose cost is >= 1 is additionally
        marked invalid so that the beam cannot take it.
        """
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.c.is_final():
                self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
                if follow_gold:
                    for j in range(beam.nr_class):
                        if beam.costs[i][j] >= 1:
                            beam.is_valid[i][j] = 0
|
2017-08-13 07:33:39 +00:00
|
|
|
|
|
|
|
|
2017-08-18 20:23:03 +00:00
|
|
|
def is_gold(StateClass state, GoldParse gold, strings):
    """Return True when the arcs in `state` equal the gold-standard arcs.

    Both sides are collected as sets of (token, head, label) triples and
    compared for equality. Tokens whose `gold.cand_to_gold` entry is None
    are left out of both sets. A predicted token with a zero `dep` id is
    given the label 'ROOT'; otherwise the label is looked up in `strings`.
    """
    guessed = set()
    expected = set()
    for i in range(gold.length):
        gold_index = gold.cand_to_gold[i]
        if gold_index is None:
            continue
        label_id = state.safe_get(i).dep
        label = strings[label_id] if label_id else 'ROOT'
        guessed.add((i, state.H(i), label))
        # Only the id, head and dependency label of the gold annotation
        # tuple take part in the comparison.
        gold_id, _word, _tag, gold_head, gold_label, _ner = gold.orig_annot[gold_index]
        expected.add((gold_id, gold_head, gold_label))
    return expected == guessed
|
|
|
|
|
|
|
|
|
2017-08-12 19:47:45 +00:00
|
|
|
def get_token_ids(states, int n_tokens):
    """Build an int32 array of shape (len(states), n_tokens) of context ids.

    Row r is filled by `states[r].c.set_context_tokens` when that state is
    not final; rows belonging to final states are set to -1 throughout.
    """
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    # Raw pointer to the start of the current row in the C-ordered buffer.
    row_ptr = <int*>ids.data
    for row, state in enumerate(states):
        if state.is_final():
            ids[row] = -1
        else:
            state.c.set_context_tokens(row_ptr, n_tokens)
        # Step to the next row whether or not this one was written.
        row_ptr += ids.shape[1]
    return ids
|
|
|
|
|
2017-08-13 07:33:39 +00:00
|
|
|
# Module-level counter: update_beam increments it by one on every call.
nr_update = 0
|
2017-08-12 22:15:16 +00:00
|
|
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, tokvecs, golds,
                state2vec, vec2scores, drop=0., sgd=None,
                losses=None, int width=4, float density=0.001):
    """Run a predicted beam and a gold-following beam, and compute gradients.

    moves: transition system shared by both beams.
    nr_feature: number of context tokens requested per state.
    max_steps: upper bound on the number of search steps.
    states: initial StateClass objects, one per example.
    tokvecs: accepted for interface compatibility; not read here.
    golds: gold parses, indexed in parallel with `states`.
    state2vec, vec2scores: models exposing `begin_update(inputs, drop=...)`
        that return `(outputs, backprop_callback)`.
    drop: dropout rate forwarded to both models.
    sgd, losses: accepted for interface compatibility; not read here.
    width, density: beam settings for the predicted beam. The gold beam uses
        the same width with density 0.0.

    Returns `(states_d_scores, backprops)`: the per-step score gradients
    produced by get_gradient, and the per-step
    `(token_ids, bp_vectors, bp_scores)` tuples needed for the backward pass.
    """
    global nr_update
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    gbeam = ParserBeam(moves, states, golds,
                       width=width, density=0.0)
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id, history).
        # We keep a different beam map for each step (since we'll have a flat
        # scores array for each step). The beam map will let us take the per-state
        # losses, and compute the gradient for each (step, state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some states may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices.
        # A separate name is used so the `states` argument is not rebound.
        step_states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
        if not step_states:
            # Nothing left to score. The empty map appended above is
            # skipped by get_gradient.
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(step_states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained
        # to follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])

    # Only make updates if we have non-gold states
    histories = [((v.p_hist + v.g_hist) if v.p_hist else []) for v in violns]
    # Named so that it does not overwrite the `losses` argument.
    hist_losses = [((v.p_probs + v.g_probs) if v.p_probs else []) for v in violns]
    states_d_scores = get_gradient(moves.n_moves, beam_maps,
                                   histories, hist_losses)
    assert len(states_d_scores) == len(backprops), (len(states_d_scores), len(backprops))
    return states_d_scores, backprops
|
2017-08-12 19:47:45 +00:00
|
|
|
|
|
|
|
|
2017-08-13 07:33:39 +00:00
|
|
|
def get_states(pbeams, gbeams, beam_map, nr_update):
    """Flatten the non-final states of both beam sets into one list.

    Each state is keyed by `(example id, *history)`. `beam_map` is updated
    in place to map every key to the state's position in the returned list.
    A gold-beam state whose key was already seen in the predicted beam
    reuses that position instead of being appended again.

    An example is skipped (its index arrays stay empty) when
    `pbeam.loss > 0 and pbeam.min_score > gbeam.score`.

    `nr_update` is accepted but not read.

    Returns `(states, p_idx, g_idx)`, where `p_idx[k]` and `g_idx[k]` are
    integer arrays of positions in `states` for example k.
    """
    cdef Beam pbeam, gbeam
    assert len(pbeams) == len(gbeams)
    seen = {}
    states = []
    p_indices = []
    g_indices = []
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        p_rows = []
        g_rows = []
        p_indices.append(p_rows)
        g_indices.append(g_rows)
        if pbeam.loss > 0 and pbeam.min_score > gbeam.score:
            continue
        # Predicted beam: every non-final state gets a fresh row.
        for i in range(pbeam.size):
            state = <StateClass>pbeam.at(i)
            if state.is_final():
                continue
            key = tuple([eg_id] + pbeam.histories[i])
            row = len(states)
            seen[key] = row
            p_rows.append(row)
            states.append(state)
        beam_map.update(seen)
        # Gold beam: share the row of an already-seen state, else add one.
        for i in range(gbeam.size):
            state = <StateClass>gbeam.at(i)
            if state.is_final():
                continue
            key = tuple([eg_id] + gbeam.histories[i])
            if key not in seen:
                row = len(states)
                beam_map[key] = row
                states.append(state)
            else:
                row = seen[key]
            g_rows.append(row)
    p_idx = [numpy.asarray(rows, dtype='i') for rows in p_indices]
    g_idx = [numpy.asarray(rows, dtype='i') for rows in g_indices]
    return states, p_idx, g_idx
|
2017-08-12 19:47:45 +00:00
|
|
|
|
|
|
|
|
2017-08-12 22:15:16 +00:00
|
|
|
def get_gradient(nr_class, beam_maps, histories, losses):
    """
    The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
    Each candidate has a history
    Each beam has multiple candidates
    Each batch has multiple beams
    So history is list of lists of lists of ints

    nr_class: number of columns in every gradient array.
    beam_maps: one dict per step, mapping `(example id, *history)` to the
        row of that state in the step's flat scores array.
    histories: per example, the list of candidate histories.
    losses: per example, the list of losses, parallel to `histories`.

    Returns a list with one float32 array of shape `(rows, nr_class)` per
    non-empty beam map, where `rows` is the largest row index in the map
    plus one. Each loss is divided by the number of examples before being
    added.
    """
    # Empty beam maps get no gradient array. Since `grads` is indexed by
    # step below, this relies on empty maps occurring only after the last
    # step that any history reaches.
    grads = []
    for beam_map in beam_maps:
        if beam_map:
            grads.append(numpy.zeros((max(beam_map.values())+1, nr_class), dtype='f'))
    assert len(histories) == len(losses)
    n_examples = len(histories)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            # The key grows by one action per step, reproducing the
            # (example id, history) key stored in that step's beam map.
            key = tuple([eg_id])
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                # In step j, at state i action clas
                # resulted in loss
                grads[j][i, clas] += loss / n_examples
                key = key + tuple([clas])
    return grads
|
|
|
|
|
|
|
|
|