diff --git a/models/multitask_question_answering_network.py b/models/multitask_question_answering_network.py
index 26031915..4ecb4646 100644
--- a/models/multitask_question_answering_network.py
+++ b/models/multitask_question_answering_network.py
@@ -172,44 +172,43 @@ class MultitaskQuestionAnsweringNetwork(nn.Module):
 
     def greedy(self, self_attended_context, context, question, context_indices, question_indices, oov_to_limited_idx, rnn_state=None):
-        with torch.no_grad():
-            B, TC, C = context.size()
-            T = self.args.max_output_length
-            outs = context.new_full((B, T), self.field.decoder_stoi['<pad>'], dtype=torch.long)
-            hiddens = [self_attended_context[0].new_zeros((B, T, C))
-                       for l in range(len(self.self_attentive_decoder.layers) + 1)]
-            hiddens[0] = hiddens[0] + positional_encodings_like(hiddens[0])
-            eos_yet = context.new_zeros((B, )).byte()
+        B, TC, C = context.size()
+        T = self.args.max_output_length
+        outs = context.new_full((B, T), self.field.decoder_stoi['<pad>'], dtype=torch.long)
+        hiddens = [self_attended_context[0].new_zeros((B, T, C))
+                   for l in range(len(self.self_attentive_decoder.layers) + 1)]
+        hiddens[0] = hiddens[0] + positional_encodings_like(hiddens[0])
+        eos_yet = context.new_zeros((B, )).byte()
 
-            rnn_output, context_alignment, question_alignment = None, None, None
-            for t in range(T):
-                if t == 0:
-                    embedding = self.decoder_embeddings(
-                        self_attended_context[-1].new_full((B, 1), self.field.vocab.stoi['<init>'], dtype=torch.long), [1]*B)
-                else:
-                    embedding = self.decoder_embeddings(outs[:, t - 1].unsqueeze(1), [1]*B)
-                hiddens[0][:, t] = hiddens[0][:, t] + (math.sqrt(self.self_attentive_decoder.d_model) * embedding).squeeze(1)
-                for l in range(len(self.self_attentive_decoder.layers)):
-                    hiddens[l + 1][:, t] = self.self_attentive_decoder.layers[l].feedforward(
-                        self.self_attentive_decoder.layers[l].attention(
-                            self.self_attentive_decoder.layers[l].selfattn(hiddens[l][:, t], hiddens[l][:, :t + 1], hiddens[l][:, :t + 1]),
-                            self_attended_context[l], self_attended_context[l]))
-                decoder_outputs = self.dual_ptr_rnn_decoder(hiddens[-1][:, t].unsqueeze(1),
-                                                            context, question,
-                                                            context_alignment=context_alignment, question_alignment=question_alignment,
-                                                            hidden=rnn_state, output=rnn_output)
-                rnn_output, context_attention, question_attention, context_alignment, question_alignment, vocab_pointer_switch, context_question_switch, rnn_state = decoder_outputs
-                probs = self.probs(self.out, rnn_output, vocab_pointer_switch, context_question_switch,
-                                   context_attention, question_attention,
-                                   context_indices, question_indices,
-                                   oov_to_limited_idx)
-                pred_probs, preds = probs.max(-1)
-                preds = preds.squeeze(1)
-                eos_yet = eos_yet | (preds == self.field.decoder_stoi['<eos>'])
-                outs[:, t] = preds.cpu().apply_(self.map_to_full)
-                if eos_yet.all():
-                    break
-            return outs
+        rnn_output, context_alignment, question_alignment = None, None, None
+        for t in range(T):
+            if t == 0:
+                embedding = self.decoder_embeddings(
+                    self_attended_context[-1].new_full((B, 1), self.field.vocab.stoi['<init>'], dtype=torch.long), [1]*B)
+            else:
+                embedding = self.decoder_embeddings(outs[:, t - 1].unsqueeze(1), [1]*B)
+            hiddens[0][:, t] = hiddens[0][:, t] + (math.sqrt(self.self_attentive_decoder.d_model) * embedding).squeeze(1)
+            for l in range(len(self.self_attentive_decoder.layers)):
+                hiddens[l + 1][:, t] = self.self_attentive_decoder.layers[l].feedforward(
+                    self.self_attentive_decoder.layers[l].attention(
+                        self.self_attentive_decoder.layers[l].selfattn(hiddens[l][:, t], hiddens[l][:, :t + 1], hiddens[l][:, :t + 1]),
+                        self_attended_context[l], self_attended_context[l]))
+            decoder_outputs = self.dual_ptr_rnn_decoder(hiddens[-1][:, t].unsqueeze(1),
+                                                        context, question,
+                                                        context_alignment=context_alignment, question_alignment=question_alignment,
+                                                        hidden=rnn_state, output=rnn_output)
+            rnn_output, context_attention, question_attention, context_alignment, question_alignment, vocab_pointer_switch, context_question_switch, rnn_state = decoder_outputs
+            probs = self.probs(self.out, rnn_output, vocab_pointer_switch, context_question_switch,
+                               context_attention, question_attention,
+                               context_indices, question_indices,
+                               oov_to_limited_idx)
+            pred_probs, preds = probs.max(-1)
+            preds = preds.squeeze(1)
+            eos_yet = eos_yet | (preds == self.field.decoder_stoi['<eos>'])
+            outs[:, t] = preds.cpu().apply_(self.map_to_full)
+            if eos_yet.all():
+                break
+        return outs
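
Note: the hunk above drops the method-level torch.no_grad(), and the hunks below
drop the pre-0.4 Variable/volatile API in favor of tensor factory methods. A
minimal, self-contained sketch of the migration pattern; shapes and pad_idx are
illustrative, not taken from this repo:

    import torch

    B, T, C, pad_idx = 2, 5, 8, 1
    context = torch.randn(B, 7, C)

    # Old idiom: Variable(context.data.new(B, T).long().fill_(pad_idx), volatile=True)
    # New idiom: factory methods inherit device (and, unless overridden, dtype) from `context`.
    outs = context.new_full((B, T), pad_idx, dtype=torch.long)
    hiddens = context.new_zeros((B, T, C))   # replaces .data.new(...).zero_()
    eos_yet = context.new_zeros((B,)).byte()

    # volatile=True is gone; the caller now disables autograd around greedy():
    with torch.no_grad():
        assert outs.shape == (B, T) and outs.dtype == torch.long
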
diff --git a/models/pointer_generator.py b/models/pointer_generator.py
index 7e1950bc..5255c968 100644
--- a/models/pointer_generator.py
+++ b/models/pointer_generator.py
@@ -5,7 +5,6 @@ import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.autograd import Variable
 
 from .common import positional_encodings_like, INF, EPSILON, TransformerEncoder, TransformerDecoder, PackedLSTM, LSTMDecoderAttention, LSTMDecoder, Embedding, Feedforward, mask
 
@@ -101,30 +100,28 @@ class PointerGenerator(nn.Module):
         effective_vocab_size = self.generative_vocab_size + len(oov_to_limited_idx)
         if self.generative_vocab_size < effective_vocab_size:
             size[-1] = effective_vocab_size - self.generative_vocab_size
-            buff = Variable(scaled_p_vocab.data.new(*size).fill_(EPSILON))
+            buff = scaled_p_vocab.new_full(size, EPSILON)
             scaled_p_vocab = torch.cat([scaled_p_vocab, buff], dim=buff.dim()-1)
 
-        p_context_ptr = Variable(scaled_p_vocab.data.new(*scaled_p_vocab.size()).fill_(EPSILON))
+        p_context_ptr = scaled_p_vocab.new_full(scaled_p_vocab.size(), EPSILON)
         p_context_ptr.scatter_add_(p_context_ptr.dim()-1, context_indices.unsqueeze(1).expand_as(context_attention), context_attention)
         scaled_p_context_ptr = (1 - vocab_pointer_switches).expand_as(p_context_ptr) * p_context_ptr
-
-        probs = scaled_p_vocab + scaled_p_context_ptr #+ scaled_p_question_ptr
+        probs = scaled_p_vocab + scaled_p_context_ptr
         return probs
 
     def greedy(self, context, context_indices, oov_to_limited_idx, rnn_state=None):
         B, TC, C = context.size()
         T = self.args.max_output_length
-        outs = Variable(context.data.new(B, T).long().fill_(
-            self.field.decoder_stoi['<pad>']), volatile=True)
+        outs = context.new_full((B, T), self.field.decoder_stoi['<pad>'], dtype=torch.long)
         eos_yet = context.data.new(B).byte().zero_()
 
         rnn_output, context_alignment = None, None
         for t in range(T):
             if t == 0:
-                embedding = self.decoder_embeddings(Variable(
-                    context[-1].data.new(B).long().fill_(
-                        self.field.vocab.stoi['<init>']), volatile=True).unsqueeze(1), [1]*B)
+                embedding = self.decoder_embeddings(
+                    context.new_full((B, 1), self.field.vocab.stoi['<init>'], dtype=torch.long), [1]*B)
+
             else:
                 embedding = self.decoder_embeddings(outs[:, t - 1].unsqueeze(1), [1]*B)
             decoder_outputs = self.dual_ptr_rnn_decoder(embedding, #hiddens[-1][:, t].unsqueeze(1),
@@ -138,8 +135,9 @@ class PointerGenerator(nn.Module):
                                            context_indices, oov_to_limited_idx)
             pred_probs, preds = probs.max(-1)
-            eos_yet = eos_yet | (preds.data == self.field.decoder_stoi['<eos>'])
-            outs[:, t] = Variable(preds.data.cpu().apply_(self.map_to_full), volatile=True)
+            preds = preds.squeeze(1)
+            eos_yet = eos_yet | (preds == self.field.decoder_stoi['<eos>'])
+            outs[:, t] = preds.cpu().apply_(self.map_to_full)
             if eos_yet.all():
                 break
         return outs
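
For reference, a runnable sketch of the scatter_add_ pointer mixture that
PointerGenerator.probs computes after this change; names and shapes (B, T, TC, V)
are illustrative, EPSILON is assumed to match the constant imported from .common,
and the OOV buffer padding is omitted for brevity:

    import torch

    EPSILON = 1e-10  # assumed value of the EPSILON constant

    def mix_vocab_and_pointer(p_vocab, context_attention, context_indices, switch):
        # p_vocab:           (B, T, V)  distribution over the generative vocab
        # context_attention: (B, T, TC) attention over context tokens
        # context_indices:   (B, TC)    vocab id of each context token
        # switch:            (B, T, 1)  p(generate) vs. p(copy)
        scaled_p_vocab = switch.expand_as(p_vocab) * p_vocab
        # new_full replaces Variable(x.data.new(*size).fill_(EPSILON))
        p_context_ptr = scaled_p_vocab.new_full(scaled_p_vocab.size(), EPSILON)
        # Route copy attention mass onto the vocab ids of the context tokens.
        p_context_ptr.scatter_add_(p_context_ptr.dim() - 1,
                                   context_indices.unsqueeze(1).expand_as(context_attention),
                                   context_attention)
        return scaled_p_vocab + (1 - switch).expand_as(p_context_ptr) * p_context_ptr

    with torch.no_grad():
        B, T, TC, V = 2, 3, 5, 11
        p_vocab = torch.softmax(torch.randn(B, T, V), dim=-1)
        attn = torch.softmax(torch.randn(B, T, TC), dim=-1)
        idx = torch.randint(0, V, (B, TC))
        switch = torch.sigmoid(torch.randn(B, T, 1))
        probs = mix_vocab_and_pointer(p_vocab, attn, idx, switch)
        assert torch.allclose(probs.sum(-1), torch.ones(B, T), atol=1e-4)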