Remove code related to differentiable BLEU loss
This code is unused, has a questionable license, was never updated to Torch 1.0, and seems unlikely to work.
This commit is contained in:
parent 61b152cd50
commit 58355e0ccc
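The deleted modules (expectedBLEU, expectedMultiBleu, matrixBLEU) computed an expected, differentiable BLEU from the decoder's per-step token probabilities rather than from hard predictions. As a rough illustration of that idea only, here is a minimal unigram-level sketch; the function name, tensor shapes, and smoothing constant are assumptions for the example and do not reproduce the removed implementation:

import torch

def expected_unigram_precision(probs, reference, smooth=1.0):
    # probs: [translation_len x vocab] softmax outputs per decoding step (assumed shape)
    # reference: [ref_len] tensor of target token ids
    expected_counts = probs.sum(dim=0)                      # expected count of each vocab item in the candidate
    ref_counts = torch.bincount(reference, minlength=probs.size(1)).float()
    clipped = torch.min(expected_counts, ref_counts).sum()  # clip by reference counts, as in modified precision
    return (clipped + smooth) / (probs.size(0) + smooth)    # smoothed so the log below is always defined

# Usage: the negative log of the expected precision is differentiable w.r.t. probs.
probs = torch.softmax(torch.randn(6, 100, requires_grad=True), dim=-1)
reference = torch.tensor([3, 14, 15, 9, 2, 6])
loss = -torch.log(expected_unigram_precision(probs, reference))
loss.backward()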
@@ -129,9 +129,7 @@ def parse(argv):
    parser.add_argument('--skip_cache', action='store_true', dest='skip_cache_bool', help='whether to use exisiting cached splits or generate new ones')
    parser.add_argument('--lr_rate', default=0.001, type=float, help='initial_learning_rate')
    parser.add_argument('--use_bleu_loss', action='store_true', help='whether to use differentiable BLEU loss or not')
    parser.add_argument('--use_maxmargin_loss', action='store_true', help='whether to use max-margin loss or not')
    parser.add_argument('--loss_switch', default=0.666, type=float, help='switch to BLEU loss after certain iterations controlled by this ratio')
    parser.add_argument('--small_glove', action='store_true', help='Use glove.6B.50d instead of glove.840B.300d')
    parser.add_argument('--almond_type_embeddings', action='store_true', help='Add type-based word embeddings for Almond task')
    parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning')
@@ -28,18 +28,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import math
import numpy as np
import json

import torch
from torch import nn
from torch.nn import functional as F
from collections import defaultdict

from ..util import get_trainable_params, set_seed
from ..modules import expectedBLEU, expectedMultiBleu, matrixBLEU

from .common import *
@@ -203,18 +194,7 @@ class MultitaskQuestionAnsweringNetwork(nn.Module):
                                          oov_to_limited_idx)

            if self.args.use_bleu_loss and iteration >= self.args.loss_switch * max(self.args.train_iterations):
                max_order = 4
                targets = answer_indices[:, 1:].contiguous()
                batch_size = targets.size(0)
                reference_lengths = [l-1 for l in answer_lengths]
                translation_len = max(reference_lengths)
                translation_lengths = torch.tensor([translation_len] * batch_size, device=self.device)

                bleu_loss_smoothed = expectedMultiBleu.bleu(probs, targets, translation_lengths, reference_lengths, max_order=max_order, smooth=True)
                loss = -1 * bleu_loss_smoothed[0]

            elif self.args.use_maxmargin_loss:
            if self.args.use_maxmargin_loss:
                targets = answer_indices[:, 1:].contiguous()
                loss = max_margin_loss(probs, targets, pad_idx=pad_idx)
@@ -1,53 +0,0 @@
import numpy as np
import torch
from torch.autograd import Variable
from .utils import CUDA_wrapper
from functools import reduce
from .utils import LongTensor, FloatTensor
import time


def one_hots(zeros, ix):
    for i in range(zeros.size()[0]):
        zeros[i, ix[i]] = 1
    return zeros

def overlap(t, r_hot, r, f, temp, n):
    """ calculate overlap as in original BLEU script but expected.
    see google's nmt bleu.py BLEU script for details """
    t_soft = f(t / temp)
    length = t.size()[0]
    v_size = t.size()[1]
    from_ref = list([i.data[0] for i in r])
    from_ref_t = LongTensor(from_ref)
    mapper_ref = {j:i for i, j in enumerate(from_ref)}
    res = CUDA_wrapper(Variable(FloatTensor([0])))
    M = [[from_ref[i + j] for j in range(n)] for i in range(len(from_ref) - n + 1)]
    mul = lambda x, y: x * y
    start_all = time.time()
    for i in range(length - n + 1):
        start_select_t_soft = time.time()
        pp = [t_soft[i + j] for j in range(n)]
        ngram_calc_cum = 0
        for m in M:
            reslicer = lambda x: r.data.shape[0] + x
            ngram_calc_start = time.time()
            y_prod = reduce(mul,
                [r_hot[j:reslicer(-n + 1 + j), m[j]] for j in range(n)])  # j is id of current word in sentense
            y_prod = y_prod.sum(0)
            p_prod = reduce(mul, \
                [t_soft[j:reslicer(-n + 1 + j), m[j]] for j in range(n)])
            denominator = 1 + p_prod.sum(0) - p_prod[i]
            ngram_calc_cum += time.time() - ngram_calc_start
            pr = reduce(mul, [pp[j][m[j]] for j in range(n)])
            res += torch.min(pr, pr * y_prod / denominator)
    return res

def precision(t, r_hot, r, f, temp, n):
    return overlap(t, r_hot, r, f, temp, n) / (t.data.shape[0] - n + 1)

def bleu(t, r_hot, r, f, temp, n):
    precisions = [precision(t, r_hot, r, f, temp, i) for i in range(1, n+1)]
    p_log_sum = sum([(1. / n) * torch.log(p)\
        for p in precisions])
    return torch.exp(p_log_sum)
@@ -1,164 +0,0 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from collections import Counter
from copy import deepcopy as copy_deep
from copy import copy as copy
from .matrixBLEU import mBLEU
from .utils import CUDA_wrapper
from collections import Counter
from .utils import LongTensor, FloatTensor
from functools import reduce
from .utils import CUDA_wrapper
import sys

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

class Reslicer:
    def __init__(self, max_lenght):
        """
        This functor is used to prevent empty reslice
        of index selecting when it appears to be zero
        """
        self.max_l = max_lenght

    def __call__(self, x):
        return self.max_l - x

def ngrams_product(A, n):
    """
    A-is probability matrix
    [batch x length_candidate_translation x reference_len]
    third dimension is reference's words in order of appearance in reference
    n - states for n-grams
    Output: [batch, (length_candidate_translation-n+1) x (reference_len-n+1)]
    """
    max_l = min(A.size()[1:])
    reslicer = Reslicer(max_l)
    if reslicer(n-1) <= 0:
        return None
    cur = A[:, :reslicer(n-1), :reslicer(n-1)].clone()
    for i in range(1, n):
        mul = A[:, i:reslicer(n-1-i), i:reslicer(n-1-i)]
        cur = cur * mul
    return cur

def get_selected_matrices(probs, references, dim=1):
    """
    batched index select
    probs - is a matrix
    references - is index
    dim - is dimention of element of the batch
    """
    # NOTE for loop in index select. Found only this way to do this.
    # It seems that it could be optimized via batched version of index_select
    # but there is no batched_index_select in pytorch for now
    return torch.cat([torch.index_select(a, dim, Variable(LongTensor(i))).unsqueeze(0)\
        for a, i in zip(probs, references)])


def ngram_ref_counts(reference, lengths, n):
    """
    For each position counts n-grams equal to n-gram to this position
    reference - matrix sequences of id's from vocabulary.[batch, ref len]
    NOTE reference should be padded with some special ids
    At least one value in length must be equal reference.shape[1]
    output: counts n-grams for each start position padded with zeros
    """
    res = []
    max_len = max(lengths)
    if max_len - n + 1 <= 0:
        return None
    for r, l in zip(reference, lengths):
        picked = set()  # we only take into account first appearance of n-gram
                        # (which contains its count of occurrence)
        current_length = l - n + 1
        cnt = Counter([tuple([r[i + j] for j in range(n)]) \
            for i in range(current_length)])
        occurrence = []
        for i in range(current_length):
            n_gram = tuple([r[i + j] for j in range(n)])
            val = 0
            if not n_gram in picked:
                val = cnt[n_gram]
                picked.add(n_gram)
            occurrence.append(val)
        padding = [1 for _ in range(max_len - l if current_length > 0\
            else max_len - n + 1)]
        res.append(occurrence + padding)
    return Variable(FloatTensor(res), requires_grad=False)

def calculate_overlap(p, r, n, lengths):
    """
    p - probability tensor [b x len_x x reference_length]
    r - references, tensor [b x len_y]
    contains word's ids for each reference in batch
    n - n-gram
    lenghts - lengths of each reference in batch
    """
    A = ngrams_product(get_selected_matrices(p, r), n)
    r_cnt = ngram_ref_counts(r, lengths, n)
    if A is None or r_cnt is None:
        return CUDA_wrapper(torch.zeros(p.shape[0]))
    r_cnt = r_cnt[:, None]
    A_div = -A + torch.sum(A, 1, keepdim=True) + 1
    second_arg = r_cnt / A_div
    term = torch.min(A, A * second_arg)
    return torch.sum(torch.sum(term, 2), 1)

def bleu(p, r, translation_lengths, reference_lengths, max_order=4, smooth=False):
    """
    p - matrix with probabilityes
    r - reference batch
    reference_lengths - lengths of the references
    max_order - max order of n-gram
    smooth - smooth calculation of precisions
    translation_lengths - torch tensor
    """
    overlaps_list = []
    translation_length = sum(translation_lengths)
    reference_length = sum(reference_lengths)
    for n in range(1, max_order + 1):
        overlaps_list.append(calculate_overlap(p, r, n, reference_lengths))
    overlaps = CUDA_wrapper(torch.stack(overlaps_list))
    matches_by_order = torch.sum(overlaps, 1)
    possible_matches_by_order = CUDA_wrapper(torch.zeros(max_order))
    for n in range(1, max_order + 1):
        cur_pm = translation_lengths.float() - n + 1
        mask = cur_pm > 0
        cur_pm *= mask.float()
        possible_matches_by_order[n - 1] = torch.sum(cur_pm)
    precisions = Variable(FloatTensor([0] * max_order))
    for i in range(max_order):
        if smooth:
            precisions[i] = (matches_by_order[i] + 1) /\
                (possible_matches_by_order[i] + 1)
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = matches_by_order[i] /\
                    possible_matches_by_order[i]
            else:
                precisions[i] = Variable(FloatTensor([0]))
    if torch.min(precisions[:max_order]).item() > 0:
        p_log_sum = sum([(1. / max_order) * torch.log(p) for p in precisions])
        geo_mean = torch.exp(p_log_sum)
    else:
        geo_mean = torch.pow(\
            reduce(lambda x, y: x*y, precisions), 1./max_order)
        eprint('WARNING: some precision(s) is zero')
    ratio = float(translation_length) / reference_length
    if ratio > 1.0:
        bp = 1.0
    else:
        THRESHOLD_RATIO = 1E-1
        MIN_BP = 1E-2
        if ratio > THRESHOLD_RATIO:
            bp = np.exp(1 - 1. / ratio)
        else:
            bp = MIN_BP
    bleu = -geo_mean * bp
    return bleu, precisions
@@ -1,114 +0,0 @@
import torch
from torch.nn import functional
from torch.autograd import Variable
import numpy as np
import os
from functools import reduce
from copy import deepcopy as copy
import time
from .utils import CUDA_wrapper
from .utils import SoftmaxWithTemperature
from .utils import fill_eye_diag

class mBLEU:
    def __init__(self, max_order=4, softmax_temperature=0.001, T_argmax=True,\
            std_temp=False):
        """class implementing straightforwad matrix BLEU computation"""
        self.max_order = max_order
        self.T_argmax = T_argmax
        self.sm = SoftmaxWithTemperature(softmax_temperature)
        self.softmax_regular = torch.nn.Softmax()
        self.std_temp = std_temp

    def __call__(self, R, T, reference_corpus_lens, translation_corpus_lens):
        """
        T[b x t x v]
        R[b x r]
        reference_corpus_lens - list, len=b
        translation_corpus_lens - list, len=b
        """
        max_order = self.max_order
        shapeR = R.data.shape
        shapeT = T.data.shape
        translation_length = sum(translation_corpus_lens)
        reference_length = sum(reference_corpus_lens)
        if self.T_argmax:
            cur_temperature = None
            if self.std_temp:
                cur_temperature = T.std()
                if (np.random.rand(1)[0] > 0.99):
                    print(cur_temperature)
            T = self.sm(T.contiguous().view(-1, shapeT[2]),\
                temperature=cur_temperature).view(shapeT)
        TR = T.bmm(R.transpose(1, 2))
        TT = T.bmm(T.transpose(1, 2))
        # TT = fill_eye_diag(TT)

        reference_len = sum(reference_corpus_lens)
        tanslation_len = sum(translation_corpus_lens)
        matches_by_order = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\
            for i in range(max_order)]
        cur_t = TT
        cur_tr = TR
        all_t = [torch.sum(cur_t, 1)]
        all_tr = [torch.sum(cur_tr, 2)]
        def overlapper(t, tr):
            SMOOTH_CONST = 1E-10
            return torch.sum((torch.min(t, tr) + SMOOTH_CONST) / torch.max(\
                (t + SMOOTH_CONST),CUDA_wrapper(Variable(\
                torch.FloatTensor([1])))), 1)
        overlap = overlapper(all_t[-1], all_tr[-1])
        matches_by_order[0] = torch.sum(overlap)
        possible_matches_by_order = [
            CUDA_wrapper(Variable(torch.FloatTensor([0])))\
            for i in range(max_order)\
        ]
        def update_possible_matches(possible_matches_by_order,\
                translation_corpus_lens, order):
            for transl_len in translation_corpus_lens:
                possible_matches = transl_len - order
                if possible_matches > 0:
                    possible_matches_by_order[order] += possible_matches
        update_possible_matches(possible_matches_by_order,\
            translation_corpus_lens, 0)
        for order in range(1, min(max_order, shapeT[1], shapeR[1])):
            cur_t = TT[:, order:, order:] * cur_t[:, :-1, :-1]
            all_t.append(torch.sum(cur_t, 1))
            cur_tr = TR[:, order:, order:] * cur_tr[:, :-1, :-1]
            all_tr.append(torch.sum(cur_tr, 2))
            overlap = overlapper(all_t[-1], all_tr[-1])
            matches_by_order[order] = torch.sum(overlap)
            update_possible_matches(possible_matches_by_order,\
                translation_corpus_lens, order)

        precisions = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\
            for i in range(max_order)]
        for i in range(0, max_order):
            if possible_matches_by_order[i].data[0] > 0:
                if i > 0:
                    precisions[i] = ((matches_by_order[i].float() + 1)\
                        /( possible_matches_by_order[i] + 1))
                else:
                    precisions[i] = (matches_by_order[i].float()\
                        /possible_matches_by_order[i])
            else:
                precisions[i] = CUDA_wrapper(Variable(torch.FloatTensor([0])))
        if torch.min(torch.stack(precisions)).data[0] > 1E-3:
            p_log_sum = sum([(1. / max_order) * torch.log(p)\
                for p in precisions])
            geo_mean = torch.exp(p_log_sum)
        else:
            geo_mean = torch.pow(\
                reduce(lambda x, y: x*y, precisions), 1./max_order)
        ratio = float(translation_length) / reference_length
        if ratio > 1.0:
            bp = 1.
        else:
            THRESHOLD_RATIO = 1E-1
            MIN_BP = 1E-2
            if ratio > THRESHOLD_RATIO:
                bp = np.exp(1 - 1. / ratio)
            else:
                bp = MIN_BP
        bleu = -geo_mean * bp
        return bleu, precisions
@@ -1,41 +0,0 @@
import torch
from torch.autograd import Variable


if torch.cuda.is_available():
    Tensor = torch.cuda.FloatTensor
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    Tensor = torch.Tensor
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

def CUDA_wrapper(tensor):
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        return tensor.cuda()
    else:
        return tensor

class SoftmaxWithTemperature:
    def __init__(self, temperature):
        """
        formula: softmax(x/temperature)
        """
        self.temperature = temperature
        self.softmax = torch.nn.Softmax()

    def __call__(self, x, temperature=None):
        if not temperature is None:
            return self.softmax(x / temperature)
        else:
            return self.softmax(x / self.temperature)

def fill_eye_diag(a):
    _, s1, s2 = a.data.shape
    dd = Variable(CUDA_wrapper(torch.eye(s1)))
    zero_dd = 1 - dd
    return a * zero_dd + dd