Remove code related to differentiable BLEU loss

This code is unused, has a questionable license, was never updated
to Torch 1.0, and seems unlikely to work.
This commit is contained in:
Giovanni Campagna 2019-12-10 17:49:12 -08:00
parent 61b152cd50
commit 58355e0ccc
7 changed files with 1 additions and 395 deletions

View File

@ -129,9 +129,7 @@ def parse(argv):
parser.add_argument('--skip_cache', action='store_true', dest='skip_cache_bool', help='whether to use exisiting cached splits or generate new ones')
parser.add_argument('--lr_rate', default=0.001, type=float, help='initial_learning_rate')
parser.add_argument('--use_bleu_loss', action='store_true', help='whether to use differentiable BLEU loss or not')
parser.add_argument('--use_maxmargin_loss', action='store_true', help='whether to use max-margin loss or not')
parser.add_argument('--loss_switch', default=0.666, type=float, help='switch to BLEU loss after certain iterations controlled by this ratio')
parser.add_argument('--small_glove', action='store_true', help='Use glove.6B.50d instead of glove.840B.300d')
parser.add_argument('--almond_type_embeddings', action='store_true', help='Add type-based word embeddings for Almond task')
parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning')

View File

@ -28,18 +28,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import math
import numpy as np
import json
import torch
from torch import nn
from torch.nn import functional as F
from collections import defaultdict
from ..util import get_trainable_params, set_seed
from ..modules import expectedBLEU, expectedMultiBleu, matrixBLEU
from .common import *
@ -203,18 +194,7 @@ class MultitaskQuestionAnsweringNetwork(nn.Module):
oov_to_limited_idx)
if self.args.use_bleu_loss and iteration >= self.args.loss_switch * max(self.args.train_iterations):
max_order = 4
targets = answer_indices[:, 1:].contiguous()
batch_size = targets.size(0)
reference_lengths = [l-1 for l in answer_lengths]
translation_len = max(reference_lengths)
translation_lengths = torch.tensor([translation_len] * batch_size, device=self.device)
bleu_loss_smoothed = expectedMultiBleu.bleu(probs, targets, translation_lengths, reference_lengths, max_order=max_order, smooth=True)
loss = -1 * bleu_loss_smoothed[0]
elif self.args.use_maxmargin_loss:
if self.args.use_maxmargin_loss:
targets = answer_indices[:, 1:].contiguous()
loss = max_margin_loss(probs, targets, pad_idx=pad_idx)

View File

@ -1,53 +0,0 @@
import numpy as np
import torch
from torch.autograd import Variable
from .utils import CUDA_wrapper
from functools import reduce
from .utils import LongTensor, FloatTensor
import time
def one_hots(zeros, ix):
    """Write a single 1 into every row of ``zeros`` and return it.

    Row ``k`` receives the 1 in column ``ix[k]``. ``zeros`` is modified in
    place; the returned object is the same one that was passed in.
    """
    n_rows = zeros.size()[0]
    for row in range(n_rows):
        hot_column = ix[row]
        zeros[row, hot_column] = 1
    return zeros
def overlap(t, r_hot, r, f, temp, n):
    """ calculate overlap as in original BLEU script but expected.
    see google's nmt bleu.py BLEU script for details

    t - translation scores; rows are indexed by position and columns by
        word id (assumed [length x vocabulary] -- TODO confirm with caller)
    r_hot - reference matrix indexed the same way as ``t`` (presumably the
        one-hot encoding of ``r`` -- TODO confirm)
    r - reference word ids, one single-element tensor per position
    f - callable applied to ``t / temp`` to obtain the soft translation
    temp - divisor applied to ``t`` before ``f``
    n - n-gram order
    Output: one-element tensor with the accumulated overlap
    """
    t_soft = f(t / temp)
    length = t.size()[0]
    # ``.item()`` reads each id as a Python number. The former ``.data[0]``
    # indexed a 0-dim tensor, which PyTorch >= 0.5 rejects.
    from_ref = [i.item() for i in r]
    ref_len = r.data.shape[0]
    res = CUDA_wrapper(Variable(FloatTensor([0])))
    if length - n + 1 <= 0:
        # No n-gram fits in the translation: nothing to accumulate.
        return res
    ngrams = [[from_ref[i + j] for j in range(n)]
              for i in range(len(from_ref) - n + 1)]

    def mul(x, y):
        return x * y

    def window_product(matrix, ngram):
        # Product over the n shifted column slices selected by the n-gram's
        # word ids; entry k covers positions k .. k + n - 1.
        return reduce(mul, [matrix[j:ref_len - n + 1 + j, ngram[j]]
                            for j in range(n)])

    # These quantities depend only on the n-gram, not on the translation
    # position, so they are computed once instead of once per position.
    per_ngram = []
    for m in ngrams:
        y_count = window_product(r_hot, m).sum(0)
        p_prod = window_product(t_soft, m)
        per_ngram.append((m, y_count, p_prod, p_prod.sum(0)))

    for i in range(length - n + 1):
        pp = [t_soft[i + j] for j in range(n)]
        for m, y_count, p_prod, p_total in per_ngram:
            # Total soft count of this n-gram at every position except i.
            denominator = 1 + p_total - p_prod[i]
            pr = reduce(mul, [pp[j][m[j]] for j in range(n)])
            # Clipping: the contribution never exceeds ``pr`` itself.
            res += torch.min(pr, pr * y_count / denominator)
    return res
def precision(t, r_hot, r, f, temp, n):
    """Return the n-gram overlap divided by the number of n-gram positions in ``t``.

    All arguments are forwarded unchanged to ``overlap``; the divisor is
    ``t.data.shape[0] - n + 1``.
    """
    matched = overlap(t, r_hot, r, f, temp, n)
    n_positions = t.data.shape[0] - n + 1
    return matched / n_positions
def bleu(t, r_hot, r, f, temp, n):
    """Return the geometric mean of the precisions of orders 1..n.

    The mean is computed in log space: each ``log(precision)`` is weighted
    by ``1 / n``, the terms are summed, and the sum is exponentiated.
    Arguments are forwarded to ``precision``.
    """
    weighted_logs = []
    for order in range(1, n + 1):
        order_precision = precision(t, r_hot, r, f, temp, order)
        weighted_logs.append((1. / n) * torch.log(order_precision))
    return torch.exp(sum(weighted_logs))

View File

@ -1,164 +0,0 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from collections import Counter
from copy import deepcopy as copy_deep
from copy import copy as copy
from .matrixBLEU import mBLEU
from .utils import CUDA_wrapper
from collections import Counter
from .utils import LongTensor, FloatTensor
from functools import reduce
from .utils import CUDA_wrapper
import sys
def eprint(*args, **kwargs):
    """Print to standard error, forwarding every argument to ``print``."""
    error_stream = sys.stderr
    print(*args, **kwargs, file=error_stream)
class Reslicer:
    """Callable converting a distance from the end into an absolute index."""

    def __init__(self, max_lenght):
        """
        Remember the length that offsets are measured from.

        The functor exists so that an end offset of zero still produces a
        usable slice bound (the full length) rather than an empty slice.
        """
        self.max_l = max_lenght

    def __call__(self, x):
        """Return ``max_l - x``."""
        upper_bound = self.max_l
        return upper_bound - x
def ngrams_product(A, n):
    """
    Multiply n diagonally shifted windows of A to score n-gram matches.

    A - probability matrix
        [batch x length_candidate_translation x reference_len];
        third dimension is reference's words in order of appearance in
        the reference
    n - n-gram order
    Output: [batch x (length_candidate_translation-n+1) x (reference_len-n+1)]
        where entry [b, i, j] is the product of A[b, i+k, j+k] for
        k = 0..n-1, or None when either length is shorter than n.
    """
    # Each axis is windowed by its own length. Cutting both axes to the
    # shorter of the two silently dropped n-grams whenever the candidate
    # and reference lengths differed.
    rows = A.size(1) - n + 1
    cols = A.size(2) - n + 1
    if rows <= 0 or cols <= 0:
        return None
    # Explicit end bounds (start + window size) avoid the empty slice that
    # a negative-zero end offset would produce.
    cur = A[:, :rows, :cols].clone()
    for i in range(1, n):
        cur = cur * A[:, i:i + rows, i:i + cols]
    return cur
def get_selected_matrices(probs, references, dim=1):
    """
    Batched index select.

    probs - batch of matrices
    references - batch of index sequences, one per matrix
    dim - dimension, within a single batch element, that is indexed
    Output: the per-element selections concatenated along a new leading
        batch dimension
    """
    # NOTE: plain Python loop over the batch; the original authors found no
    # batched index_select in pytorch to replace it.
    selections = []
    for matrix, indices in zip(probs, references):
        index = Variable(LongTensor(indices))
        chosen = torch.index_select(matrix, dim, index)
        selections.append(chosen.unsqueeze(0))
    return torch.cat(selections)
def ngram_ref_counts(reference, lengths, n):
    """
    For each position counts n-grams equal to the n-gram at this position.

    reference - sequences of ids from the vocabulary [batch x ref len];
        a tensor or nested lists
        NOTE reference should be padded with some special ids
    lengths - true length of each reference; at least one value must be
        equal to reference.shape[1]
    n - n-gram order
    Output: float tensor [batch x (max(lengths)-n+1)]. The first position
        of every distinct n-gram holds its number of occurrences, repeated
        positions hold 0, and positions past the end of a reference are
        padded with 1. None when no reference is long enough for an n-gram.
    """
    lengths = [int(length) for length in lengths]
    max_len = max(lengths)
    width = max_len - n + 1
    if width <= 0:
        return None
    res = []
    for row, length in zip(reference, lengths):
        # Work on plain Python scalars: elements taken from a tensor are
        # 0-dim tensors that hash by identity, so equal n-grams would
        # never be merged by the Counter or found in the set below.
        tokens = row.tolist() if hasattr(row, 'tolist') else list(row)
        current_length = length - n + 1
        ngrams = [tuple(tokens[i + j] for j in range(n))
                  for i in range(current_length)]
        cnt = Counter(ngrams)
        picked = set()  # only the first appearance of an n-gram carries
        #                 its count of occurrence
        occurrence = []
        for n_gram in ngrams:
            if n_gram in picked:
                occurrence.append(0)
            else:
                occurrence.append(cnt[n_gram])
                picked.add(n_gram)
        # A reference too short for any n-gram is padded over the full width.
        pad_size = max_len - length if current_length > 0 else width
        res.append(occurrence + [1] * pad_size)
    return Variable(FloatTensor(res), requires_grad=False)
def calculate_overlap(p, r, n, lengths):
    """
    Expected clipped n-gram overlap for every element of the batch.

    p - probability tensor [b x len_x x reference_length]
    r - references, tensor [b x len_y];
        contains word ids for each reference in the batch
    n - n-gram order
    lengths - lengths of each reference in the batch
    Output: one value per batch element; all zeros when no n-gram of this
        order fits
    """
    match_probs = ngrams_product(get_selected_matrices(p, r), n)
    ref_counts = ngram_ref_counts(r, lengths, n)
    if match_probs is None or ref_counts is None:
        return CUDA_wrapper(torch.zeros(p.shape[0]))
    # Insert a singleton axis so the counts broadcast against match_probs.
    ref_counts = ref_counts[:, None]
    # 1 + sum over dim 1 of match_probs, minus the entry itself.
    others_plus_one = -match_probs + torch.sum(match_probs, 1, keepdim=True) + 1
    clip_ratio = ref_counts / others_plus_one
    clipped = torch.min(match_probs, match_probs * clip_ratio)
    return clipped.sum(2).sum(1)
def bleu(p, r, translation_lengths, reference_lengths, max_order=4, smooth=False):
    """
    Compute a negated expected BLEU score for a batch.

    p - matrix with probabilities (forwarded to calculate_overlap)
    r - reference batch (forwarded to calculate_overlap)
    translation_lengths - torch tensor of translation lengths
        (``.float()`` is called on it)
    reference_lengths - lengths of the references
    max_order - max order of n-gram
    smooth - smooth calculation of precisions (adds 1 to both the matches
        and the possible matches of every order)
    Output: tuple ``(bleu, precisions)`` where ``bleu`` is
        ``-geo_mean * bp`` (note the minus sign) and ``precisions`` holds
        one precision per n-gram order.
    """
    overlaps_list = []
    translation_length = sum(translation_lengths)
    reference_length = sum(reference_lengths)
    # One overlap vector (one entry per batch element) per n-gram order.
    for n in range(1, max_order + 1):
        overlaps_list.append(calculate_overlap(p, r, n, reference_lengths))
    overlaps = CUDA_wrapper(torch.stack(overlaps_list))
    # Sum over the batch: total matches for every order.
    matches_by_order = torch.sum(overlaps, 1)
    possible_matches_by_order = CUDA_wrapper(torch.zeros(max_order))
    for n in range(1, max_order + 1):
        # Number of n-gram positions per translation, floored at zero by
        # multiplying with the positivity mask.
        cur_pm = translation_lengths.float() - n + 1
        mask = cur_pm > 0
        cur_pm *= mask.float()
        possible_matches_by_order[n - 1] = torch.sum(cur_pm)
    precisions = Variable(FloatTensor([0] * max_order))
    for i in range(max_order):
        if smooth:
            precisions[i] = (matches_by_order[i] + 1) /\
                (possible_matches_by_order[i] + 1)
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = matches_by_order[i] /\
                    possible_matches_by_order[i]
            else:
                precisions[i] = Variable(FloatTensor([0]))
    # Geometric mean of the precisions: computed in log space when every
    # precision is positive, otherwise as a plain product raised to
    # 1/max_order (log of zero is avoided).
    if torch.min(precisions[:max_order]).item() > 0:
        p_log_sum = sum([(1. / max_order) * torch.log(p) for p in precisions])
        geo_mean = torch.exp(p_log_sum)
    else:
        geo_mean = torch.pow(\
            reduce(lambda x, y: x*y, precisions), 1./max_order)
        eprint('WARNING: some precision(s) is zero')
    # Brevity penalty from the ratio of total translation length to total
    # reference length; raises ZeroDivisionError if reference_length is 0.
    ratio = float(translation_length) / reference_length
    if ratio > 1.0:
        bp = 1.0
    else:
        # Below THRESHOLD_RATIO the penalty is clamped to MIN_BP instead of
        # following exp(1 - 1/ratio).
        THRESHOLD_RATIO = 1E-1
        MIN_BP = 1E-2
        if ratio > THRESHOLD_RATIO:
            bp = np.exp(1 - 1. / ratio)
        else:
            bp = MIN_BP
    bleu = -geo_mean * bp
    return bleu, precisions

View File

@ -1,114 +0,0 @@
import torch
from torch.nn import functional
from torch.autograd import Variable
import numpy as np
import os
from functools import reduce
from copy import deepcopy as copy
import time
from .utils import CUDA_wrapper
from .utils import SoftmaxWithTemperature
from .utils import fill_eye_diag
class mBLEU:
    """Straightforward matrix BLEU computation on soft translations."""

    def __init__(self, max_order=4, softmax_temperature=0.001, T_argmax=True,
                 std_temp=False):
        """
        max_order - highest n-gram order that is scored
        softmax_temperature - default temperature of the softmax applied
            to T
        T_argmax - when true, T is passed through the temperature softmax
            before scoring
        std_temp - when true (and T_argmax is set), T.std() is used as the
            temperature instead of softmax_temperature
        """
        self.max_order = max_order
        self.T_argmax = T_argmax
        self.sm = SoftmaxWithTemperature(softmax_temperature)
        self.softmax_regular = torch.nn.Softmax()
        self.std_temp = std_temp

    def __call__(self, R, T, reference_corpus_lens, translation_corpus_lens):
        """
        T[b x t x v]
        R - batched references; it is transposed on dims 1 and 2 and
            batch-multiplied with T, so it must be 3-D (presumably one-hot
            [b x r x v] -- TODO confirm with caller)
        reference_corpus_lens - list, len=b
        translation_corpus_lens - list, len=b
        Output: tuple (bleu, precisions) where bleu is -geo_mean * bp
            (note the minus sign) and precisions is a list with one
            one-element tensor per n-gram order
        """
        max_order = self.max_order
        shapeR = R.data.shape
        shapeT = T.data.shape
        translation_length = sum(translation_corpus_lens)
        reference_length = sum(reference_corpus_lens)
        if self.T_argmax:
            cur_temperature = None
            if self.std_temp:
                cur_temperature = T.std()
                # Occasional (about 1% of calls) debug print of the
                # temperature in use.
                if np.random.rand(1)[0] > 0.99:
                    print(cur_temperature)
            T = self.sm(T.contiguous().view(-1, shapeT[2]),
                        temperature=cur_temperature).view(shapeT)
        # Translation-vs-reference and translation-vs-translation products.
        TR = T.bmm(R.transpose(1, 2))
        TT = T.bmm(T.transpose(1, 2))

        one = CUDA_wrapper(Variable(torch.FloatTensor([1])))

        def overlapper(t, tr):
            # Ratio min(t, tr) / max(t, 1), summed over dim 1; the constant
            # keeps numerator and denominator strictly positive.
            SMOOTH_CONST = 1E-10
            return torch.sum((torch.min(t, tr) + SMOOTH_CONST)
                             / torch.max(t + SMOOTH_CONST, one), 1)

        def update_possible_matches(possible_matches_by_order,
                                    translation_corpus_lens, order):
            # ``order`` is zero-based: a translation of length L has
            # L - order positions for n-grams of size order + 1.
            for transl_len in translation_corpus_lens:
                possible_matches = transl_len - order
                if possible_matches > 0:
                    possible_matches_by_order[order] += possible_matches

        matches_by_order = [CUDA_wrapper(Variable(torch.FloatTensor([0])))
                            for i in range(max_order)]
        possible_matches_by_order = [
            CUDA_wrapper(Variable(torch.FloatTensor([0])))
            for i in range(max_order)
        ]

        # Order index 0 (unigrams).
        cur_t = TT
        cur_tr = TR
        overlap = overlapper(torch.sum(cur_t, 1), torch.sum(cur_tr, 2))
        matches_by_order[0] = torch.sum(overlap)
        update_possible_matches(possible_matches_by_order,
                                translation_corpus_lens, 0)

        # Higher orders: multiply in the products shifted by one more
        # position along both axes. Orders that do not fit into the
        # sequences keep zero matches and zero possible matches.
        for order in range(1, min(max_order, shapeT[1], shapeR[1])):
            cur_t = TT[:, order:, order:] * cur_t[:, :-1, :-1]
            cur_tr = TR[:, order:, order:] * cur_tr[:, :-1, :-1]
            overlap = overlapper(torch.sum(cur_t, 1), torch.sum(cur_tr, 2))
            matches_by_order[order] = torch.sum(overlap)
            update_possible_matches(possible_matches_by_order,
                                    translation_corpus_lens, order)

        precisions = [CUDA_wrapper(Variable(torch.FloatTensor([0])))
                      for i in range(max_order)]
        for i in range(0, max_order):
            if possible_matches_by_order[i].item() > 0:
                if i > 0:
                    # Add-one smoothing for every order above unigrams.
                    precisions[i] = ((matches_by_order[i].float() + 1)
                                     / (possible_matches_by_order[i] + 1))
                else:
                    precisions[i] = (matches_by_order[i].float()
                                     / possible_matches_by_order[i])
            else:
                precisions[i] = CUDA_wrapper(Variable(torch.FloatTensor([0])))

        # ``.item()`` replaces ``.data[0]``: the global minimum is a 0-dim
        # tensor, and indexing one is an error on PyTorch >= 0.5.
        if torch.min(torch.stack(precisions)).item() > 1E-3:
            p_log_sum = sum([(1. / max_order) * torch.log(p)
                             for p in precisions])
            geo_mean = torch.exp(p_log_sum)
        else:
            # Plain product avoids taking the log of (near-)zero precisions.
            geo_mean = torch.pow(
                reduce(lambda x, y: x * y, precisions), 1. / max_order)

        # Brevity penalty, clamped to MIN_BP for very short translations.
        ratio = float(translation_length) / reference_length
        if ratio > 1.0:
            bp = 1.
        else:
            THRESHOLD_RATIO = 1E-1
            MIN_BP = 1E-2
            if ratio > THRESHOLD_RATIO:
                bp = np.exp(1 - 1. / ratio)
            else:
                bp = MIN_BP
        bleu = -geo_mean * bp
        return bleu, precisions

View File

@ -1,41 +0,0 @@
import torch
from torch.autograd import Variable
# Tensor constructor aliases: CPU types when CUDA is unavailable, CUDA types
# otherwise. ``Tensor`` is the default float tensor type in both cases.
if not torch.cuda.is_available():
    Tensor, FloatTensor = torch.Tensor, torch.FloatTensor
    LongTensor, ByteTensor = torch.LongTensor, torch.ByteTensor
else:
    Tensor = FloatTensor = torch.cuda.FloatTensor
    LongTensor, ByteTensor = torch.cuda.LongTensor, torch.cuda.ByteTensor
def CUDA_wrapper(tensor):
    """Return ``tensor.cuda()`` when CUDA is available, else ``tensor`` itself."""
    if not torch.cuda.is_available():
        return tensor
    return tensor.cuda()
class SoftmaxWithTemperature:
    """Softmax applied to its input divided by a temperature."""

    def __init__(self, temperature):
        """
        Store the default temperature.

        formula: softmax(x/temperature)
        """
        self.softmax = torch.nn.Softmax()
        self.temperature = temperature

    def __call__(self, x, temperature=None):
        """Return softmax(x / temperature); the stored default applies when
        ``temperature`` is None."""
        divisor = self.temperature if temperature is None else temperature
        return self.softmax(x / divisor)
def fill_eye_diag(a):
    """Return ``a`` with the main diagonal of every matrix replaced by 1.

    ``a`` must be 3-D. The identity is built from its second dimension, so
    the last two dimensions need to broadcast against a square matrix of
    that size.
    """
    _, side, _ = a.data.shape
    identity = Variable(CUDA_wrapper(torch.eye(side)))
    off_diagonal = 1 - identity
    # Zero the diagonal of ``a``, then add the ones back in.
    return a * off_diagonal + identity