From 58355e0ccc5b4d01dfd42159c730ab124cd178d8 Mon Sep 17 00:00:00 2001 From: Giovanni Campagna Date: Tue, 10 Dec 2019 17:49:12 -0800 Subject: [PATCH] Remove code related to differentiable BLEU loss This code is unused, has a questionable license, was never updated to PyTorch 1.0, and seems unlikely to work. --- decanlp/arguments.py | 2 - .../multitask_question_answering_network.py | 22 +-- decanlp/modules/__init__.py | 0 decanlp/modules/expectedBLEU.py | 53 ------ decanlp/modules/expectedMultiBleu.py | 164 ------------------ decanlp/modules/matrixBLEU.py | 114 ------------ decanlp/modules/utils.py | 41 ----- 7 files changed, 1 insertion(+), 395 deletions(-) delete mode 100644 decanlp/modules/__init__.py delete mode 100755 decanlp/modules/expectedBLEU.py delete mode 100644 decanlp/modules/expectedMultiBleu.py delete mode 100755 decanlp/modules/matrixBLEU.py delete mode 100644 decanlp/modules/utils.py diff --git a/decanlp/arguments.py b/decanlp/arguments.py index 8318f41b..96f32a03 100644 --- a/decanlp/arguments.py +++ b/decanlp/arguments.py @@ -129,9 +129,7 @@ def parse(argv): parser.add_argument('--skip_cache', action='store_true', dest='skip_cache_bool', help='whether to use exisiting cached splits or generate new ones') parser.add_argument('--lr_rate', default=0.001, type=float, help='initial_learning_rate') - parser.add_argument('--use_bleu_loss', action='store_true', help='whether to use differentiable BLEU loss or not') parser.add_argument('--use_maxmargin_loss', action='store_true', help='whether to use max-margin loss or not') - parser.add_argument('--loss_switch', default=0.666, type=float, help='switch to BLEU loss after certain iterations controlled by this ratio') parser.add_argument('--small_glove', action='store_true', help='Use glove.6B.50d instead of glove.840B.300d') parser.add_argument('--almond_type_embeddings', action='store_true', help='Add type-based word embeddings for Almond task') parser.add_argument('--use_curriculum', action='store_true', 
help='Use curriculum learning') diff --git a/decanlp/models/multitask_question_answering_network.py b/decanlp/models/multitask_question_answering_network.py index 399510d7..c28b7201 100644 --- a/decanlp/models/multitask_question_answering_network.py +++ b/decanlp/models/multitask_question_answering_network.py @@ -28,18 +28,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os -import math -import numpy as np -import json - -import torch -from torch import nn -from torch.nn import functional as F from collections import defaultdict from ..util import get_trainable_params, set_seed -from ..modules import expectedBLEU, expectedMultiBleu, matrixBLEU from .common import * @@ -203,18 +194,7 @@ class MultitaskQuestionAnsweringNetwork(nn.Module): oov_to_limited_idx) - if self.args.use_bleu_loss and iteration >= self.args.loss_switch * max(self.args.train_iterations): - max_order = 4 - targets = answer_indices[:, 1:].contiguous() - batch_size = targets.size(0) - reference_lengths = [l-1 for l in answer_lengths] - translation_len = max(reference_lengths) - translation_lengths = torch.tensor([translation_len] * batch_size, device=self.device) - - bleu_loss_smoothed = expectedMultiBleu.bleu(probs, targets, translation_lengths, reference_lengths, max_order=max_order, smooth=True) - loss = -1 * bleu_loss_smoothed[0] - - elif self.args.use_maxmargin_loss: + if self.args.use_maxmargin_loss: targets = answer_indices[:, 1:].contiguous() loss = max_margin_loss(probs, targets, pad_idx=pad_idx) diff --git a/decanlp/modules/__init__.py b/decanlp/modules/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/decanlp/modules/expectedBLEU.py b/decanlp/modules/expectedBLEU.py deleted file mode 100755 index ab376113..00000000 --- a/decanlp/modules/expectedBLEU.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np -import torch -from torch.autograd import 
Variable -from .utils import CUDA_wrapper -from functools import reduce -from .utils import LongTensor, FloatTensor -import time - - -def one_hots(zeros, ix): - for i in range(zeros.size()[0]): - zeros[i, ix[i]] = 1 - return zeros - -def overlap(t, r_hot, r, f, temp, n): - """ calculate overlap as in original BLEU script but expected. - see google's nmt bleu.py BLEU script for details """ - t_soft = f(t / temp) - length = t.size()[0] - v_size = t.size()[1] - from_ref = list([i.data[0] for i in r]) - from_ref_t = LongTensor(from_ref) - mapper_ref = {j:i for i, j in enumerate(from_ref)} - res = CUDA_wrapper(Variable(FloatTensor([0]))) - M = [[from_ref[i + j] for j in range(n)] for i in range(len(from_ref) - n + 1)] - mul = lambda x, y: x * y - start_all = time.time() - for i in range(length - n + 1): - start_select_t_soft = time.time() - pp = [t_soft[i + j] for j in range(n)] - ngram_calc_cum = 0 - for m in M: - reslicer = lambda x: r.data.shape[0] + x - ngram_calc_start = time.time() - y_prod = reduce(mul, - [r_hot[j:reslicer(-n + 1 + j), m[j]] for j in range(n)]) # j is id of current word in sentense - y_prod = y_prod.sum(0) - p_prod = reduce(mul, \ - [t_soft[j:reslicer(-n + 1 + j), m[j]] for j in range(n)]) - denominator = 1 + p_prod.sum(0) - p_prod[i] - ngram_calc_cum += time.time() - ngram_calc_start - pr = reduce(mul, [pp[j][m[j]] for j in range(n)]) - res += torch.min(pr, pr * y_prod / denominator) - return res - -def precision(t, r_hot, r, f, temp, n): - return overlap(t, r_hot, r, f, temp, n) / (t.data.shape[0] - n + 1) - -def bleu(t, r_hot, r, f, temp, n): - precisions = [precision(t, r_hot, r, f, temp, i) for i in range(1, n+1)] - p_log_sum = sum([(1. 
/ n) * torch.log(p)\ - for p in precisions]) - return torch.exp(p_log_sum) diff --git a/decanlp/modules/expectedMultiBleu.py b/decanlp/modules/expectedMultiBleu.py deleted file mode 100644 index a1fd9dca..00000000 --- a/decanlp/modules/expectedMultiBleu.py +++ /dev/null @@ -1,164 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torch.autograd import Variable -from collections import Counter -from copy import deepcopy as copy_deep -from copy import copy as copy -from .matrixBLEU import mBLEU -from .utils import CUDA_wrapper -from collections import Counter -from .utils import LongTensor, FloatTensor -from functools import reduce -from .utils import CUDA_wrapper -import sys - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - -class Reslicer: - def __init__(self, max_lenght): - """ - This functor is used to prevent empty reslice - of index selecting when it appears to be zero - """ - self.max_l = max_lenght - - def __call__(self, x): - return self.max_l - x - -def ngrams_product(A, n): - """ - A-is probability matrix - [batch x length_candidate_translation x reference_len] - third dimension is reference's words in order of appearance in reference - n - states for n-grams - Output: [batch, (length_candidate_translation-n+1) x (reference_len-n+1)] - """ - max_l = min(A.size()[1:]) - reslicer = Reslicer(max_l) - if reslicer(n-1) <= 0: - return None - cur = A[:, :reslicer(n-1), :reslicer(n-1)].clone() - for i in range(1, n): - mul = A[:, i:reslicer(n-1-i), i:reslicer(n-1-i)] - cur = cur * mul - return cur - -def get_selected_matrices(probs, references, dim=1): - """ - batched index select - probs - is a matrix - references - is index - dim - is dimention of element of the batch - """ - # NOTE for loop in index select. Found only this way to do this. 
- # It seems that it could be optimized via batched version of index_select - # but there is no batched_index_select in pytorch for now - return torch.cat([torch.index_select(a, dim, Variable(LongTensor(i))).unsqueeze(0)\ - for a, i in zip(probs, references)]) - - -def ngram_ref_counts(reference, lengths, n): - """ - For each position counts n-grams equal to n-gram to this position - reference - matrix sequences of id's from vocabulary.[batch, ref len] - NOTE reference should be padded with some special ids - At least one value in length must be equal reference.shape[1] - output: counts n-grams for each start position padded with zeros - """ - res = [] - max_len = max(lengths) - if max_len - n + 1 <= 0: - return None - for r, l in zip(reference, lengths): - picked = set() # we only take into account first appearance of n-gram - # (which contains its count of occurrence) - current_length = l - n + 1 - cnt = Counter([tuple([r[i + j] for j in range(n)]) \ - for i in range(current_length)]) - occurrence = [] - for i in range(current_length): - n_gram = tuple([r[i + j] for j in range(n)]) - val = 0 - if not n_gram in picked: - val = cnt[n_gram] - picked.add(n_gram) - occurrence.append(val) - padding = [1 for _ in range(max_len - l if current_length > 0\ - else max_len - n+ 1)] - res.append(occurrence + padding) - return Variable(FloatTensor(res), requires_grad=False) - -def calculate_overlap(p, r, n, lengths): - """ - p - probability tensor [b x len_x x reference_length] - r - references, tensor [b x len_y] - contains word's ids for each reference in batch - n - n-gram - lenghts - lengths of each reference in batch - """ - A = ngrams_product(get_selected_matrices(p, r), n) - r_cnt = ngram_ref_counts(r, lengths, n) - if A is None or r_cnt is None: - return CUDA_wrapper(torch.zeros(p.shape[0])) - r_cnt = r_cnt[:, None] - A_div = -A + torch.sum(A, 1, keepdim=True) + 1 - second_arg = r_cnt / A_div - term = torch.min(A, A * second_arg) - return torch.sum(torch.sum(term, 2), 
1) - -def bleu(p, r, translation_lengths, reference_lengths, max_order=4, smooth=False): - """ - p - matrix with probabilityes - r - reference batch - reference_lengths - lengths of the references - max_order - max order of n-gram - smooth - smooth calculation of precisions - translation_lengths - torch tensor - """ - overlaps_list = [] - translation_length = sum(translation_lengths) - reference_length = sum(reference_lengths) - for n in range(1, max_order + 1): - overlaps_list.append(calculate_overlap(p, r, n, reference_lengths)) - overlaps = CUDA_wrapper(torch.stack(overlaps_list)) - matches_by_order = torch.sum(overlaps, 1) - possible_matches_by_order = CUDA_wrapper(torch.zeros(max_order)) - for n in range(1, max_order + 1): - cur_pm = translation_lengths.float() - n + 1 - mask = cur_pm > 0 - cur_pm *= mask.float() - possible_matches_by_order[n - 1] = torch.sum(cur_pm) - precisions = Variable(FloatTensor([0] * max_order)) - for i in range(max_order): - if smooth: - precisions[i] = (matches_by_order[i] + 1) /\ - (possible_matches_by_order[i] + 1) - else: - if possible_matches_by_order[i] > 0: - precisions[i] = matches_by_order[i] /\ - possible_matches_by_order[i] - else: - precisions[i] = Variable(FloatTensor([0])) - if torch.min(precisions[:max_order]).item() > 0: - p_log_sum = sum([(1. / max_order) * torch.log(p) for p in precisions]) - geo_mean = torch.exp(p_log_sum) - else: - geo_mean = torch.pow(\ - reduce(lambda x, y: x*y, precisions), 1./max_order) - eprint('WARNING: some precision(s) is zero') - ratio = float(translation_length) / reference_length - if ratio > 1.0: - bp = 1.0 - else: - THRESHOLD_RATIO = 1E-1 - MIN_BP = 1E-2 - if ratio > THRESHOLD_RATIO: - bp = np.exp(1 - 1. 
/ ratio) - else: - bp = MIN_BP - bleu = -geo_mean * bp - return bleu, precisions diff --git a/decanlp/modules/matrixBLEU.py b/decanlp/modules/matrixBLEU.py deleted file mode 100755 index cf54abde..00000000 --- a/decanlp/modules/matrixBLEU.py +++ /dev/null @@ -1,114 +0,0 @@ -import torch -from torch.nn import functional -from torch.autograd import Variable -import numpy as np -import os -from functools import reduce -from copy import deepcopy as copy -import time -from .utils import CUDA_wrapper -from .utils import SoftmaxWithTemperature -from .utils import fill_eye_diag - -class mBLEU: - def __init__(self, max_order=4, softmax_temperature=0.001, T_argmax=True,\ - std_temp=False): - """class implementing straightforwad matrix BLEU computation""" - self.max_order = max_order - self.T_argmax = T_argmax - self.sm = SoftmaxWithTemperature(softmax_temperature) - self.softmax_regular = torch.nn.Softmax() - self.std_temp = std_temp - - def __call__(self, R, T, reference_corpus_lens, translation_corpus_lens): - """ - T[b x t x v] - R[b x r] - reference_corpus_lens - list, len=b - translation_corpus_lens - list, len=b - """ - max_order = self.max_order - shapeR = R.data.shape - shapeT = T.data.shape - translation_length = sum(translation_corpus_lens) - reference_length = sum(reference_corpus_lens) - if self.T_argmax: - cur_temperature = None - if self.std_temp: - cur_temperature = T.std() - if (np.random.rand(1)[0] > 0.99): - print(cur_temperature) - T = self.sm(T.contiguous().view(-1, shapeT[2]),\ - temperature=cur_temperature).view(shapeT) - TR = T.bmm(R.transpose(1, 2)) - TT = T.bmm(T.transpose(1, 2)) - # TT = fill_eye_diag(TT) - - reference_len = sum(reference_corpus_lens) - tanslation_len = sum(translation_corpus_lens) - matches_by_order = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\ - for i in range(max_order)] - cur_t = TT - cur_tr = TR - all_t = [torch.sum(cur_t, 1)] - all_tr = [torch.sum(cur_tr, 2)] - def overlapper(t, tr): - SMOOTH_CONST = 1E-10 - return 
torch.sum((torch.min(t, tr) + SMOOTH_CONST) / torch.max(\ - (t + SMOOTH_CONST),CUDA_wrapper(Variable(\ - torch.FloatTensor([1])))), 1) - overlap = overlapper(all_t[-1], all_tr[-1]) - matches_by_order[0] = torch.sum(overlap) - possible_matches_by_order = [ - CUDA_wrapper(Variable(torch.FloatTensor([0])))\ - for i in range(max_order)\ - ] - def update_possible_matches(possible_matches_by_order,\ - translation_corpus_lens, order): - for transl_len in translation_corpus_lens: - possible_matches = transl_len - order - if possible_matches > 0: - possible_matches_by_order[order] += possible_matches - update_possible_matches(possible_matches_by_order,\ - translation_corpus_lens, 0) - for order in range(1, min(max_order, shapeT[1], shapeR[1])): - cur_t = TT[:, order:, order:] * cur_t[:, :-1, :-1] - all_t.append(torch.sum(cur_t, 1)) - cur_tr = TR[:, order:, order:] * cur_tr[:, :-1, :-1] - all_tr.append(torch.sum(cur_tr, 2)) - overlap = overlapper(all_t[-1], all_tr[-1]) - matches_by_order[order] = torch.sum(overlap) - update_possible_matches(possible_matches_by_order,\ - translation_corpus_lens, order) - - precisions = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\ - for i in range(max_order)] - for i in range(0, max_order): - if possible_matches_by_order[i].data[0] > 0: - if i > 0: - precisions[i] = ((matches_by_order[i].float() + 1)\ - /( possible_matches_by_order[i] + 1)) - else: - precisions[i] = (matches_by_order[i].float()\ - /possible_matches_by_order[i]) - else: - precisions[i] = CUDA_wrapper(Variable(torch.FloatTensor([0]))) - if torch.min(torch.stack(precisions)).data[0] > 1E-3: - p_log_sum = sum([(1. / max_order) * torch.log(p)\ - for p in precisions]) - geo_mean = torch.exp(p_log_sum) - else: - geo_mean = torch.pow(\ - reduce(lambda x, y: x*y, precisions), 1./max_order) - ratio = float(translation_length) / reference_length - if ratio > 1.0: - bp = 1. - else: - THRESHOLD_RATIO = 1E-1 - MIN_BP = 1E-2 - if ratio > THRESHOLD_RATIO: - bp = np.exp(1 - 1. 
/ ratio) - else: - bp = MIN_BP - bleu = -geo_mean * bp - return bleu, precisions diff --git a/decanlp/modules/utils.py b/decanlp/modules/utils.py deleted file mode 100644 index dd0d7119..00000000 --- a/decanlp/modules/utils.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from torch.autograd import Variable - - -if torch.cuda.is_available(): - Tensor = torch.cuda.FloatTensor - FloatTensor = torch.cuda.FloatTensor - LongTensor = torch.cuda.LongTensor - ByteTensor = torch.cuda.ByteTensor -else: - Tensor = torch.Tensor - FloatTensor = torch.FloatTensor - LongTensor = torch.LongTensor - ByteTensor = torch.ByteTensor - -def CUDA_wrapper(tensor): - use_cuda = torch.cuda.is_available() - if use_cuda: - return tensor.cuda() - else: - return tensor - -class SoftmaxWithTemperature: - def __init__(self, temperature): - """ - formula: softmax(x/temperature) - """ - self.temperature = temperature - self.softmax = torch.nn.Softmax() - - def __call__(self, x, temperature=None): - if not temperature is None: - return self.softmax(x / temperature) - else: - return self.softmax(x / self.temperature) - -def fill_eye_diag(a): - _, s1, s2 = a.data.shape - dd = Variable(CUDA_wrapper(torch.eye(s1))) - zero_dd = 1 - dd - return a * zero_dd + dd