From 58355e0ccc5b4d01dfd42159c730ab124cd178d8 Mon Sep 17 00:00:00 2001
From: Giovanni Campagna <gcampagn@cs.stanford.edu>
Date: Tue, 10 Dec 2019 17:49:12 -0800
Subject: [PATCH] Remove code related to differentiable BLEU loss

This code is unused, has a questionable license, was never updated
to Torch 1.0, and seems unlikely to work.
---
 decanlp/arguments.py                          |   2 -
 .../multitask_question_answering_network.py   |  22 +--
 decanlp/modules/__init__.py                   |   0
 decanlp/modules/expectedBLEU.py               |  53 ------
 decanlp/modules/expectedMultiBleu.py          | 164 ------------------
 decanlp/modules/matrixBLEU.py                 | 114 ------------
 decanlp/modules/utils.py                      |  41 -----
 7 files changed, 1 insertion(+), 395 deletions(-)
 delete mode 100644 decanlp/modules/__init__.py
 delete mode 100755 decanlp/modules/expectedBLEU.py
 delete mode 100644 decanlp/modules/expectedMultiBleu.py
 delete mode 100755 decanlp/modules/matrixBLEU.py
 delete mode 100644 decanlp/modules/utils.py

diff --git a/decanlp/arguments.py b/decanlp/arguments.py
index 8318f41b..96f32a03 100644
--- a/decanlp/arguments.py
+++ b/decanlp/arguments.py
@@ -129,9 +129,7 @@ def parse(argv):
 
     parser.add_argument('--skip_cache', action='store_true', dest='skip_cache_bool', help='whether to use exisiting cached splits or generate new ones')
     parser.add_argument('--lr_rate', default=0.001, type=float, help='initial_learning_rate')
-    parser.add_argument('--use_bleu_loss', action='store_true', help='whether to use differentiable BLEU loss or not')
     parser.add_argument('--use_maxmargin_loss', action='store_true', help='whether to use max-margin loss or not')
-    parser.add_argument('--loss_switch', default=0.666, type=float, help='switch to BLEU loss after certain iterations controlled by this ratio')
     parser.add_argument('--small_glove', action='store_true', help='Use glove.6B.50d instead of glove.840B.300d')
     parser.add_argument('--almond_type_embeddings', action='store_true', help='Add type-based word embeddings for Almond task')
     parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning')
diff --git a/decanlp/models/multitask_question_answering_network.py b/decanlp/models/multitask_question_answering_network.py
index 399510d7..c28b7201 100644
--- a/decanlp/models/multitask_question_answering_network.py
+++ b/decanlp/models/multitask_question_answering_network.py
@@ -28,18 +28,9 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import math
-import numpy as np
-import json
-
-import torch
-from torch import nn
-from torch.nn import functional as F
 from collections import defaultdict
 
 from ..util import get_trainable_params, set_seed
-from ..modules import expectedBLEU, expectedMultiBleu, matrixBLEU
 
 from .common import *
 
@@ -203,18 +194,7 @@ class MultitaskQuestionAnsweringNetwork(nn.Module):
                 oov_to_limited_idx)
 
 
-            if self.args.use_bleu_loss and iteration >= self.args.loss_switch * max(self.args.train_iterations):
-                max_order = 4
-                targets = answer_indices[:, 1:].contiguous()
-                batch_size = targets.size(0)
-                reference_lengths = [l-1 for l in answer_lengths]
-                translation_len = max(reference_lengths)
-                translation_lengths = torch.tensor([translation_len] * batch_size, device=self.device)
-
-                bleu_loss_smoothed = expectedMultiBleu.bleu(probs, targets, translation_lengths, reference_lengths, max_order=max_order, smooth=True)
-                loss = -1 * bleu_loss_smoothed[0]
-
-            elif self.args.use_maxmargin_loss:
+            if self.args.use_maxmargin_loss:
                 targets = answer_indices[:, 1:].contiguous()
                 loss = max_margin_loss(probs, targets, pad_idx=pad_idx)
 
diff --git a/decanlp/modules/__init__.py b/decanlp/modules/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/decanlp/modules/expectedBLEU.py b/decanlp/modules/expectedBLEU.py
deleted file mode 100755
index ab376113..00000000
--- a/decanlp/modules/expectedBLEU.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import numpy as np
-import torch
-from torch.autograd import Variable
-from .utils import CUDA_wrapper
-from functools import reduce
-from .utils import LongTensor, FloatTensor
-import time
-
-
-def one_hots(zeros, ix):
-    for i in range(zeros.size()[0]):
-        zeros[i, ix[i]] = 1
-    return zeros
-
-def overlap(t, r_hot, r, f, temp, n):
-    """ calculate overlap as in original BLEU script but expected.
-    see google's nmt bleu.py BLEU script for details """
-    t_soft = f(t / temp)
-    length = t.size()[0]
-    v_size = t.size()[1]
-    from_ref = list([i.data[0] for i in r])
-    from_ref_t = LongTensor(from_ref)
-    mapper_ref = {j:i for i, j in enumerate(from_ref)}
-    res = CUDA_wrapper(Variable(FloatTensor([0])))
-    M = [[from_ref[i + j] for j in range(n)] for i in range(len(from_ref) - n + 1)]
-    mul = lambda x, y: x * y
-    start_all = time.time()
-    for i in range(length - n + 1):
-        start_select_t_soft = time.time()
-        pp = [t_soft[i + j] for j in range(n)]
-        ngram_calc_cum = 0
-        for m in M:
-            reslicer = lambda x: r.data.shape[0] + x
-            ngram_calc_start = time.time()
-            y_prod = reduce(mul,
-                     [r_hot[j:reslicer(-n + 1 + j), m[j]] for j in range(n)]) # j is id of current word in sentense
-            y_prod = y_prod.sum(0)
-            p_prod = reduce(mul, \
-                     [t_soft[j:reslicer(-n + 1 + j), m[j]] for j in range(n)])
-            denominator = 1 + p_prod.sum(0) - p_prod[i]
-            ngram_calc_cum += time.time() - ngram_calc_start
-            pr = reduce(mul, [pp[j][m[j]] for j in range(n)])
-            res += torch.min(pr, pr * y_prod / denominator)
-    return res
-
-def precision(t, r_hot, r, f, temp, n):
-    return overlap(t, r_hot, r, f, temp, n) / (t.data.shape[0] - n + 1)
-
-def bleu(t, r_hot, r, f, temp, n):
-    precisions = [precision(t, r_hot, r, f, temp, i) for i in range(1, n+1)]
-    p_log_sum =  sum([(1. / n) * torch.log(p)\
-                                                for p in precisions])
-    return torch.exp(p_log_sum)
diff --git a/decanlp/modules/expectedMultiBleu.py b/decanlp/modules/expectedMultiBleu.py
deleted file mode 100644
index a1fd9dca..00000000
--- a/decanlp/modules/expectedMultiBleu.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.autograd import Variable
-from collections import Counter
-from copy import deepcopy as copy_deep
-from copy import copy as copy
-from .matrixBLEU import mBLEU
-from .utils import CUDA_wrapper
-from collections import Counter
-from .utils import LongTensor, FloatTensor
-from functools import reduce
-from .utils import CUDA_wrapper
-import sys
-
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-class Reslicer:
-    def __init__(self, max_lenght):
-        """
-        This functor is used to prevent empty reslice
-        of index selecting when it appears to be zero
-        """
-        self.max_l = max_lenght
-
-    def __call__(self, x):
-        return self.max_l - x
-
-def ngrams_product(A, n):
-    """
-    A-is probability matrix
-    [batch x length_candidate_translation x reference_len]
-    third dimension is reference's words in order of appearance in reference
-    n - states for n-grams
-    Output: [batch, (length_candidate_translation-n+1) x (reference_len-n+1)]
-    """
-    max_l = min(A.size()[1:])
-    reslicer = Reslicer(max_l)
-    if reslicer(n-1) <= 0:
-        return None
-    cur = A[:, :reslicer(n-1), :reslicer(n-1)].clone()
-    for i in range(1, n):
-        mul = A[:, i:reslicer(n-1-i), i:reslicer(n-1-i)]
-        cur = cur * mul
-    return cur
-
-def get_selected_matrices(probs, references, dim=1):
-    """
-    batched index select
-    probs - is a matrix
-    references - is index
-    dim - is dimention of element of the batch
-    """
-    # NOTE for loop in index select. Found only this way to do this.
-    # It seems that it could be optimized via batched version of index_select
-    # but there is no batched_index_select in pytorch for now
-    return torch.cat([torch.index_select(a, dim, Variable(LongTensor(i))).unsqueeze(0)\
-                            for a, i in zip(probs, references)])
-
-
-def ngram_ref_counts(reference, lengths, n):
-    """
-    For each position counts n-grams equal to n-gram to this position
-    reference - matrix sequences of id's from vocabulary.[batch, ref len]
-    NOTE reference should be padded with some special ids
-    At least one value in length must be equal reference.shape[1]
-    output: counts n-grams for each start position padded with zeros
-    """
-    res = []
-    max_len = max(lengths)
-    if max_len - n + 1 <= 0:
-        return None
-    for r, l in zip(reference, lengths):
-        picked = set() # we only take into account first appearance of n-gram
-        #             (which contains its count of occurrence)
-        current_length = l - n + 1
-        cnt = Counter([tuple([r[i + j] for j in range(n)]) \
-                        for i in range(current_length)])
-        occurrence = []
-        for i in range(current_length):
-            n_gram = tuple([r[i + j] for j in range(n)])
-            val = 0
-            if not n_gram in picked:
-                val = cnt[n_gram]
-                picked.add(n_gram)
-            occurrence.append(val)
-        padding = [1 for _ in range(max_len - l if current_length > 0\
-                                                else max_len - n+ 1)]
-        res.append(occurrence + padding)
-    return Variable(FloatTensor(res), requires_grad=False)
-
-def calculate_overlap(p, r, n, lengths):
-    """
-    p - probability tensor [b x len_x x reference_length]
-    r - references, tensor [b x len_y]
-    contains word's ids for each reference in batch
-    n - n-gram
-    lenghts - lengths of each reference in batch
-    """
-    A = ngrams_product(get_selected_matrices(p, r), n)
-    r_cnt = ngram_ref_counts(r, lengths, n)
-    if A is None or r_cnt is None:
-        return CUDA_wrapper(torch.zeros(p.shape[0]))
-    r_cnt = r_cnt[:, None]
-    A_div = -A + torch.sum(A, 1, keepdim=True) + 1
-    second_arg = r_cnt / A_div
-    term = torch.min(A, A * second_arg)
-    return torch.sum(torch.sum(term, 2), 1)
-
-def bleu(p, r, translation_lengths, reference_lengths, max_order=4, smooth=False):
-    """
-    p - matrix with probabilityes
-    r - reference batch
-    reference_lengths - lengths of the references
-    max_order - max order of n-gram
-    smooth - smooth calculation of precisions
-    translation_lengths - torch tensor
-    """
-    overlaps_list = []
-    translation_length = sum(translation_lengths)
-    reference_length = sum(reference_lengths)
-    for n in range(1, max_order + 1):
-        overlaps_list.append(calculate_overlap(p, r, n, reference_lengths))
-    overlaps = CUDA_wrapper(torch.stack(overlaps_list))
-    matches_by_order = torch.sum(overlaps, 1)
-    possible_matches_by_order = CUDA_wrapper(torch.zeros(max_order))
-    for n in range(1, max_order + 1):
-        cur_pm = translation_lengths.float() - n + 1
-        mask = cur_pm > 0
-        cur_pm *= mask.float()
-        possible_matches_by_order[n - 1] = torch.sum(cur_pm)
-    precisions = Variable(FloatTensor([0] * max_order))
-    for i in range(max_order):
-        if smooth:
-            precisions[i] = (matches_by_order[i] + 1) /\
-                                            (possible_matches_by_order[i] + 1)
-        else:
-            if possible_matches_by_order[i] > 0:
-                precisions[i] = matches_by_order[i] /\
-                                            possible_matches_by_order[i]
-            else:
-                precisions[i] = Variable(FloatTensor([0]))
-    if torch.min(precisions[:max_order]).item() > 0:
-        p_log_sum = sum([(1. / max_order) * torch.log(p) for p in precisions])
-        geo_mean = torch.exp(p_log_sum)
-    else:
-        geo_mean = torch.pow(\
-                        reduce(lambda x, y: x*y, precisions), 1./max_order)
-        eprint('WARNING: some precision(s) is zero')
-    ratio = float(translation_length) / reference_length
-    if ratio > 1.0:
-        bp = 1.0
-    else:
-        THRESHOLD_RATIO = 1E-1
-        MIN_BP = 1E-2
-        if ratio > THRESHOLD_RATIO:
-            bp = np.exp(1 - 1. / ratio)
-        else:
-            bp = MIN_BP
-    bleu = -geo_mean * bp
-    return bleu, precisions
diff --git a/decanlp/modules/matrixBLEU.py b/decanlp/modules/matrixBLEU.py
deleted file mode 100755
index cf54abde..00000000
--- a/decanlp/modules/matrixBLEU.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import torch
-from torch.nn import functional
-from torch.autograd import Variable
-import numpy as np
-import os
-from functools import reduce
-from copy import deepcopy as copy
-import time
-from .utils import CUDA_wrapper
-from .utils import SoftmaxWithTemperature
-from .utils import fill_eye_diag
-
-class mBLEU:
-    def __init__(self, max_order=4, softmax_temperature=0.001, T_argmax=True,\
-                std_temp=False):
-        """class implementing straightforwad matrix BLEU computation"""
-        self.max_order = max_order
-        self.T_argmax = T_argmax
-        self.sm = SoftmaxWithTemperature(softmax_temperature)
-        self.softmax_regular = torch.nn.Softmax()
-        self.std_temp = std_temp
-
-    def __call__(self, R, T, reference_corpus_lens, translation_corpus_lens):
-        """
-        T[b x t x v]
-        R[b x r]
-        reference_corpus_lens - list, len=b
-        translation_corpus_lens - list, len=b
-        """
-        max_order = self.max_order
-        shapeR = R.data.shape
-        shapeT = T.data.shape
-        translation_length = sum(translation_corpus_lens)
-        reference_length = sum(reference_corpus_lens)
-        if self.T_argmax:
-            cur_temperature = None
-            if self.std_temp:
-                cur_temperature = T.std()
-                if (np.random.rand(1)[0] > 0.99):
-                    print(cur_temperature)
-            T = self.sm(T.contiguous().view(-1, shapeT[2]),\
-                                    temperature=cur_temperature).view(shapeT)
-        TR = T.bmm(R.transpose(1, 2))
-        TT = T.bmm(T.transpose(1, 2))
-        # TT = fill_eye_diag(TT)
-
-        reference_len = sum(reference_corpus_lens)
-        tanslation_len = sum(translation_corpus_lens)
-        matches_by_order = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\
-                                        for i in range(max_order)]
-        cur_t = TT
-        cur_tr = TR
-        all_t = [torch.sum(cur_t, 1)]
-        all_tr = [torch.sum(cur_tr, 2)]
-        def overlapper(t, tr):
-            SMOOTH_CONST = 1E-10
-            return torch.sum((torch.min(t, tr) + SMOOTH_CONST) / torch.max(\
-                (t + SMOOTH_CONST),CUDA_wrapper(Variable(\
-                                                torch.FloatTensor([1])))), 1)
-        overlap = overlapper(all_t[-1], all_tr[-1])
-        matches_by_order[0] = torch.sum(overlap)
-        possible_matches_by_order = [
-                                CUDA_wrapper(Variable(torch.FloatTensor([0])))\
-                                for i in range(max_order)\
-                                    ]
-        def update_possible_matches(possible_matches_by_order,\
-                                                translation_corpus_lens, order):
-            for transl_len in translation_corpus_lens:
-                possible_matches = transl_len - order
-                if possible_matches > 0:
-                    possible_matches_by_order[order] += possible_matches
-        update_possible_matches(possible_matches_by_order,\
-                                                translation_corpus_lens, 0)
-        for order in range(1, min(max_order, shapeT[1], shapeR[1])):
-            cur_t = TT[:, order:, order:] * cur_t[:, :-1, :-1]
-            all_t.append(torch.sum(cur_t, 1))
-            cur_tr = TR[:, order:, order:] * cur_tr[:, :-1, :-1]
-            all_tr.append(torch.sum(cur_tr, 2))
-            overlap = overlapper(all_t[-1], all_tr[-1])
-            matches_by_order[order] = torch.sum(overlap)
-            update_possible_matches(possible_matches_by_order,\
-                                            translation_corpus_lens, order)
-
-        precisions = [CUDA_wrapper(Variable(torch.FloatTensor([0])))\
-                                                    for i in range(max_order)]
-        for i in range(0, max_order):
-            if possible_matches_by_order[i].data[0] > 0:
-                if i > 0:
-                    precisions[i] = ((matches_by_order[i].float() + 1)\
-                                        /( possible_matches_by_order[i] + 1))
-                else:
-                    precisions[i] = (matches_by_order[i].float()\
-                                        /possible_matches_by_order[i])
-            else:
-                precisions[i] = CUDA_wrapper(Variable(torch.FloatTensor([0])))
-        if torch.min(torch.stack(precisions)).data[0] > 1E-3:
-            p_log_sum = sum([(1. / max_order) * torch.log(p)\
-                                                        for p in precisions])
-            geo_mean = torch.exp(p_log_sum)
-        else:
-            geo_mean = torch.pow(\
-                            reduce(lambda x, y: x*y, precisions), 1./max_order)
-        ratio = float(translation_length) / reference_length
-        if ratio > 1.0:
-            bp = 1.
-        else:
-            THRESHOLD_RATIO = 1E-1
-            MIN_BP = 1E-2
-            if ratio > THRESHOLD_RATIO:
-                bp = np.exp(1 - 1. / ratio)
-            else:
-                bp = MIN_BP
-        bleu = -geo_mean * bp
-        return bleu, precisions
diff --git a/decanlp/modules/utils.py b/decanlp/modules/utils.py
deleted file mode 100644
index dd0d7119..00000000
--- a/decanlp/modules/utils.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import torch
-from torch.autograd import Variable
-
-
-if torch.cuda.is_available():
-    Tensor = torch.cuda.FloatTensor
-    FloatTensor = torch.cuda.FloatTensor
-    LongTensor = torch.cuda.LongTensor
-    ByteTensor = torch.cuda.ByteTensor
-else:
-    Tensor = torch.Tensor
-    FloatTensor = torch.FloatTensor
-    LongTensor = torch.LongTensor
-    ByteTensor = torch.ByteTensor
-
-def CUDA_wrapper(tensor):
-    use_cuda = torch.cuda.is_available()
-    if use_cuda:
-        return tensor.cuda()
-    else:
-        return tensor
-
-class SoftmaxWithTemperature:
-    def __init__(self, temperature):
-        """
-        formula: softmax(x/temperature)
-        """
-        self.temperature = temperature
-        self.softmax = torch.nn.Softmax()
-
-    def __call__(self, x, temperature=None):
-        if not temperature is None:
-            return self.softmax(x / temperature)
-        else:
-            return self.softmax(x / self.temperature)
-
-def fill_eye_diag(a):
-    _, s1, s2 = a.data.shape
-    dd = Variable(CUDA_wrapper(torch.eye(s1)))
-    zero_dd = 1 - dd
-    return a * zero_dd + dd