diff --git a/Pipfile b/Pipfile
index 013badbe..9722feec 100644
--- a/Pipfile
+++ b/Pipfile
@@ -19,9 +19,9 @@ pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.11"
+transformers = "==3.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
-sentencepiece = ">=0.1.83,<0.2.0"
+sentencepiece = "==0.1.91"
 mosestokenizer = '~=1.1'
 matplotlib = '~=3.1'
 seaborn = '~=0.9'
diff --git a/genienlp/paraphrase/GPT2Seq2Seq.py b/genienlp/paraphrase/GPT2Seq2Seq.py
index 8a496036..71ab4b01 100644
--- a/genienlp/paraphrase/GPT2Seq2Seq.py
+++ b/genienlp/paraphrase/GPT2Seq2Seq.py
@@ -1,6 +1,8 @@
 from typing import List
-from transformers import GPT2LMHeadModel
+
 import torch
+from transformers.modeling_gpt2 import GPT2LMHeadModel
+
 
 class GPT2Seq2Seq(GPT2LMHeadModel):
     def __init__(self, config):
@@ -23,7 +25,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
 
         return copy_input_sequences
 
-
+    #TODO check if this function is used
     def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
         """ repetition penalty from CTRL (https://arxiv.org/abs/1909.05858), but much faster on GPU """
@@ -37,15 +39,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
         need_divide = need_change > 0
         need_multiply = need_change < 0
         lprobs = need_divide * lprobs / repetition_penalty + need_multiply * lprobs * repetition_penalty + (1-m) * lprobs
-
-        # old, slow implementation
-        # if repetition_penalty != 1.0:
-        #     for i in range(context.shape[0]):
-        #         for previous_token in set(generated[i].tolist()):
-        #             if lprobs[i, previous_token] > 0:
-        #                 lprobs[i, previous_token] /= repetition_penalty
-        #             else:
-        #                 lprobs[i, previous_token] *= repetition_penalty
+
 
     def generate(self, **kwargs):
         # change arguments so that they have the same meaning as seq2seq models
@@ -68,18 +62,20 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
 
         return outputs
 
-    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
-        sep_token_position = (input_ids==self.sep_token_id).to(torch.long)
-        assert (torch.sum(sep_token_position, dim=1)==1).all(), 'All input_ids must contain exactly one sep_token. sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
+    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
+        sep_token_position = (input_ids == self.sep_token_id).to(torch.long)
+        assert (torch.sum(sep_token_position, dim=1) == 1).all(), 'All input_ids must contain exactly one sep_token.' \
+            ' sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
         token_type_ids = torch.cumsum(sep_token_position, dim=1) - sep_token_position
-        attention_mask = (input_ids!=self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
-        position_ids = ((torch.cumsum(attention_mask, dim=1)-1)*(1-token_type_ids)+(torch.cumsum(token_type_ids, dim=1)-1)*token_type_ids).clamp(min=0)
-        token_type_ids = self.sep_token_id * (1-token_type_ids) + self.eos_token_id * token_type_ids
+        attention_mask = (input_ids != self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
+        position_ids = ((torch.cumsum(attention_mask, dim=1) - 1) * (1 - token_type_ids) +
+                        (torch.cumsum(token_type_ids, dim=1) - 1) * token_type_ids).clamp(min=0)
+        token_type_ids = self.sep_token_id * (1 - token_type_ids) + self.eos_token_id * token_type_ids
         if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
             position_ids = position_ids[:, -1].unsqueeze(-1)
             token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
-        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past": past}
+        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past_key_values": past}
         return inputs
\ No newline at end of file
diff --git a/genienlp/paraphrase/data_utils.py b/genienlp/paraphrase/data_utils.py
index c32366f2..4478a534 100644
--- a/genienlp/paraphrase/data_utils.py
+++ b/genienlp/paraphrase/data_utils.py
@@ -308,7 +308,7 @@ def create_features_from_tsv_file(file_path, tokenizer, input_column, gold_colum
 
 
 def is_question(sentence: str):
-    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am', \
+    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am',
                       'can', 'could', 'would', 'will', 'have', 'did', 'do', 'does', 'no is', 'yes is']
     for w in question_words:
         if sentence.startswith(w+' '):
diff --git a/genienlp/paraphrase/dataset.py b/genienlp/paraphrase/dataset.py
index 93197c78..8927ae5e 100644
--- a/genienlp/paraphrase/dataset.py
+++ b/genienlp/paraphrase/dataset.py
@@ -168,11 +168,11 @@ class TextDataset(Dataset):
 
 
     def _add_marian_example(self, input_sequence, output_sequence):
-        model_inputs = self.tokenizer.prepare_translation_batch([input_sequence], [output_sequence])
+        model_inputs = self.tokenizer.prepare_seq2seq_batch([input_sequence], [output_sequence])
 
         encoded_input_ids = model_inputs['input_ids'].tolist()[0]
         encoded_attention_mask = model_inputs['attention_mask'].tolist()[0]
-        encoded_output_ids = model_inputs['decoder_input_ids'].tolist()[0]
+        encoded_output_ids = model_inputs['labels'].tolist()[0]
 
         self._update_seq2seq_example(encoded_input_ids, encoded_attention_mask, encoded_output_ids)
 
diff --git a/genienlp/paraphrase/model_utils.py b/genienlp/paraphrase/model_utils.py
index ce863af1..5af51efa 100644
--- a/genienlp/paraphrase/model_utils.py
+++ b/genienlp/paraphrase/model_utils.py
@@ -67,12 +67,12 @@ def check_args(args):
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
         logger.warning('Target language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
         args.tgt_lang = None
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 2)[1] not in MARIAN_GROUP_MEMBERS and args.src_lang:
         logger.warning('Source language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
         args.src_lang = None
 
     if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
diff --git a/genienlp/paraphrase/run_generation.py b/genienlp/paraphrase/run_generation.py
index cc1c87b2..a650b195 100644
--- a/genienlp/paraphrase/run_generation.py
+++ b/genienlp/paraphrase/run_generation.py
@@ -45,11 +45,10 @@ import torch
 
 from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .transformers_utils import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP
-from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer
+from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer, BartTokenizer
 
-from transformers import BartForConditionalGeneration
-from .transformers_utils import MarianMTModel, T5ForConditionalGeneration, BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import BartTokenizer, MBartTokenizer
+from .transformers_utils import GenieMarianMTModel, GenieT5ForConditionalGeneration, GenieBartForConditionalGeneration, GenieMBartForConditionalGeneration
+from .transformers_utils import GenieMBartTokenizer
 
 from transformers import PretrainedConfig
 
@@ -69,10 +68,10 @@ ALL_MODELS = sum((tuple(map.keys()) for map in (GPT2_PRETRAINED_CONFIG_ARCHIVE_M
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Seq2Seq, GPT2Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    't5': (T5ForConditionalGeneration, T5Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'bart': (BartForConditionalGeneration, BartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'mbart': (MBartForConditionalGeneration, MBartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'marian': (MarianMTModel, MarianTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    't5': (GenieT5ForConditionalGeneration, T5Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'bart': (GenieBartForConditionalGeneration, BartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'mbart': (GenieMBartForConditionalGeneration, GenieMBartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'marian': (GenieMarianMTModel, MarianTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
 }
 
@@ -126,14 +125,17 @@ def parse_argv(parser):
 
     parser.add_argument('--batch_size', type=int, default=4,
                         help="Batch size for text generation for each GPU.")
 
+    parser.add_argument('--pad_token', type=str, default='',
+                        help='The special token for padding, if tokenizer does not have that')
+
     parser.add_argument('--cache_dir', default='.embeddings', type=str, help='where to save transforemrs cached models, configs, and tokenizers.')
 
     parser.add_argument('--trained_model_type', type=str, help='if provided we make sure the loaded model matches the model_type')
 
-    parser.add_argument('--src_lang', type=str, default='en', help='source language used for translation task')
+    parser.add_argument('--src_lang', type=str, help='source language used for translation task')
     parser.add_argument('--tgt_lang', type=str, help='target language used for translation task')
 
-    parser.add_argument('--return_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
-    parser.add_argument('--return_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
+    parser.add_argument('--output_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
+    parser.add_argument('--output_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
 
     parser.add_argument('--att_pooling', type=str, default='max', help='pooling used to calculate decoder-encoder attention values across different heads')
     parser.add_argument('--plot_heatmaps', action='store_true', help='whether to plot decoder-encoder attention heatmaps')
@@ -277,12 +279,12 @@ def run_multi_process_generation(args):
 def run_single_process_generation(args, config):
     model_class, tokenizer_class, special_tokens = MODEL_CLASSES[args.model_type]
 
-    return_attentions = args.return_attentions
-    return_hidden_states = args.return_hidden_states
+    output_attentions = args.output_attentions
+    output_hidden_states = args.output_hidden_states
 
     model = model_class.from_pretrained(args.model_name_or_path,
-                                        output_attentions=return_attentions,
-                                        output_hidden_states=return_hidden_states,
+                                        output_attentions=output_attentions,
+                                        output_hidden_states=output_hidden_states,
                                         cache_dir=args.cache_dir)
 
     model.to(args.device)
@@ -297,6 +299,11 @@ def run_single_process_generation(args, config):
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
     eos_token_id = tokenizer.convert_tokens_to_ids(special_tokens['eos_token'])
     sep_token_id = tokenizer.convert_tokens_to_ids(special_tokens['sep_token'])
+
+    if tokenizer.pad_token is None:
+        # this assigns pad token but doesn't add it to the vocabulary
+        tokenizer.pad_token = args.pad_token
+
     pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
     if pad_token_id is None:
@@ -403,13 +410,12 @@ def run_single_process_generation(args, config):
                                      temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, # if temperature==0, we do not sample
                                      eos_token_id=eos_token_id,
                                      pad_token_id=pad_token_id,
-                                     return_attentions=return_attentions,
-                                     return_hidden_states=return_hidden_states,
                                      use_cache=True,
+                                     output_attentions=output_attentions
                                      )
 
            # TODO fix the way output attention is handled. Some models do not support it.
-            if return_attentions:
+            if output_attentions:
                 decoded, all_encoder_attentions = outputs
             else:
                 decoded = outputs
@@ -434,7 +440,7 @@
                     min_index = min_index + 1
                 out_cropped = out[:min_index]
 
-                if args.task == 'translate':
+                if args.task == 'translate' and output_attentions:
                     src_tokens = tokenizer.convert_ids_to_tokens(batch_context_tensor[sample_index])
                     tgt_tokens = tokenizer.convert_ids_to_tokens(out_cropped)
 
diff --git a/genienlp/paraphrase/run_lm_finetuning.py b/genienlp/paraphrase/run_lm_finetuning.py
index fd8613e4..7f03fb93 100644
--- a/genienlp/paraphrase/run_lm_finetuning.py
+++ b/genienlp/paraphrase/run_lm_finetuning.py
@@ -43,10 +43,10 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
                           BartConfig, BartForConditionalGeneration, BartTokenizer,
-                          MarianConfig, MarianTokenizer)
+                          MBartConfig, MBartForConditionalGeneration,
+                          MarianConfig, MarianMTModel, MarianTokenizer)
 
-from .transformers_utils import BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import MBartTokenizer, MarianMTModel
+from .transformers_utils import GenieMBartTokenizer
 
 from genienlp.util import set_seed, split_file_on_disk
 from genienlp.paraphrase.data_utils import mask_tokens, add_special_tokens
@@ -66,7 +66,7 @@ MODEL_CLASSES = {
     'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
     'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
     'bart': (BartConfig, BartForConditionalGeneration, BartTokenizer),
-    'mbart': (BartConfig, MBartForConditionalGeneration, MBartTokenizer),
+    'mbart': (MBartConfig, MBartForConditionalGeneration, GenieMBartTokenizer),
     'marian': (MarianConfig, MarianMTModel, MarianTokenizer)
 }
 
@@ -184,7 +184,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
                     steps_trained_in_current_epoch -= 1
                     continue
 
-            inputs, attention_mask, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, attention_mask, labels, position_ids, segment_ids = batch
 
             if args.mlm:
                 inputs, labels = mask_tokens(inputs, labels, tokenizer, args.mlm_probability, args.mlm_ignore_index)
@@ -195,7 +195,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
             segment_ids = segment_ids.to(args.device)
             model.train()
 
-            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
             # prepare inputs for mbart, and marian
             if args.model_type in ['mbart', 'marian']:
@@ -349,7 +349,7 @@ def evaluate(args, model, tokenizer, prefix="", aux=False):
             segment_ids = segment_ids.to(args.device)
 
             with torch.no_grad():
-                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
                 if args.model_type in ['mbart', 'marian']:
                     model_inputs['attention_mask'] = attention_mask
diff --git a/genienlp/paraphrase/transformers_utils.py b/genienlp/paraphrase/transformers_utils.py
index 7378c917..09d6e9e5 100644
--- a/genienlp/paraphrase/transformers_utils.py
+++ b/genienlp/paraphrase/transformers_utils.py
@@ -1,10 +1,16 @@
-import copy
 import re
+import torch
+import torch.nn.functional as F +from typing import List, Optional -from transformers.modeling_bart import LayerNorm, LearnedPositionalEmbedding, BartEncoder, SelfAttention, invert_mask, \ - SinusoidalPositionalEmbedding, BartModel, BartForConditionalGeneration +from transformers import LogitsProcessorList +from transformers.modeling_marian import MarianMTModel +from transformers.modeling_bart import BartForConditionalGeneration +from transformers.modeling_mbart import MBartForConditionalGeneration +from transformers.modeling_t5 import T5ForConditionalGeneration +from transformers.modeling_utils import PreTrainedModel -from transformers.modeling_t5 import T5ForConditionalGeneration, T5PreTrainedModel, T5LayerNorm, T5Block +from transformers.tokenization_mbart import MBartTokenizer, _all_mbart_models, SPM_URL SPIECE_UNDERLINE = "▁" @@ -63,606 +69,10 @@ MARIAN_GROUP_MEMBERS = { "yue", "yue_Hans", "yue_Hant", "zho", "zho_Hans", "zho_Hant", "zlm_Latn", "zsm_Latn", "zul", "zza"] } -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BART model, ported from the fairseq repo.""" -import math -import random -from typing import List, Optional -import torch -import torch.nn as nn -import torch.nn.functional as F +############### -from transformers.activations import ACT2FN -from transformers.configuration_bart import BartConfig -from transformers.modeling_utils import calc_banned_ngram_tokens, calc_banned_bad_words_ids, top_k_top_p_filtering - - -class DecoderLayer(nn.Module): - def __init__(self, config: BartConfig): - super().__init__() - self.embed_dim = config.d_model - self.output_attentions = config.output_attentions - self.self_attn = SelfAttention( - embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.normalize_before = config.normalize_before - - self.self_attn_layer_norm = LayerNorm(self.embed_dim) - self.encoder_attn = SelfAttention( - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout, - encoder_decoder_attention=True, - ) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = LayerNorm(self.embed_dim) - - def forward( - self, - x, - encoder_hidden_states, - encoder_attn_mask=None, - layer_state=None, - causal_mask=None, - decoder_padding_mask=None, - ): - residual = x - - if layer_state is None: - layer_state = {} - if self.normalize_before: - x = self.self_attn_layer_norm(x) - - # Self Attention - x, self_attn_weights = self.self_attn( - query=x, - key=x, - layer_state=layer_state, # adds keys to layer state - key_padding_mask=decoder_padding_mask, - attn_mask=causal_mask, - 
need_weights=self.output_attentions, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.self_attn_layer_norm(x) - - # Cross attention - residual = x - assert self.encoder_attn.cache_key != self.self_attn.cache_key - if self.normalize_before: - x = self.encoder_attn_layer_norm(x) - x, cross_attn_weights = self.encoder_attn( - query=x, - key=encoder_hidden_states, - key_padding_mask=encoder_attn_mask, - layer_state=layer_state, # mutates layer state - need_weights=self.output_attentions, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.encoder_attn_layer_norm(x) - - # Fully Connected - residual = x - if self.normalize_before: - x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.final_layer_norm(x) - return ( - x, - self_attn_weights, - cross_attn_weights, - layer_state, - ) # both self_attn and cross-attn weights, following t5, layer_state = cache for decoding - # attention weight has size (bsz, num_heads, tgt_len, src_len) - - -class BartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer - is a :class:`DecoderLayer`. - Args: - config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding - """ - - def __init__(self, config: BartConfig, embed_tokens: nn.Embedding): - super().__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.dropout = config.dropout - self.layerdrop = config.decoder_layerdrop - self.padding_idx = embed_tokens.padding_idx - self.max_target_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - self.embed_tokens = embed_tokens - if config.static_position_embeddings: - self.embed_positions = SinusoidalPositionalEmbedding( - config.max_position_embeddings, config.d_model, config.pad_token_id - ) - else: - self.embed_positions = LearnedPositionalEmbedding( - config.max_position_embeddings, config.d_model, self.padding_idx, - ) - self.layers = nn.ModuleList( - [DecoderLayer(config) for _ in range(config.decoder_layers)] - ) # type: List[DecoderLayer] - self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() - self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None - - def forward( - self, - input_ids, - encoder_hidden_states, - encoder_padding_mask, - decoder_padding_mask, - decoder_causal_mask, - decoder_cached_states=None, - use_cache=False, - **unused - ): - """ - Includes several features from "Jointly Learning to Align and - Translate with Transformer Models" (Garg et al., EMNLP 2019). 
- - Args: - input_ids (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_hidden_states: output from the encoder, used for - encoder-side attention - encoder_padding_mask: for ignoring pad tokens - decoder_cached_states (dict or None): dictionary used for storing state during generation - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - hidden states - - attentions - """ - # check attention mask and invert - if encoder_padding_mask is not None: - encoder_padding_mask = invert_mask(encoder_padding_mask) - - # embed positions - positions = self.embed_positions(input_ids, use_cache=use_cache) - - if use_cache: - input_ids = input_ids[:, -1:] - positions = positions[:, -1:] # happens after we embed them - # assert input_ids.ne(self.padding_idx).any() - - x = self.embed_tokens(input_ids) * self.embed_scale - x += positions - x = self.layernorm_embedding(x) - x = F.dropout(x, p=self.dropout, training=self.training) - - # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) - x = x.transpose(0, 1) - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) - - # decoder layers - all_hidden_states = () - all_self_attns = () - all_cross_attns = () - next_decoder_cache = [] - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if self.output_hidden_states: - all_hidden_states += (x,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None - - x, layer_self_attn, layer_cross_attention, layer_past = decoder_layer( - x, - encoder_hidden_states, - encoder_attn_mask=encoder_padding_mask, - decoder_padding_mask=decoder_padding_mask, - layer_state=layer_state, - causal_mask=decoder_causal_mask, - ) - - if use_cache: - next_decoder_cache.append(layer_past.copy()) - - if self.layer_norm and (idx == len(self.layers) - 1): # last layer of mbart - x = self.layer_norm(x) - if self.output_attentions: - all_self_attns += (layer_self_attn,) - all_cross_attns += (layer_cross_attention,) - - # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) - all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states] - x = x.transpose(0, 1) - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) - - if use_cache: - next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache) - else: - next_cache = None - return x, next_cache, all_hidden_states, list(all_self_attns), list(all_cross_attns) - - -class BartModel(BartModel): - def __init__(self, config: BartConfig): - super().__init__(config) - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - - padding_idx, vocab_size = config.pad_token_id, config.vocab_size - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) - - self.encoder = BartEncoder(config, self.shared) - self.decoder = BartDecoder(config, self.shared) - - self.init_weights() - - -class BartForConditionalGeneration(BartForConditionalGeneration): - base_model_prefix = "model" - - def __init__(self, config: BartConfig): - super().__init__(config) - base_model = BartModel(config) - self.model = base_model - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) - - def 
prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): - assert past is not None, "past has to be defined for encoder_outputs" - - # first step, decoder_cached_states are empty - # first step - if kwargs['cur_len'] == 1: - encoder_outputs, decoder_cached_states = past[0], None - else: - if use_cache: - if len(past) < 2: - encoder_outputs, decoder_cached_states = past[0], None - else: - encoder_outputs, decoder_cached_states = past[0], past[1] - else: - encoder_outputs, decoder_cached_states = past[0], None - - if not isinstance(encoder_outputs, tuple): - encoder_outputs = (encoder_outputs, ) - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "decoder_cached_states": decoder_cached_states, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - encoder_outputs, - attention_mask, - use_cache, - model_specific_kwargs, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - # length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) - sent_lengths = input_ids.new(batch_size).fill_(max_length) - - if getattr(self.config, 'encoder_layers', None): - num_layers = self.config.encoder_layers - else: - num_layers = self.config.num_layers - - if getattr(self.config, 'encoder_attention_heads', None): - num_heads = self.config.encoder_attention_heads - else: - num_heads = self.config.num_heads - - all_encoder_attentions = [input_ids.new_full([batch_size, num_heads, max_length, - encoder_outputs[0].size(1)], dtype=torch.float32, - fill_value=-1000000) for _ in range(num_layers)] - - # encoder outputs for Bart and models inheriting from BartModel encoder is (encoder hidden outputs of last layer, all_hidden_states, all_attention_weights ) - # it always outputs all_hidden_states and all_attention_weights and then filters empty ones out when passed through BartModel - - # on the other hand, T5 encoder outputs (last-layer hidden state, presents, all_hidden_states, all_attention_weights) only if returning them is requested - # otherwise it just returns last-layer hidden states - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - if not isinstance(past, tuple): - past = (past,) - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, cur_len=cur_len, **model_specific_kwargs - ) - - outputs = self(**model_inputs) - # decoder_outputs = x, next_cache, all_hidden_states, all_self_attns, all_cross_attns - # encoder_outputs = encoder_hidden_last_layer + all_hidden_states + all_self_attns - # outputs = decoder_outputs + encoder_outputs - - # outputs is then filtered if attention weights, hidden states, or cached_decoding_values are empty - # so the index below is adjusted - # remember we always return attention weights - - next_token_logits = outputs[0][:, -1, :] - - index = 2 + 
int(model_specific_kwargs['return_hidden_states']) + int(use_cache) - for i in range(num_layers): - all_encoder_attentions[i][:, :, [cur_len - 1], :] = outputs[index][i] - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) - - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - # #TODO added and modified by mehrad from transformers modeling_utils.py - # if self.config.is_encoder_decoder: - # if cur_len == 1: - # self._force_token_ids_generation(next_token_logits, model_specific_kwargs['tgt_lang_id']) - # if cur_len == max_length - 1 and self.config.eos_token_id is not None: - # self._force_token_ids_generation(next_token_logits, self.config.eos_token_id) - - # set bos token prob to zero - # if bos_token_id is not None: - # next_token_logits[:, bos_token_id] = -float("inf") - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - next_token_logits[:, eos_token_id] = -float("inf") - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - # Sample - probs = F.softmax(next_token_logits, dim=-1) - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - # add token and increase length by one - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - cur_len = cur_len + 1 - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents.mul_((~eos_in_sents).long()) - - # stop when there is a in each sentence, or if we exceed the maximum length - if unfinished_sents.max() == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask 
= torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # if there are different sentences lengths in the batch, some batches have to be padded - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" - # finished sents are filled with pad_token - decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) - else: - decoded = input_ids - - for hypo_idx, hypo in enumerate(input_ids): - decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] - - # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len) - all_encoder_attentions = [layer_all_encoder_attentions[:, :, :sent_lengths.max().item(), :] for layer_all_encoder_attentions in all_encoder_attentions] - - return decoded, all_encoder_attentions - - - - -# coding=utf-8 -# Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch MarianMTModel model, ported from the Marian C++ repo.""" - - -# from transformers.modeling_bart import BartForConditionalGeneration - - -MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP -] - - -class MarianMTModel(BartForConditionalGeneration): - r""" - Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. - Model API is identical to BartForConditionalGeneration. - Available models are listed at `Model List `__ - - Examples:: - - from transformers import MarianTokenizer, MarianMTModel - from typing import List - src = 'fr' # source language - trg = 'en' # target language - sample_text = "où est l'arrêt de bus ?" - mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' - - model = MarianMTModel.from_pretrained(mname) - tok = MarianTokenizer.from_pretrained(mname) - batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference - gen = model.generate(**batch) # for forward pass: model(**batch) - words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" - - """ - - def prepare_logits_for_generation(self, logits, cur_len, max_length): - logits[:, self.config.pad_token_id] = float("-inf") - if cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_ids_generation(logits, self.config.eos_token_id) - return logits - - -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers.tokenization_roberta import RobertaTokenizer -from transformers.tokenization_utils import BatchEncoding -from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer - - -# vocab and merges same as roberta -vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" -merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" -_all_bart_models = [ - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", -] - - -class BartTokenizer(RobertaTokenizer): - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - } - -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" - -FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", -] - - -class MBartTokenizer(XLMRobertaTokenizer): +class GenieMBartTokenizer(MBartTokenizer): vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} max_model_input_sizes = {m: 1024 for m in _all_mbart_models} pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} @@ -670,236 +80,9 @@ class MBartTokenizer(XLMRobertaTokenizer): prefix_tokens: List[int] = [] suffix_tokens: List[int] = [] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) - self.sp_model_size = len(self.sp_model) - self.lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES) - } - self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()} - self.cur_lang_code = self.lang_code_to_id["en_XX"] - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - - self.fairseq_tokens_to_ids.update(self.lang_code_to_id) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) - self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) - - @property - def vocab_size(self): - return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence 
if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - # The __call__ was implemented only in transformers >=3.0.0 - def __call__( - self, - text, - text_pair=None, - add_special_tokens: bool = True, - padding=False, - truncation=False, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors=None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - """ - Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of - sequences. - - Args: - text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): - The sequence or batch of sequences to be encoded. - Each sequence can be a string or a list of strings (pretokenized string). - If the sequences are provided as list of strings (pretokenized), you must set - :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): - The sequence or batch of sequences to be encoded. - Each sequence can be a string or a list of strings (pretokenized string). - If the sequences are provided as list of strings (pretokenized), you must set - :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - """ - # Input type checking for clearer error - assert isinstance(text, str) or ( - isinstance(text, (list, tuple)) - and ( - len(text) == 0 - or ( - isinstance(text[0], str) - or (isinstance(text[0], (list, tuple)) and ( - len(text[0]) == 0 or isinstance(text[0][0], str))) - ) - ) - ), ( - "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " - "or `List[List[str]]` (batch of pretokenized examples)." - ) - - assert ( - text_pair is None - or isinstance(text_pair, str) - or ( - isinstance(text_pair, (list, tuple)) - and ( - len(text_pair) == 0 - or ( - isinstance(text_pair[0], str) - or ( - isinstance(text_pair[0], (list, tuple)) - and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) - ) - ) - ) - ) - ), ( - "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " - "or `List[List[str]]` (batch of pretokenized examples)." 
- ) - - is_batched = bool( - (not is_split_into_words and isinstance(text, (list, tuple))) - or ( - is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], - (list, tuple)) - ) - ) - - if is_batched: - batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text - return self.batch_encode_plus( - batch_text_or_text_pairs=batch_text_or_text_pairs, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - else: - return self.encode_plus( - text=text, - text_pair=text_pair, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - - def prepare_seq2seq_batch( - self, - src_texts: List[str], - src_lang: str = "en_XX", - tgt_texts: Optional[List[str]] = None, - tgt_lang: str = "ro_RO", - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - truncation: bool = True, - padding: str = "longest", - return_tensors: str = "pt", - add_prefix_space: bool = False, # ignored - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.max_len - self.set_src_lang_special_tokens(src_lang) - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - self.set_tgt_lang_special_tokens(tgt_lang) - - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=True, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - self.set_src_lang_special_tokens(src_lang) # sets to src_lang - return model_inputs - - ##TODO comment out to use original mbart tokenization used by huggingface - # def set_src_lang_special_tokens(self, src_lang) -> None: - # """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code].""" - # self.cur_lang_code = self.lang_code_to_id[src_lang] - # self.prefix_tokens = [] - # self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - # - # def set_tgt_lang_special_tokens(self, lang: str) -> None: - # """Reset the special tokens to the target language setting. 
Prefix [tgt_lang_code], suffix =[eos].""" - # self.cur_lang_code = self.lang_code_to_id[lang] - # self.prefix_tokens = [] - # self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. Prefix [bos_token_id], suffix =[eos_token_id].""" self.cur_lang_code = self.lang_code_to_id[src_lang] @@ -911,248 +94,225 @@ class MBartTokenizer(XLMRobertaTokenizer): self.cur_lang_code = self.lang_code_to_id[lang] self.prefix_tokens = [self.cur_lang_code] self.suffix_tokens = [self.eos_token_id] + +############### -class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None): - super().__init__(config) - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - self.init_weights() - - def get_input_embeddings(self): - return self.embed_tokens - - def get_output_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - inputs_embeds=None, - head_mask=None, - past_key_value_states=None, - use_cache=False, +class GeniePreTrainedModel(PreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + **model_kwargs ): - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - if self.is_decoder: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + max_length = max_length if max_length is not None else self.config.max_length + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + # init sequence length tensors + sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( + input_ids, max_length + ) + + output_attentions = model_kwargs.get('output_attentions', None) + + if output_attentions: + batch_size = input_ids.size(0) + if getattr(self.config, 'encoder_layers', None): + num_layers = self.config.encoder_layers else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - - if 
past_key_value_states is not None: - assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( - input_shape, (batch_size, 1) + num_layers = self.config.num_layers + + if getattr(self.config, 'encoder_attention_heads', None): + num_heads = self.config.encoder_attention_heads + else: + num_heads = self.config.num_heads + + if model_kwargs.get('encoder_outputs', None): + seq_length = model_kwargs['encoder_outputs'][0].size(1) + else: + seq_length = max_length + + all_cross_attentions = [input_ids.new_full([batch_size, num_heads, max_length, seq_length], + dtype=torch.float32, + fill_value=-1000000) + for _ in range(num_layers)] + + while cur_len < max_length: + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self(**model_inputs, return_dict=True) + next_token_logits = outputs.logits[:, -1, :] + + if output_attentions: + for i in range(num_layers): + all_cross_attentions[i][:, :, [cur_len - 1], :] = outputs.cross_attentions[i] + + # pre-process distribution + scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = torch.argmax(scores, dim=-1) + + # add code that transfomers next_tokens to tokens_to_add + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + + # add token and increase length by one + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + + # update sequence length + if eos_token_id is not None: + sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation( + sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id + ) + + # update model kwargs + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - # required mask seq length can be calculated via length of past - # key value states and seq_length = 1 for the last token - mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length + + # stop when there is a in each sentence, or if we exceed the maximum length + if unfinished_sequences.max() == 0: + break + + # increase cur_len + cur_len = cur_len + 1 + + if output_attentions: + # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len) + all_cross_attentions = [layer_all_cross_attentions[:, :, :sequence_lengths.max().item(), :] for + layer_all_cross_attentions in all_cross_attentions] + + return input_ids, all_cross_attentions else: - mask_seq_length = seq_length + return input_ids + + + def sample( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + **model_kwargs + ): + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + max_length = max_length if max_length is not None else self.config.max_length + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - if attention_mask 
is None: - attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # init sequence length tensors + sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( + input_ids, max_length + ) + + output_attentions = model_kwargs.get('output_attentions', None) + + if output_attentions: + batch_size = input_ids.size(0) + if getattr(self.config, 'encoder_layers', None): + num_layers = self.config.encoder_layers + else: + num_layers = self.config.num_layers + + if getattr(self.config, 'encoder_attention_heads', None): + num_heads = self.config.encoder_attention_heads + else: + num_heads = self.config.num_heads + + if model_kwargs.get('encoder_outputs', None): + seq_length = model_kwargs['encoder_outputs'][0].size(1) + else: + seq_length = max_length + + all_cross_attentions = [input_ids.new_full([batch_size, num_heads, max_length, seq_length], + dtype=torch.float32, + fill_value=-1000000) + for _ in range(num_layers)] + + # auto-regressive generation + while cur_len < max_length: + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self(**model_inputs, return_dict=True) + next_token_logits = outputs.logits[:, -1, :] + + if output_attentions: + for i in range(num_layers): + all_cross_attentions[i][:, :, [cur_len - 1], :] = outputs.cross_attentions[i] + + # pre-process distribution + scores = logits_processor(input_ids, next_token_logits) + scores = logits_warper(input_ids, scores) + + # sample + probs = F.softmax(scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # add code that transfomers next_tokens to tokens_to_add + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + + # add token and increase length by one + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + cur_len = cur_len + 1 + + # update sequence length + if eos_token_id is not None: + sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation( + sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id + ) + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sequences.max() == 0: + break + + # update model kwargs + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - - # initialize past_key_value_states with `None` if past does not exist - if past_key_value_states is None: - past_key_value_states = [None] * len(self.block) - - # ourselves in which case we just need to make it broadcastable to all heads. 
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)
-
-        if self.is_decoder and encoder_attention_mask is not None:
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+
+        if output_attentions:
+            # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len)
+            all_cross_attentions = [layer_all_cross_attentions[:, :, :sequence_lengths.max().item(), :] for
+                                    layer_all_cross_attentions in all_cross_attentions]
+
+            return input_ids, all_cross_attentions
         else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
-        present_key_value_states = ()
-        all_hidden_states = ()
-        all_attentions = ()
-        cross_attentions = ()
-        position_bias = None
-        encoder_decoder_position_bias = None
-
-        hidden_states = self.dropout(inputs_embeds)
-
-        for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(
-                hidden_states,
-                attention_mask=extended_attention_mask,
-                position_bias=position_bias,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_extended_attention_mask,
-                encoder_decoder_position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask[i],
-                past_key_value_state=past_key_value_state,
-                use_cache=use_cache,
-            )
-            # layer_outputs is a tuple with:
-            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-            hidden_states, present_key_value_state = layer_outputs[:2]
-
-            if i == 0:
-                # We share the position biases between the layers - the first layer store them
-                # layer_outputs = hidden-states, key-value-states (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-                position_bias = layer_outputs[3 if self.output_attentions else 2]
-                if self.is_decoder and encoder_hidden_states is not None:
-                    encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3]
-            # append next layer key value states
-            present_key_value_states = present_key_value_states + (present_key_value_state,)
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[2],)  # add self-attention
-                if self.is_decoder and encoder_hidden_states is not None:
-                    if i==0:
-                        cross_attentions = cross_attentions + (layer_outputs[4],)  # add cross-attention
-                    else:
-                        cross_attentions = cross_attentions + (layer_outputs[3],)  # add cross-attention
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if use_cache is True:
-            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
-            outputs = outputs + (present_key_value_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,) + (cross_attentions,)
-        return outputs  # last-layer hidden state, (presents,) (all hidden states), (all self_attentions), (all cross_attentions)
+            return input_ids
-class T5ForConditionalGeneration(T5ForConditionalGeneration):
+class GenieMarianMTModel(MarianMTModel, GeniePreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.model_dim = config.d_model
-
-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = T5Stack(encoder_config, self.shared)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = T5Stack(decoder_config, self.shared)
-
-        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
-
-        self.init_weights()
+class GenieBartForConditionalGeneration(BartForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
-    def _generate_no_beam_search(
-        self,
-        input_ids,
-        cur_len,
-        max_length,
-        min_length,
-        do_sample,
-        temperature,
-        top_k,
-        top_p,
-        repetition_penalty,
-        no_repeat_ngram_size,
-        bad_words_ids,
-        bos_token_id,
-        pad_token_id,
-        eos_token_id,
-        decoder_start_token_id,
-        batch_size,
-        encoder_outputs,
-        attention_mask,
-        use_cache,
-        model_specific_kwargs,
-    ):
-        return BartForConditionalGeneration._generate_no_beam_search(
-            self,
-            input_ids,
-            cur_len,
-            max_length,
-            min_length,
-            do_sample,
-            temperature,
-            top_k,
-            top_p,
-            repetition_penalty,
-            no_repeat_ngram_size,
-            bad_words_ids,
-            bos_token_id,
-            pad_token_id,
-            eos_token_id,
-            decoder_start_token_id,
-            batch_size,
-            encoder_outputs,
-            attention_mask,
-            use_cache,
-            model_specific_kwargs,)
-
-    def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs):
-        assert past is not None, "past has to be defined for encoder_outputs"
-
-        # first step
-        if kwargs['cur_len'] == 1:
-            encoder_outputs, decoder_past_key_value_states = past[0], None
-        else:
-            if use_cache:
-                if len(past) < 2:
-                    encoder_outputs, decoder_past_key_value_states = past[0], None
-                else:
-                    encoder_outputs, decoder_past_key_value_states = past[0], past[1]
-            else:
-                encoder_outputs, decoder_past_key_value_states = past[0], None
-
-        if not isinstance(encoder_outputs, tuple):
-            encoder_outputs = (encoder_outputs, )
-
-        return {
-            "decoder_input_ids": input_ids,
-            "decoder_past_key_value_states": decoder_past_key_value_states,
-            "encoder_outputs": encoder_outputs,
-            "attention_mask": attention_mask,
-            "use_cache": use_cache,
-        }
+class GenieMBartForConditionalGeneration(MBartForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+class GenieT5ForConditionalGeneration(T5ForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
diff --git a/setup.py b/setup.py
index c3ec9941..343b1c7e 100644
--- a/setup.py
+++ b/setup.py
@@ -56,8 +56,8 @@ setuptools.setup(
         'pyrouge>=0.1.3',
         'sacrebleu~=1.0',
         'requests~=2.22',
-        'transformers==2.11',
-        'sentencepiece>=0.1.83,<0.2.0',
+        'transformers==3.5.1',
+        'sentencepiece==0.1.91',
         'mosestokenizer~=1.1',
     ]
 )
diff --git a/tests/dataset/translation/en-de/dev_marian_aligned.tsv b/tests/dataset/translation/en-de/dev_marian_aligned.tsv
index 5af1f0ef..d5fe1b50 100644
--- a/tests/dataset/translation/en-de/dev_marian_aligned.tsv
+++ b/tests/dataset/translation/en-de/dev_marian_aligned.tsv
@@ -3,4 +3,4 @@ show me nearby hotels with both a " catalan " and " sauna " zeigen Sie mir in de
 find people graduate of Stanford. Leute finden, die Stanford graduieren.
 what is the highest rated hotel ? was ist das am höchsten bewertete Hotel ?
 find hotels with 2 star ratings . Hotels mit 2 Sterne Bewertungen finden.
-what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von „ Rosedon " in „ Glenorchy " ?
+what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von " rosedon " " in " glenorchy " " ?
diff --git a/tests/dataset/translation/en-de/dev_t5_aligned.tsv b/tests/dataset/translation/en-de/dev_t5_aligned.tsv
index 2a3cf83c..5336fc8d 100644
--- a/tests/dataset/translation/en-de/dev_t5_aligned.tsv
+++ b/tests/dataset/translation/en-de/dev_t5_aligned.tsv
@@ -1,6 +1,6 @@
 who has a 8 star rating with over 8 reviews in " fonte " ? wer hat eine 8 Sterne Bewertung mit über 8 Bewertungen in " fonte " ?
-show me nearby hotels with both a " catalan " and " sauna " ich sah mich in der Nähe von Hotels mit sowohl " Katalanen " als auch " Sauna " zeigen, " sowohl " Katalanen
-find people graduate of Stanford. - es gibt Leute, die an der Stanford University studieren.
-what is the highest rated hotel ? Was ist das Hotel mit dem höchsten Preis ?
+show me nearby hotels with both a " catalan " and " sauna " ich sah in der Nähe Hotels mit " catalan " und " sauna " .
+find people graduate of Stanford. finden Menschen Absolventen von Stanford.
+what is the highest rated hotel ? Was ist das höchst bewertete Hotel ?
 find hotels with 2 star ratings . finden Sie Hotels mit 2 Sternenbewertungen .
 what is the rating of " rosedon " in " glenorchy " ? Was ist die Bewertung von " rosedon " in " glenorchy " ?
diff --git a/tests/test.sh b/tests/test.sh
index 41994b86..20b2fd3d 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -125,7 +125,7 @@ done
 # masked paraphrasing tests
 cp -r $SRCDIR/dataset/paraphrasing/ $workdir/masked_paraphrasing/
-for model in "sshleifer/bart-tiny-random" ; do
+for model in "sshleifer/bart-tiny-random" "sshleifer/tiny-mbart" ; do
     if [[ $model == *mbart* ]] ; then
         model_type="mbart"
@@ -160,7 +160,7 @@ for model in "t5-small" "Helsinki-NLP/opus-mt-en-de" ; do
     fi
     # use a pre-trained model
-    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --return_attentions
+    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --force_replace_qp --return_attentions
     # check if result file exists and exact match accuracy is 100%
     cut -f2 $workdir/translation/en-de/dev_"$base_model"_aligned.tsv | diff -u - $workdir/generated_"$base_model"_aligned.tsv
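
Note (illustrative sketch, not part of the patch): the --replace_qp / --att_pooling options exercised in the test above consume the per-layer cross-attention tensors collected by the new sample() loop, each shaped (batch_size, num_heads, tgt_len, src_len) as in the code above. The sketch below shows one plausible way such tensors could be pooled to pick, for every generated token, the source token it attends to most. The function name align_tokens_by_attention, the mean/max pooling choices, and the random example inputs are hypothetical and are not the genienlp implementation.

    # minimal sketch, assuming `cross_attentions` is a list (one entry per layer)
    # of tensors shaped (batch_size, num_heads, tgt_len, src_len)
    import torch

    def align_tokens_by_attention(cross_attentions, pooling='mean'):
        # stack layers: (num_layers, batch_size, num_heads, tgt_len, src_len)
        stacked = torch.stack(cross_attentions, dim=0)
        if pooling == 'mean':
            # average over layers, then over heads -> (batch_size, tgt_len, src_len)
            pooled = stacked.mean(dim=0).mean(dim=1)
        else:
            # 'max' pooling over layers and heads
            pooled = stacked.max(dim=0).values.max(dim=1).values
        # for each target position, the index of the most-attended source token
        return pooled.argmax(dim=-1)  # (batch_size, tgt_len)

    # hypothetical usage: batch of 2, 3 layers, 8 heads, 5 target and 7 source tokens
    fake_attentions = [torch.rand(2, 8, 5, 7) for _ in range(3)]
    print(align_tokens_by_attention(fake_attentions, pooling='mean'))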