diff --git a/Pipfile b/Pipfile
index 1d23a164..a82cc1ff 100644
--- a/Pipfile
+++ b/Pipfile
@@ -18,7 +18,7 @@
 pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.3.0"
+transformers = "==2.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
 sentencepiece = ">=0.1.83,<0.2.0"
diff --git a/genienlp/run_lm_finetuning.py b/genienlp/run_lm_finetuning.py
index 137d07ac..896efc79 100644
--- a/genienlp/run_lm_finetuning.py
+++ b/genienlp/run_lm_finetuning.py
@@ -30,6 +30,9 @@ import pickle
 import re
 import shutil
 import torch
+import math
+import csv
+import numpy as np
 from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data.distributed import DistributedSampler
@@ -47,7 +50,7 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)

-from genienlp.util import set_seed
+from genienlp.util import set_seed, get_number_of_lines


 logger = logging.getLogger(__name__)
@@ -64,10 +67,12 @@ MODEL_CLASSES = {


 class TextDataset(Dataset):
-    def __init__(self, tokenizer, args, file_path=None, block_size=512, prompt_token='<paraphrase>', evaluate=None):
+    def __init__(self, tokenizer, args, file_path=None, block_size=512, evaluate=None):
+        self.tokenizer = tokenizer
+        self.block_size = block_size
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(self.block_size) + '_' + filename)

         if os.path.exists(cached_features_file) and not args.overwrite_cache:
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -76,42 +81,84 @@ class TextDataset(Dataset):
         else:
             logger.info("Creating features from dataset file at %s", file_path)

-            prompt_token_id = tokenizer.convert_tokens_to_ids(prompt_token)
-            segment1_id = tokenizer.convert_tokens_to_ids(args.start_special_token)
-            segment2_id = tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.prompt_token_id = self.tokenizer.convert_tokens_to_ids(args.start_special_token)
+            self.end_token_id = self.tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.segment1_id = 0
+            self.segment2_id = 1
+            if args.model_type == 'gpt2':
+                self.segment1_id = self.prompt_token_id
+                self.segment2_id = self.end_token_id
             # print('prompt_token_id = ', prompt_token_id)
             self.examples = []
             self.labels = []
             self.position_ids = []
             self.segment_ids = []
-            max_input_length = 0
+            self.max_input_length = 0
+
+            if not evaluate and args.aux_train_data_file is not None:
+                number_of_lines = get_number_of_lines(args.aux_train_data_file)
+                with open(args.aux_train_data_file, encoding="utf-8") as f:
+                    reader = csv.reader(f, delimiter='\t')
+                    for row in tqdm(reader, desc='Tokenizing Auxiliary File', total=number_of_lines):
+                        self._add_example(row[0], None, args)
+
+            number_of_lines = get_number_of_lines(file_path)
             with open(file_path, encoding="utf-8") as f:
-                for line in tqdm(f, desc='Tokenizing'):
-                    tokens = tokenizer.tokenize(line)
-                    tokenized_text = tokenizer.convert_tokens_to_ids(tokens)
-                    tokenized_text = tokenized_text[0:block_size] # truncate longer sequences
-                    # print(tokenized_text)
-                    example = tokenizer.build_inputs_with_special_tokens(tokenized_text)
-                    max_input_length = max(max_input_length, len(example))
-                    try:
-                        prompt_token_location = tokenized_text.index(prompt_token_id)
-                    except ValueError:
-                        logger.warning('Prompt token not found after truncating the input. Dropping the example.')
-                        continue
+                reader = csv.reader(f, delimiter='\t')
+                for row in tqdm(reader, desc='Tokenizing', total=number_of_lines):
+                    self._add_example(row[0], row[1], args)

-                    self.examples.append(example)
-                    if args.train_all_tokens and not evaluate:
-                        self.labels.append(example)
-                    else: # During evaluation, we only care about the output sequence so we mask the input
-                        self.labels.append([-1]*(prompt_token_location+1)+example[prompt_token_location+1:])
-                    self.position_ids.append([pos for pos in range(prompt_token_location+1)]+[pos for pos in range(len(example)-prompt_token_location-1)])
-                    self.segment_ids.append([segment1_id]*(prompt_token_location+1)+[segment2_id]*(len(example)-prompt_token_location-1))
+
-            logger.info('Maximum input length: %d', max_input_length)
+            logger.info('Maximum input length: %d', self.max_input_length)
             logger.info("Saving features into cached file %s", cached_features_file)
             with open(cached_features_file, 'wb') as handle:
                 pickle.dump((self.examples, self.labels, self.position_ids, self.segment_ids), handle, protocol=pickle.HIGHEST_PROTOCOL)

+    def _add_example(self, input_sequence, output_sequence, args):
+        """
+        Args:
+            input_sequence: if None, a corrupted version of the output_sequence will be used
+        """
+        # TODO we should make use of tokenizer.build_inputs_with_special_tokens(sequence1, sequence2). Add special tokens manually only if our model does not support two sequences (like GPT2).
+        input_token_ids = self.tokenizer.encode(input_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.start_special_token)]
+        if output_sequence is None:
+            output_token_ids = []
+        else:
+            output_token_ids = self.tokenizer.encode(output_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.end_special_token)]
+        tokenized_text = input_token_ids + output_token_ids
+
+        tokenized_text = tokenized_text[0:self.block_size] # truncate longer sequences
+        # print('tokenized_text = ', tokenized_text)
+
+        example = self.tokenizer.build_inputs_with_special_tokens(tokenized_text)
+        # Remove duplicate end_token for models like BERT and RoBERTa that already add it
+        if example[-2] == self.end_token_id:
+            example = example[:-1]
+        # print('example = ', example)
+        self.max_input_length = max(self.max_input_length, len(example))
+        try:
+            prompt_token_location = example.index(self.prompt_token_id)
+        except ValueError:
+            logger.warning('Prompt token not found after truncating the input. Dropping the example.')
+            return
+
+        self.examples.append(example)
+        if args.train_all_tokens and not evaluate or output_sequence is None:
+            self.labels.append(example)
+        else: # During evaluation, we only care about the output_sequence so we mask the input
+            self.labels.append([-100]*(prompt_token_location+1)+example[prompt_token_location+1:])
+
+        position_ids2 = range(len(example)-prompt_token_location-1)
+        if args.reverse_position_ids:
+            position_ids2 = reversed(position_ids2)
+        self.position_ids.append(list(range(prompt_token_location+1)) + list(position_ids2))
+        self.segment_ids.append([self.segment1_id]*(prompt_token_location+1) + [self.segment2_id]*(len(example)-prompt_token_location-1))
+
+        # print('position_ids = ', self.position_ids[-1])
+        # print('segment_ids = ', self.segment_ids[-1])
+
     def __len__(self):
         return len(self.examples)
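# Illustration only, not part of the patch: roughly what _add_example stores for one row of
# the tab-separated input file ("<input sequence>\t<output sequence>"), ignoring whatever
# extra tokens tokenizer.build_inputs_with_special_tokens() may add. All token ids are made up.
start_special_id, end_special_id = 90, 91                # ids of --start_special_token / --end_special_token
input_token_ids = [11, 12, 13] + [start_special_id]      # encoded input, closed by the prompt token
output_token_ids = [21, 22] + [end_special_id]           # encoded output, closed by the end token
example = input_token_ids + output_token_ids

prompt_token_location = example.index(start_special_id)
labels = [-100] * (prompt_token_location + 1) + example[prompt_token_location + 1:]   # loss only on the output side
position_ids = list(range(prompt_token_location + 1)) + list(range(len(example) - prompt_token_location - 1))
segment_ids = [0] * (prompt_token_location + 1) + [1] * (len(example) - prompt_token_location - 1)  # GPT-2 would reuse the special token ids instead of 0/1

print(example)       # [11, 12, 13, 90, 21, 22, 91]
print(labels)        # [-100, -100, -100, -100, 21, 22, 91]
print(position_ids)  # [0, 1, 2, 3, 0, 1, 2]
print(segment_ids)   # [0, 0, 0, 0, 1, 1, 1]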
@@ -119,8 +166,24 @@ class TextDataset(Dataset):
         return torch.tensor(self.examples[item]), torch.tensor(self.labels[item]), torch.tensor(self.position_ids[item]), torch.tensor(self.segment_ids[item])


-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size, evaluate=evaluate)
+def get_transformer_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, dimension):
+    num_warmup_steps = max(1, num_warmup_steps)
+
+    def lr_lambda(current_step):
+        current_step += 1
+        return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step), current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))
+
+    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, aux=False):
+    if evaluate:
+        if aux:
+            file_path = args.aux_eval_data_file
+        else:
+            file_path = args.eval_data_file
+    else:
+        file_path = args.train_data_file
+    dataset = TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size, evaluate=evaluate)
     return dataset

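# Illustration only, not part of the patch: the multiplier that get_transformer_schedule_with_warmup
# hands to LambdaLR is the usual "Noam" shape scaled by 1/sqrt(dimension); the effective learning
# rate is args.learning_rate times this value. dimension=768 and num_warmup_steps=1000 below are
# example settings, not defaults taken from the patch.
import math

def noam_multiplier(current_step, dimension=768, num_warmup_steps=1000):
    current_step += 1  # mirrors the `current_step += 1` inside lr_lambda
    return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step),
                                           current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))

for step in (0, 499, 999, 3999, 15999):
    print(step, noam_multiplier(step))
# Grows linearly until the end of warmup (about 1.14e-3 at step 999 here), then decays as 1/sqrt(step).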
+ """ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + # print('labels.tolist() = ', labels.tolist()) + # print('special_tokens_mask = ', special_tokens_mask) probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -1 # We only compute loss on masked tokens + labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices @@ -178,7 +244,7 @@ def mask_tokens(inputs, tokenizer, args): def pad_collate(batch, pad_token_id): (inputs, labels, position_ids, segment_ids) = zip(*batch) inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id) - labels_pad = pad_sequence(labels, batch_first=True, padding_value=-1) + labels_pad = pad_sequence(labels, batch_first=True, padding_value=-100) position_ids = pad_sequence(position_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter segment_ids = pad_sequence(segment_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter @@ -227,7 +293,19 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + + if args.scheduler == 'linear': + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + elif args.scheduler == 'transformer': + if args.model_type == 'bert': + dimension = model.config.hidden_size + elif args.model_type == 'gpt2': + dimension = model.config.n_embd + else: + logger.error('Cannot detect hidden size dimensions in this model type. 
@@ -227,7 +293,19 @@
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
+    if args.scheduler == 'linear':
+        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    elif args.scheduler == 'transformer':
+        if args.model_type == 'bert':
+            dimension = model.config.hidden_size
+        elif args.model_type == 'gpt2':
+            dimension = model.config.n_embd
+        else:
+            logger.error('Cannot detect hidden size dimensions in this model type. Config: %s', model.config)
+        scheduler = get_transformer_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, dimension=dimension)
+    else:
+        logger.error('Unknown scheduler type.')

     # Check if saved optimizer or scheduler states exist
     if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
@@ -270,7 +348,7 @@ def train(args, train_dataset, model, tokenizer):
         # set global_step to gobal_step of last saved checkpoint from model path
         global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = (global_step % (len(train_dataloader) // args.gradient_accumulation_steps)) * args.gradient_accumulation_steps

         logger.info("  Continuing training from checkpoint, will skip to saved global_step")
         logger.info("  Continuing training from epoch %d", epochs_trained)
@@ -299,13 +377,22 @@
                 steps_trained_in_current_epoch -= 1
                 continue

-            inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            if args.mlm:
+                inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
             inputs = inputs.to(args.device)
             labels = labels.to(args.device)
             position_ids = position_ids.to(args.device)
             segment_ids = segment_ids.to(args.device)
             model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            # print('inputs', inputs)
+            # print('labels', labels)
+            # print('position_ids', position_ids.shape)
+            # print('segment_ids', segment_ids.shape)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

             if args.n_gpu > 1:
@@ -334,6 +421,10 @@
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
+                        if args.aux_eval_data_file is not None:
+                            aux_results = evaluate(args, model, tokenizer, aux=True)
+                            for key, value in aux_results.items():
+                                tb_writer.add_scalar('auxiliary_eval_{}'.format(key), value, global_step)
                         if best_eval_perplexity > results['perplexity']:
                             best_eval_perplexity = results['perplexity']
                             if not os.path.exists(args.output_dir):
@@ -356,7 +447,7 @@
                         tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                         logging_loss = tr_loss

-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 and args.save_total_limit > 0:
                     checkpoint_prefix = 'checkpoint'
                     # Save model checkpoint
                     output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
@@ -388,11 +479,11 @@ def train(args, train_dataset, model, tokenizer):
     return global_step, tr_loss / global_step


-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, prefix="", aux=False):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_output_dir = args.output_dir

-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, aux=aux)

     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
@@ -416,14 +507,19 @@
     model.eval()

     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch
+        inputs, labels, position_ids, segment_ids = batch
+        if args.mlm:
+            inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
         inputs = inputs.to(args.device)
         labels = labels.to(args.device)
         position_ids = position_ids.to(args.device)
         segment_ids = segment_ids.to(args.device)

         with torch.no_grad():
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
@@ -444,10 +540,11 @@
     return result


-def add_special_tokens(model, tokenizer, additional_special_tokens):
+def add_special_tokens(model, tokenizer, additional_special_tokens, pad_token=None):
     """ Add special tokens to the tokenizer and the model if they have not already been added. """
""" - ATTR_TO_SPECIAL_TOKEN = {'pad_token': '', - 'additional_special_tokens': additional_special_tokens} + ATTR_TO_SPECIAL_TOKEN = {'additional_special_tokens': additional_special_tokens} + if pad_token is not None: + ATTR_TO_SPECIAL_TOKEN['pad_token'] = pad_token orig_num_tokens = len(tokenizer) num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there if num_added_tokens > 0: @@ -456,25 +553,33 @@ def add_special_tokens(model, tokenizer, additional_special_tokens): def parse_argv(parser): ## Required parameters - parser.add_argument("--train_data_file", default=None, type=str, required=True, - help="The input training data file (a text file).") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument("--tensorboard_dir", default=None, type=str, required=True, + + ## Other parameters + parser.add_argument("--tensorboard_dir", default=None, type=str, help="The output directory where the tensorboard files will be written.") - - + parser.add_argument("--train_data_file", default=None, type=str, + help="The input training data file.") + parser.add_argument("--aux_train_data_file", default=None, type=str, + help="An input training data file for the target domain.") parser.add_argument('--start_special_token', type=str, default='', help='The special token for the start of paraphrases.') parser.add_argument('--end_special_token', type=str, default='', help='The special token for the end of paraphrases.') + parser.add_argument('--pad_token', type=str, default='', + help='The special token for padding..') parser.add_argument('--add_inbetween_as_special_tokens', action='store_true', help='The space-separated tokens between --start_special_token and --end_special_token will be added as special tokens. Useful for ThingTalk code.') parser.add_argument('--train_all_tokens', action='store_true', help='If True, the model will be trained on input and output sequences, as opposed to only tokens of the output sequence') - ## Other parameters + parser.add_argument("--reverse_position_ids", action='store_true', + help='If we assume we know the length of the output sequence beforehand, we can do a better job at generation.') + parser.add_argument("--eval_data_file", default=None, type=str, help="An optional input evaluation data file to evaluate the perplexity on (a text file).") + parser.add_argument("--aux_eval_data_file", default=None, type=str, + help="An additional input evaluation data file to evaluate the perplexity on (a text file).") parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.") @@ -525,6 +630,8 @@ def parse_argv(parser): help="If > 0: set total number of training steps to perform. Override num_train_epochs.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument("--scheduler", default='linear', type=str, choices=['linear', 'transformer'], + help="The type of learning rate scheduler to use.") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") @@ -556,6 +663,15 @@ def main(args): if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm: raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. 
                          "flag (masked language modeling).")
+    if args.model_type in ['bert'] and (args.pad_token != '[PAD]' or args.start_special_token != '[SEP]' or args.end_special_token != '[SEP]'):
+        raise ValueError("BERT already has its own special tokens [PAD] and [SEP]. You should use them for better results.")
+    if args.do_train:
+        if args.train_data_file is None:
+            raise ValueError("Cannot do training without a training data file. Either supply a file to --train_data_file "
+                             "or remove the --do_train argument.")
+        if args.tensorboard_dir is None:
+            raise ValueError("Cannot do training without specifying --tensorboard_dir")
+
     if args.eval_data_file is None and args.do_eval:
         raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                          "or remove the --do_eval argument.")
@@ -601,7 +717,7 @@ def main(args):
                                         from_tf=bool('.ckpt' in args.model_name_or_path),
                                         config=config,
                                         cache_dir=args.cache_dir if args.cache_dir else None)
-    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token])
+    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token], pad_token=args.pad_token)
     if args.add_inbetween_as_special_tokens:
         new_tokens = get_inbetween_tokens(args.train_data_file, start_token=args.start_special_token, end_token=args.end_special_token)
         logger.info('Detected %d new tokens', len(new_tokens))
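# Standalone sketch (illustration only, not code from this repository) of what the new
# --pad_token plumbing amounts to for GPT-2, written against the transformers 2.5.x
# tokenizer API; the token strings mirror the patch's defaults.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
num_added = tokenizer.add_special_tokens({'pad_token': '<pad>',
                                          'additional_special_tokens': ['<paraphrase>', '</paraphrase>']})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the new ids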