diff --git a/Pipfile b/Pipfile
index 1d23a164..a82cc1ff 100644
--- a/Pipfile
+++ b/Pipfile
@@ -18,7 +18,7 @@
 pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.3.0"
+transformers = "==2.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
 sentencepiece = ">=0.1.83,<0.2.0"
diff --git a/genienlp/run_lm_finetuning.py b/genienlp/run_lm_finetuning.py
index 137d07ac..896efc79 100644
--- a/genienlp/run_lm_finetuning.py
+++ b/genienlp/run_lm_finetuning.py
@@ -30,6 +30,9 @@ import pickle
 import re
 import shutil
 import torch
+import math
+import csv
+import numpy as np
 from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data.distributed import DistributedSampler
@@ -47,7 +50,7 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)

-from genienlp.util import set_seed
+from genienlp.util import set_seed, get_number_of_lines


 logger = logging.getLogger(__name__)
@@ -64,10 +67,12 @@ MODEL_CLASSES = {


 class TextDataset(Dataset):
-    def __init__(self, tokenizer, args, file_path=None, block_size=512, prompt_token='<paraphrase>', evaluate=None):
+    def __init__(self, tokenizer, args, file_path=None, block_size=512, evaluate=None):
+        self.tokenizer = tokenizer
+        self.block_size = block_size
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(self.block_size) + '_' + filename)

         if os.path.exists(cached_features_file) and not args.overwrite_cache:
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -76,42 +81,84 @@ class TextDataset(Dataset):
         else:
             logger.info("Creating features from dataset file at %s", file_path)

-            prompt_token_id = tokenizer.convert_tokens_to_ids(prompt_token)
-            segment1_id = tokenizer.convert_tokens_to_ids(args.start_special_token)
-            segment2_id = tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.prompt_token_id = self.tokenizer.convert_tokens_to_ids(args.start_special_token)
+            self.end_token_id = self.tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.segment1_id = 0
+            self.segment2_id = 1
+            if args.model_type == 'gpt2':
+                self.segment1_id = self.prompt_token_id
+                self.segment2_id = self.end_token_id
             # print('prompt_token_id = ', prompt_token_id)
             self.examples = []
             self.labels = []
             self.position_ids = []
             self.segment_ids = []
-            max_input_length = 0
+            self.max_input_length = 0
+
+            if not evaluate and args.aux_train_data_file is not None:
+                number_of_lines = get_number_of_lines(args.aux_train_data_file)
+                with open(args.aux_train_data_file, encoding="utf-8") as f:
+                    reader = csv.reader(f, delimiter='\t')
+                    for row in tqdm(reader, desc='Tokenizing Auxiliary File', total=number_of_lines):
+                        self._add_example(row[0], None, args)
+
+            number_of_lines = get_number_of_lines(file_path)
             with open(file_path, encoding="utf-8") as f:
-                for line in tqdm(f, desc='Tokenizing'):
-                    tokens = tokenizer.tokenize(line)
-                    tokenized_text = tokenizer.convert_tokens_to_ids(tokens)
-                    tokenized_text = tokenized_text[0:block_size] # truncate longer sequences
-                    # print(tokenized_text)
-                    example = tokenizer.build_inputs_with_special_tokens(tokenized_text)
-                    max_input_length = max(max_input_length, len(example))
-                    try:
-                        prompt_token_location = tokenized_text.index(prompt_token_id)
-                    except ValueError:
-                        logger.warning('Prompt token not found after truncating the input. Dropping the example.')
-                        continue
+                reader = csv.reader(f, delimiter='\t')
+                for row in tqdm(reader, desc='Tokenizing', total=number_of_lines):
+                    self._add_example(row[0], row[1], args)

-                    self.examples.append(example)
-                    if args.train_all_tokens and not evaluate:
-                        self.labels.append(example)
-                    else: # During evaluation, we only care about the output sequence so we mask the input
-                        self.labels.append([-1]*(prompt_token_location+1)+example[prompt_token_location+1:])
-                    self.position_ids.append([pos for pos in range(prompt_token_location+1)]+[pos for pos in range(len(example)-prompt_token_location-1)])
-                    self.segment_ids.append([segment1_id]*(prompt_token_location+1)+[segment2_id]*(len(example)-prompt_token_location-1))
+
-            logger.info('Maximum input length: %d', max_input_length)
+            logger.info('Maximum input length: %d', self.max_input_length)
             logger.info("Saving features into cached file %s", cached_features_file)
             with open(cached_features_file, 'wb') as handle:
                 pickle.dump((self.examples, self.labels, self.position_ids, self.segment_ids), handle, protocol=pickle.HIGHEST_PROTOCOL)

+    def _add_example(self, input_sequence, output_sequence, args):
+        """
+        Args:
+            input_sequence: if None, a corrupted version of the output_sequence will be used
+        """
+        # TODO we should make use of tokenizer.build_inputs_with_special_tokens(sequence1, sequence2). Add special tokens manually only if our model does not support two sequences (like GPT2).
+        input_token_ids = self.tokenizer.encode(input_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.start_special_token)]
+        if output_sequence is None:
+            output_token_ids = []
+        else:
+            output_token_ids = self.tokenizer.encode(output_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.end_special_token)]
+        tokenized_text = input_token_ids + output_token_ids
+
+        tokenized_text = tokenized_text[0:self.block_size] # truncate longer sequences
+        # print('tokenized_text = ', tokenized_text)
+
+        example = self.tokenizer.build_inputs_with_special_tokens(tokenized_text)
+        # Remove duplicate end_token for models like BERT and RoBERTa that already add it
+        if example[-2] == self.end_token_id:
+            example = example[:-1]
+        # print('example = ', example)
+        self.max_input_length = max(self.max_input_length, len(example))
+        try:
+            prompt_token_location = example.index(self.prompt_token_id)
+        except ValueError:
+            logger.warning('Prompt token not found after truncating the input. Dropping the example.')
+            return
+
+        self.examples.append(example)
+        if args.train_all_tokens and not evaluate or output_sequence is None:
+            self.labels.append(example)
+        else: # During evaluation, we only care about the output_sequence so we mask the input
+            self.labels.append([-100]*(prompt_token_location+1)+example[prompt_token_location+1:])
+
+        position_ids2 = range(len(example)-prompt_token_location-1)
+        if args.reverse_position_ids:
+            position_ids2 = reversed(position_ids2)
+        self.position_ids.append(list(range(prompt_token_location+1)) + list(position_ids2))
+        self.segment_ids.append([self.segment1_id]*(prompt_token_location+1) + [self.segment2_id]*(len(example)-prompt_token_location-1))
+
+        # print('position_ids = ', self.position_ids[-1])
+        # print('segment_ids = ', self.segment_ids[-1])
+
     def __len__(self):
         return len(self.examples)
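# Illustration only, not part of the patch: roughly what _add_example stores for one row of
# the tab-separated input file ("<input sequence>\t<output sequence>"), ignoring whatever
# extra tokens tokenizer.build_inputs_with_special_tokens() may add. All token ids are made up.
start_special_id, end_special_id = 90, 91                # ids of --start_special_token / --end_special_token
input_token_ids = [11, 12, 13] + [start_special_id]      # encoded input, closed by the prompt token
output_token_ids = [21, 22] + [end_special_id]           # encoded output, closed by the end token
example = input_token_ids + output_token_ids

prompt_token_location = example.index(start_special_id)
labels = [-100] * (prompt_token_location + 1) + example[prompt_token_location + 1:]   # loss only on the output side
position_ids = list(range(prompt_token_location + 1)) + list(range(len(example) - prompt_token_location - 1))
segment_ids = [0] * (prompt_token_location + 1) + [1] * (len(example) - prompt_token_location - 1)  # GPT-2 would reuse the special token ids instead of 0/1

print(example)       # [11, 12, 13, 90, 21, 22, 91]
print(labels)        # [-100, -100, -100, -100, 21, 22, 91]
print(position_ids)  # [0, 1, 2, 3, 0, 1, 2]
print(segment_ids)   # [0, 0, 0, 0, 1, 1, 1]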
@@ -119,8 +166,24 @@ class TextDataset(Dataset):
         return torch.tensor(self.examples[item]), torch.tensor(self.labels[item]), torch.tensor(self.position_ids[item]), torch.tensor(self.segment_ids[item])


-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size, evaluate=evaluate)
+def get_transformer_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, dimension):
+    num_warmup_steps = max(1, num_warmup_steps)
+
+    def lr_lambda(current_step):
+        current_step += 1
+        return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step), current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))
+
+    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, aux=False):
+    if evaluate:
+        if aux:
+            file_path = args.aux_eval_data_file
+        else:
+            file_path = args.eval_data_file
+    else:
+        file_path = args.train_data_file
+    dataset = TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size, evaluate=evaluate)
     return dataset

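# Illustration only, not part of the patch: the multiplier that get_transformer_schedule_with_warmup
# hands to LambdaLR is the usual "Noam" shape scaled by 1/sqrt(dimension); the effective learning
# rate is args.learning_rate times this value. dimension=768 and num_warmup_steps=1000 below are
# example settings, not defaults taken from the patch.
import math

def noam_multiplier(current_step, dimension=768, num_warmup_steps=1000):
    current_step += 1  # mirrors the `current_step += 1` inside lr_lambda
    return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step),
                                           current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))

for step in (0, 499, 999, 3999, 15999):
    print(step, noam_multiplier(step))
# Grows linearly until the end of warmup (about 1.14e-3 at step 999 here), then decays as 1/sqrt(step).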
+ """ # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + # print('labels.tolist() = ', labels.tolist()) + # print('special_tokens_mask = ', special_tokens_mask) probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -1 # We only compute loss on masked tokens + labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices @@ -178,7 +244,7 @@ def mask_tokens(inputs, tokenizer, args): def pad_collate(batch, pad_token_id): (inputs, labels, position_ids, segment_ids) = zip(*batch) inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id) - labels_pad = pad_sequence(labels, batch_first=True, padding_value=-1) + labels_pad = pad_sequence(labels, batch_first=True, padding_value=-100) position_ids = pad_sequence(position_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter segment_ids = pad_sequence(segment_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter @@ -227,7 +293,19 @@ def train(args, train_dataset, model, tokenizer): {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + + if args.scheduler == 'linear': + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + elif args.scheduler == 'transformer': + if args.model_type == 'bert': + dimension = model.config.hidden_size + elif args.model_type == 'gpt2': + dimension = model.config.n_embd + else: + logger.error('Cannot detect hidden size dimensions in this model type. 
@@ -227,7 +293,19 @@
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
+    if args.scheduler == 'linear':
+        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    elif args.scheduler == 'transformer':
+        if args.model_type == 'bert':
+            dimension = model.config.hidden_size
+        elif args.model_type == 'gpt2':
+            dimension = model.config.n_embd
+        else:
+            logger.error('Cannot detect hidden size dimensions in this model type. Config: %s', model.config)
+        scheduler = get_transformer_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, dimension=dimension)
+    else:
+        logger.error('Unknown scheduler type.')

     # Check if saved optimizer or scheduler states exist
     if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
@@ -270,7 +348,7 @@ def train(args, train_dataset, model, tokenizer):
         # set global_step to gobal_step of last saved checkpoint from model path
         global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = (global_step % (len(train_dataloader) // args.gradient_accumulation_steps)) * args.gradient_accumulation_steps

         logger.info("  Continuing training from checkpoint, will skip to saved global_step")
         logger.info("  Continuing training from epoch %d", epochs_trained)
@@ -299,13 +377,22 @@
                 steps_trained_in_current_epoch -= 1
                 continue

-            inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            if args.mlm:
+                inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
             inputs = inputs.to(args.device)
             labels = labels.to(args.device)
             position_ids = position_ids.to(args.device)
             segment_ids = segment_ids.to(args.device)
             model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            # print('inputs', inputs)
+            # print('labels', labels)
+            # print('position_ids', position_ids.shape)
+            # print('segment_ids', segment_ids.shape)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

             if args.n_gpu > 1:
@@ -334,6 +421,10 @@
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
+                        if args.aux_eval_data_file is not None:
+                            aux_results = evaluate(args, model, tokenizer, aux=True)
+                            for key, value in aux_results.items():
+                                tb_writer.add_scalar('auxiliary_eval_{}'.format(key), value, global_step)
                         if best_eval_perplexity > results['perplexity']:
                             best_eval_perplexity = results['perplexity']
                             if not os.path.exists(args.output_dir):
@@ -356,7 +447,7 @@
                         tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                         logging_loss = tr_loss

-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 and args.save_total_limit > 0:
                     checkpoint_prefix = 'checkpoint'
                     # Save model checkpoint
                     output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
@@ -388,11 +479,11 @@ def train(args, train_dataset, model, tokenizer):
     return global_step, tr_loss / global_step


-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, prefix="", aux=False):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_output_dir = args.output_dir

-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, aux=aux)

     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
@@ -416,14 +507,19 @@
     model.eval()

     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch
+        inputs, labels, position_ids, segment_ids = batch
+        if args.mlm:
+            inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
         inputs = inputs.to(args.device)
         labels = labels.to(args.device)
         position_ids = position_ids.to(args.device)
         segment_ids = segment_ids.to(args.device)

         with torch.no_grad():
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
@@ -444,10 +540,11 @@
     return result


-def add_special_tokens(model, tokenizer, additional_special_tokens):
+def add_special_tokens(model, tokenizer, additional_special_tokens, pad_token=None):
     """ Add special tokens to the tokenizer and the model if they have not already been added. """
""" - ATTR_TO_SPECIAL_TOKEN = {'pad_token': '', - 'additional_special_tokens': additional_special_tokens} + ATTR_TO_SPECIAL_TOKEN = {'additional_special_tokens': additional_special_tokens} + if pad_token is not None: + ATTR_TO_SPECIAL_TOKEN['pad_token'] = pad_token orig_num_tokens = len(tokenizer) num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there if num_added_tokens > 0: @@ -456,25 +553,33 @@ def add_special_tokens(model, tokenizer, additional_special_tokens): def parse_argv(parser): ## Required parameters - parser.add_argument("--train_data_file", default=None, type=str, required=True, - help="The input training data file (a text file).") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument("--tensorboard_dir", default=None, type=str, required=True, + + ## Other parameters + parser.add_argument("--tensorboard_dir", default=None, type=str, help="The output directory where the tensorboard files will be written.") - - + parser.add_argument("--train_data_file", default=None, type=str, + help="The input training data file.") + parser.add_argument("--aux_train_data_file", default=None, type=str, + help="An input training data file for the target domain.") parser.add_argument('--start_special_token', type=str, default='', help='The special token for the start of paraphrases.') parser.add_argument('--end_special_token', type=str, default='', help='The special token for the end of paraphrases.') + parser.add_argument('--pad_token', type=str, default='', + help='The special token for padding..') parser.add_argument('--add_inbetween_as_special_tokens', action='store_true', help='The space-separated tokens between --start_special_token and --end_special_token will be added as special tokens. Useful for ThingTalk code.') parser.add_argument('--train_all_tokens', action='store_true', help='If True, the model will be trained on input and output sequences, as opposed to only tokens of the output sequence') - ## Other parameters + parser.add_argument("--reverse_position_ids", action='store_true', + help='If we assume we know the length of the output sequence beforehand, we can do a better job at generation.') + parser.add_argument("--eval_data_file", default=None, type=str, help="An optional input evaluation data file to evaluate the perplexity on (a text file).") + parser.add_argument("--aux_eval_data_file", default=None, type=str, + help="An additional input evaluation data file to evaluate the perplexity on (a text file).") parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.") @@ -525,6 +630,8 @@ def parse_argv(parser): help="If > 0: set total number of training steps to perform. Override num_train_epochs.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument("--scheduler", default='linear', type=str, choices=['linear', 'transformer'], + help="The type of learning rate scheduler to use.") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") @@ -556,6 +663,15 @@ def main(args): if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm: raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. 
                          "flag (masked language modeling).")
+    if args.model_type in ['bert'] and (args.pad_token != '[PAD]' or args.start_special_token != '[SEP]' or args.end_special_token != '[SEP]'):
+        raise ValueError("BERT already has its own special tokens [PAD] and [SEP]. You should use them for better results.")
+    if args.do_train:
+        if args.train_data_file is None:
+            raise ValueError("Cannot do training without a training data file. Either supply a file to --train_data_file "
+                             "or remove the --do_train argument.")
+        if args.tensorboard_dir is None:
+            raise ValueError("Cannot do training without specifying --tensorboard_dir")
+
     if args.eval_data_file is None and args.do_eval:
         raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                          "or remove the --do_eval argument.")
@@ -601,7 +717,7 @@ def main(args):
                                         from_tf=bool('.ckpt' in args.model_name_or_path),
                                         config=config,
                                         cache_dir=args.cache_dir if args.cache_dir else None)
-    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token])
+    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token], pad_token=args.pad_token)
     if args.add_inbetween_as_special_tokens:
         new_tokens = get_inbetween_tokens(args.train_data_file, start_token=args.start_special_token, end_token=args.end_special_token)
         logger.info('Detected %d new tokens', len(new_tokens))
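# Standalone sketch (illustration only, not code from this repository) of what the new
# --pad_token plumbing amounts to for GPT-2, written against the transformers 2.5.x
# tokenizer API; the token strings mirror the patch's defaults.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
num_added = tokenizer.add_special_tokens({'pad_token': '<pad>',
                                          'additional_special_tokens': ['<paraphrase>', '</paraphrase>']})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the new ids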