more features and fixes for paraphraser training

- auxiliary training set for mixing the seq2seq and language modeling losses
- auxiliary dev set to calculate perplexity on
- support training of masked LMs
- transformers==2.5.1
- reversed position ids for when the length of the output is assumed to be known
Sina 2020-04-18 19:03:45 -07:00
parent 423cc2330f
commit b0a0398576
2 changed files with 170 additions and 54 deletions


@ -18,7 +18,7 @@ pyrouge = ">=0.1.3"
sacrebleu = "~=1.0"
tensorboardX = "==2.0.*"
requests = "~=2.22"
transformers = "==2.3.0"
transformers = "==2.5.1"
radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
sentencepiece = ">=0.1.83,<0.2.0"


@ -30,6 +30,9 @@ import pickle
import re
import shutil
import torch
import math
import csv
import numpy as np
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.distributed import DistributedSampler
@ -47,7 +50,7 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
from genienlp.util import set_seed
from genienlp.util import set_seed, get_number_of_lines
logger = logging.getLogger(__name__)
@ -64,10 +67,12 @@ MODEL_CLASSES = {
class TextDataset(Dataset):
def __init__(self, tokenizer, args, file_path=None, block_size=512, prompt_token='<paraphrase>', evaluate=None):
def __init__(self, tokenizer, args, file_path=None, block_size=512, evaluate=None):
self.tokenizer = tokenizer
self.block_size = block_size
assert os.path.isfile(file_path)
directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(block_size) + '_' + filename)
cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(self.block_size) + '_' + filename)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
@ -76,42 +81,84 @@ class TextDataset(Dataset):
else:
logger.info("Creating features from dataset file at %s", file_path)
prompt_token_id = tokenizer.convert_tokens_to_ids(prompt_token)
segment1_id = tokenizer.convert_tokens_to_ids(args.start_special_token)
segment2_id = tokenizer.convert_tokens_to_ids(args.end_special_token)
self.prompt_token_id = self.tokenizer.convert_tokens_to_ids(args.start_special_token)
self.end_token_id = self.tokenizer.convert_tokens_to_ids(args.end_special_token)
self.segment1_id = 0
self.segment2_id = 1
if args.model_type == 'gpt2':
self.segment1_id = self.prompt_token_id
self.segment2_id = self.end_token_id
# print('prompt_token_id = ', prompt_token_id)
self.examples = []
self.labels = []
self.position_ids = []
self.segment_ids = []
max_input_length = 0
self.max_input_length = 0
if not evaluate and args.aux_train_data_file is not None:
number_of_lines = get_number_of_lines(args.aux_train_data_file)
with open(args.aux_train_data_file, encoding="utf-8") as f:
reader = csv.reader(f, delimiter='\t')
for row in tqdm(reader, desc='Tokenizing Auxiliary File', total=number_of_lines):
self._add_example(row[0], None, args)
number_of_lines = get_number_of_lines(file_path)
with open(file_path, encoding="utf-8") as f:
for line in tqdm(f, desc='Tokenizing'):
tokens = tokenizer.tokenize(line)
tokenized_text = tokenizer.convert_tokens_to_ids(tokens)
tokenized_text = tokenized_text[0:block_size] # truncate longer sequences
# print(tokenized_text)
example = tokenizer.build_inputs_with_special_tokens(tokenized_text)
max_input_length = max(max_input_length, len(example))
try:
prompt_token_location = tokenized_text.index(prompt_token_id)
except ValueError:
logger.warning('Prompt token not found after truncating the input. Dropping the example.')
continue
reader = csv.reader(f, delimiter='\t')
for row in tqdm(reader, desc='Tokenizing', total=number_of_lines):
self._add_example(row[0], row[1], args)
self.examples.append(example)
if args.train_all_tokens and not evaluate:
self.labels.append(example)
else: # During evaluation, we only care about the output sequence so we mask the input
self.labels.append([-1]*(prompt_token_location+1)+example[prompt_token_location+1:])
self.position_ids.append([pos for pos in range(prompt_token_location+1)]+[pos for pos in range(len(example)-prompt_token_location-1)])
self.segment_ids.append([segment1_id]*(prompt_token_location+1)+[segment2_id]*(len(example)-prompt_token_location-1))
logger.info('Maximum input length: %d', max_input_length)
logger.info('Maximum input length: %d', self.max_input_length)
logger.info("Saving features into cached file %s", cached_features_file)
with open(cached_features_file, 'wb') as handle:
pickle.dump((self.examples, self.labels, self.position_ids, self.segment_ids), handle, protocol=pickle.HIGHEST_PROTOCOL)
def _add_example(self, input_sequence, output_sequence, args):
"""
Args:
input_sequence: if None, a corrupted version of the output_sequence will be used
"""
# TODO: we should make use of tokenizer.build_inputs_with_special_tokens(sequence1, sequence2). Add special tokens manually only if our model does not support two sequences (like GPT2).
input_token_ids = self.tokenizer.encode(input_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.start_special_token)]
if output_sequence is None:
output_token_ids = []
else:
output_token_ids = self.tokenizer.encode(output_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.end_special_token)]
tokenized_text = input_token_ids + output_token_ids
tokenized_text = tokenized_text[0:self.block_size] # truncate longer sequences
# print('tokenized_text = ', tokenized_text)
example = self.tokenizer.build_inputs_with_special_tokens(tokenized_text)
# Remove duplicate end_token for models like BERT and RoBERTa that already add it
if example[-2] == self.end_token_id:
example = example[:-1]
# print('example = ', example)
self.max_input_length = max(self.max_input_length, len(example))
try:
prompt_token_location = example.index(self.prompt_token_id)
except ValueError:
logger.warning('Prompt token not found after truncating the input. Dropping the example.')
return
self.examples.append(example)
if args.train_all_tokens and not evaluate or output_sequence is None:
self.labels.append(example)
else: # During evaluation, we only care about the output_sequence so we mask the input
self.labels.append([-100]*(prompt_token_location+1)+example[prompt_token_location+1:])
position_ids2 = range(len(example)-prompt_token_location-1)
if args.reverse_position_ids:
position_ids2 = reversed(position_ids2)
self.position_ids.append(list(range(prompt_token_location+1)) + list(position_ids2))
self.segment_ids.append([self.segment1_id]*(prompt_token_location+1) + [self.segment2_id]*(len(example)-prompt_token_location-1))
# print('position_ids = ', self.position_ids[-1])
# print('segment_ids = ', self.segment_ids[-1])
def __len__(self):
return len(self.examples)
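For reference, here is a toy sketch of the position and segment id layout that _add_example produces, including the --reverse_position_ids variant; the token ids are made up, with 50 standing in for <paraphrase> and 51 for </paraphrase>:

```python
# Toy illustration of the id layout built in _add_example (made-up token ids).
example = [11, 12, 13, 50, 21, 22, 23, 24, 51]   # input + <paraphrase> + output + </paraphrase>
prompt_token_location = example.index(50)        # 3

# Forward position ids restart from 0 for the output segment:
position_ids = list(range(prompt_token_location + 1)) + \
               list(range(len(example) - prompt_token_location - 1))
# -> [0, 1, 2, 3, 0, 1, 2, 3, 4]

# With --reverse_position_ids the output positions count down instead, so the
# model always knows how many tokens are left before </paraphrase>:
reversed_position_ids = list(range(prompt_token_location + 1)) + \
                        list(reversed(range(len(example) - prompt_token_location - 1)))
# -> [0, 1, 2, 3, 4, 3, 2, 1, 0]

# One segment id for the input (up to and including <paraphrase>), another for the output:
segment_ids = [0] * (prompt_token_location + 1) + [1] * (len(example) - prompt_token_location - 1)
# -> [0, 0, 0, 0, 1, 1, 1, 1, 1]
```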
@ -119,8 +166,24 @@ class TextDataset(Dataset):
return torch.tensor(self.examples[item]), torch.tensor(self.labels[item]), torch.tensor(self.position_ids[item]), torch.tensor(self.segment_ids[item])
def load_and_cache_examples(args, tokenizer, evaluate=False):
dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size, evaluate=evaluate)
def get_transformer_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, dimension):
num_warmup_steps = max(1, num_warmup_steps)
def lr_lambda(current_step):
current_step += 1
return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step), current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))
return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
def load_and_cache_examples(args, tokenizer, evaluate=False, aux=False):
if evaluate:
if aux:
file_path = args.aux_eval_data_file
else:
file_path = args.eval_data_file
else:
file_path = args.train_data_file
dataset = TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size, evaluate=evaluate)
return dataset
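The transformer scheduler above is the inverse-square-root ("Noam") schedule from the original Transformer paper. Below is a minimal, self-contained usage sketch; the hidden size of 768 and the 4 warmup steps are arbitrary, and the base learning rate is set to 1.0 so the printed values are the schedule multiplier itself:

```python
import math
import torch

def get_transformer_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, dimension):
    # Same schedule as above: linear warmup, then decay proportional to 1/sqrt(step),
    # scaled by 1/sqrt(model dimension).
    num_warmup_steps = max(1, num_warmup_steps)
    def lr_lambda(current_step):
        current_step += 1
        return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step),
                                               current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

param = torch.nn.Parameter(torch.zeros(1))               # dummy parameter, just to build an optimizer
optimizer = torch.optim.AdamW([param], lr=1.0)           # base lr 1.0 -> printed lr equals the multiplier
scheduler = get_transformer_schedule_with_warmup(optimizer, num_warmup_steps=4,
                                                 num_training_steps=100, dimension=768)
for step in range(8):
    print(step, optimizer.param_groups[0]['lr'])         # peaks when current_step == num_warmup_steps
    optimizer.step()
    scheduler.step()
```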
@ -153,15 +216,18 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
shutil.rmtree(checkpoint)
def mask_tokens(inputs, tokenizer, args):
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
labels = inputs.clone()
def mask_tokens(inputs, labels, tokenizer, args):
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
"""
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, which defaults to 0.15 in BERT/RoBERTa)
probability_matrix = torch.full(labels.shape, args.mlm_probability)
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
# print('labels.tolist() = ', labels.tolist())
# print('special_tokens_mask = ', special_tokens_mask)
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -1 # We only compute loss on masked tokens
labels[~masked_indices] = -100 # We only compute loss on masked tokens
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
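The hunk above is cut off before the random-replacement branch of mask_tokens; for context, this is a self-contained sketch of the standard BERT-style 80/10/10 masking on toy tensors (the vocabulary size, mask token id, and input ids are made up, and the special-token filtering from the real function is omitted):

```python
import torch

vocab_size, mask_token_id, mlm_probability = 1000, 999, 0.15   # made-up values
inputs = torch.randint(5, vocab_size - 1, (2, 10))              # toy batch of token ids
labels = inputs.clone()

# Sample which positions are masked out for prediction.
probability_matrix = torch.full(labels.shape, mlm_probability)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100                                  # loss only on masked positions

# 80% of masked positions become the mask token.
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = mask_token_id

# 10% of masked positions become a random token (half of the remaining 20%).
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
inputs[indices_random] = random_words[indices_random]

# The remaining 10% keep their original token but are still predicted.
```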
@ -178,7 +244,7 @@ def mask_tokens(inputs, tokenizer, args):
def pad_collate(batch, pad_token_id):
(inputs, labels, position_ids, segment_ids) = zip(*batch)
inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id)
labels_pad = pad_sequence(labels, batch_first=True, padding_value=-1)
labels_pad = pad_sequence(labels, batch_first=True, padding_value=-100)
position_ids = pad_sequence(position_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter
segment_ids = pad_sequence(segment_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter
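The switch from -1 to -100 for ignored label positions (here and in mask_tokens above) lines up with newer transformers releases, which compute the LM loss with PyTorch's CrossEntropyLoss and its default ignore_index of -100. A tiny sketch of the effect:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(2, 5)              # two positions, 5-way vocabulary
labels = torch.tensor([3, -100])        # the second position is padding / masked-out input

loss = F.cross_entropy(logits, labels)               # ignore_index defaults to -100
loss_first_only = F.cross_entropy(logits[:1], labels[:1])
assert torch.allclose(loss, loss_first_only)         # the -100 position contributes nothing
```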
@ -227,7 +293,19 @@ def train(args, train_dataset, model, tokenizer):
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.scheduler == 'linear':
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
elif args.scheduler == 'transformer':
if args.model_type == 'bert':
dimension = model.config.hidden_size
elif args.model_type == 'gpt2':
dimension = model.config.n_embd
else:
logger.error('Cannot detect hidden size dimensions in this model type. Config: %s', model.config)
scheduler = get_transformer_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, dimension=dimension)
else:
logger.error('Unknown scheduler type.')
# Check if saved optimizer or scheduler states exist
if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
@ -270,7 +348,7 @@ def train(args, train_dataset, model, tokenizer):
# set global_step to the global_step of the last saved checkpoint from the model path
global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = (global_step % (len(train_dataloader) // args.gradient_accumulation_steps)) * args.gradient_accumulation_steps
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(" Continuing training from epoch %d", epochs_trained)
@ -299,13 +377,22 @@ def train(args, train_dataset, model, tokenizer):
steps_trained_in_current_epoch -= 1
continue
inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch # batch is a tuple (input, labels, position_ids, segment_ids)
inputs, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
if args.mlm:
inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
inputs = inputs.to(args.device)
labels = labels.to(args.device)
position_ids = position_ids.to(args.device)
segment_ids = segment_ids.to(args.device)
model.train()
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
# print('inputs', inputs)
# print('labels', labels)
# print('position_ids', position_ids.shape)
# print('segment_ids', segment_ids.shape)
if args.mlm:
outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
else:
outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
if args.n_gpu > 1:
@ -334,6 +421,10 @@ def train(args, train_dataset, model, tokenizer):
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
if args.aux_eval_data_file is not None:
aux_results = evaluate(args, model, tokenizer, aux=True)
for key, value in aux_results.items():
tb_writer.add_scalar('auxiliary_eval_{}'.format(key), value, global_step)
if best_eval_perplexity > results['perplexity']:
best_eval_perplexity = results['perplexity']
if not os.path.exists(args.output_dir):
@ -356,7 +447,7 @@ def train(args, train_dataset, model, tokenizer):
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 and args.save_total_limit > 0:
checkpoint_prefix = 'checkpoint'
# Save model checkpoint
output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
@ -388,11 +479,11 @@ def train(args, train_dataset, model, tokenizer):
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
def evaluate(args, model, tokenizer, prefix="", aux=False):
# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_output_dir = args.output_dir
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, aux=aux)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
@ -416,14 +507,19 @@ def evaluate(args, model, tokenizer, prefix=""):
model.eval()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch
inputs, labels, position_ids, segment_ids = batch
if args.mlm:
inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
inputs = inputs.to(args.device)
labels = labels.to(args.device)
position_ids = position_ids.to(args.device)
segment_ids = segment_ids.to(args.device)
with torch.no_grad():
outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
if args.mlm:
outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
else:
outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
lm_loss = outputs[0]
eval_loss += lm_loss.mean().item()
nb_eval_steps += 1
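The perplexity reported by evaluate (and logged for the auxiliary dev set) is the exponential of the average per-step loss accumulated above; the exact lines are elided from the diff, but the standard computation looks like this sketch with toy numbers:

```python
import torch

eval_loss, nb_eval_steps = 75.0, 20                  # toy stand-ins for the accumulators above

eval_loss = eval_loss / nb_eval_steps                # average cross-entropy per evaluation step
perplexity = torch.exp(torch.tensor(eval_loss))      # exp of the average loss
print({"perplexity": perplexity.item()})             # ~42.5 here; lower is better
```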
@ -444,10 +540,11 @@ def evaluate(args, model, tokenizer, prefix=""):
return result
def add_special_tokens(model, tokenizer, additional_special_tokens):
def add_special_tokens(model, tokenizer, additional_special_tokens, pad_token=None):
""" Add special tokens to the tokenizer and the model if they have not already been added. """
ATTR_TO_SPECIAL_TOKEN = {'pad_token': '<pad>',
'additional_special_tokens': additional_special_tokens}
ATTR_TO_SPECIAL_TOKEN = {'additional_special_tokens': additional_special_tokens}
if pad_token is not None:
ATTR_TO_SPECIAL_TOKEN['pad_token'] = pad_token
orig_num_tokens = len(tokenizer)
num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
if num_added_tokens > 0:
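The body of add_special_tokens is partly elided above; as an assumption-labeled sketch of the usual pattern it follows (public transformers 2.x API, with 'gpt2' just an example checkpoint), the tokens are added to the tokenizer and the model's embedding matrix is grown to match:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

orig_num_tokens = len(tokenizer)
num_added_tokens = tokenizer.add_special_tokens({
    'pad_token': '<pad>',
    'additional_special_tokens': ['<paraphrase>', '</paraphrase>'],
})                                                   # no-op for tokens that already exist
if num_added_tokens > 0:
    # Assumed step: resize input/output embeddings so the new ids have rows.
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
```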
@ -456,25 +553,33 @@ def add_special_tokens(model, tokenizer, additional_special_tokens):
def parse_argv(parser):
## Required parameters
parser.add_argument("--train_data_file", default=None, type=str, required=True,
help="The input training data file (a text file).")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--tensorboard_dir", default=None, type=str, required=True,
## Other parameters
parser.add_argument("--tensorboard_dir", default=None, type=str,
help="The output directory where the tensorboard files will be written.")
parser.add_argument("--train_data_file", default=None, type=str,
help="The input training data file.")
parser.add_argument("--aux_train_data_file", default=None, type=str,
help="An input training data file for the target domain.")
parser.add_argument('--start_special_token', type=str, default='<paraphrase>',
help='The special token for the start of paraphrases.')
parser.add_argument('--end_special_token', type=str, default='</paraphrase>',
help='The special token for the end of paraphrases.')
parser.add_argument('--pad_token', type=str, default='<pad>',
help='The special token for padding.')
parser.add_argument('--add_inbetween_as_special_tokens', action='store_true',
help='The space-separated tokens between --start_special_token and --end_special_token will be added as special tokens. Useful for ThingTalk code.')
parser.add_argument('--train_all_tokens', action='store_true',
help='If True, the model will be trained on input and output sequences, as opposed to only tokens of the output sequence')
## Other parameters
parser.add_argument("--reverse_position_ids", action='store_true',
help='Use reversed position ids for the output tokens; helpful when the length of the output sequence is assumed to be known beforehand at generation time.')
parser.add_argument("--eval_data_file", default=None, type=str,
help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
parser.add_argument("--aux_eval_data_file", default=None, type=str,
help="An additional input evaluation data file to evaluate the perplexity on (a text file).")
parser.add_argument("--model_type", default="bert", type=str,
help="The model architecture to be fine-tuned.")
@ -525,6 +630,8 @@ def parse_argv(parser):
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--warmup_steps", default=0, type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--scheduler", default='linear', type=str, choices=['linear', 'transformer'],
help="The type of learning rate scheduler to use.")
parser.add_argument('--logging_steps', type=int, default=50,
help="Log every X updates steps.")
@ -556,6 +663,15 @@ def main(args):
if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
"flag (masked language modeling).")
if args.model_type in ['bert'] and (args.pad_token != '[PAD]' or args.start_special_token != '[SEP]' or args.end_special_token != '[SEP]'):
raise ValueError("BERT already has its own special tokens [PAD] and [SEP]. You should use them for better results.")
if args.do_train:
if args.train_data_file is None:
raise ValueError("Cannot do training without a training data file. Either supply a file to --train_data_file "
"or remove the --do_train argument.")
if args.tensorboard_dir is None:
raise ValueError("Cannot do training without specifying --tensorboard_dir")
if args.eval_data_file is None and args.do_eval:
raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
"or remove the --do_eval argument.")
@ -601,7 +717,7 @@ def main(args):
from_tf=bool('.ckpt' in args.model_name_or_path),
config=config,
cache_dir=args.cache_dir if args.cache_dir else None)
add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token])
add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token], pad_token=args.pad_token)
if args.add_inbetween_as_special_tokens:
new_tokens = get_inbetween_tokens(args.train_data_file, start_token=args.start_special_token, end_token=args.end_special_token)
logger.info('Detected %d new tokens', len(new_tokens))