more features and fixes for paraphraser training

- auxiliary train set for mixing seq2seq and LM modeling loss
- auxiliary dev set to calculate perplexity on
- support training of masked LMs
- transformers==2.5.1
- reversed position ids for when the length of the output is assumed to be known
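The last point, reversed position ids, is easiest to see in a small standalone sketch (the helper below is illustrative only, not code from this commit): when the output length is known, positions in the output segment count down instead of up, so at every step the model knows how many tokens remain.

# Illustrative sketch only: position ids for an input<paraphrase>output pair,
# with the output positions optionally counting down to zero.
def build_position_ids(input_len, output_len, reverse_output=False):
    input_positions = list(range(input_len))
    output_positions = list(range(output_len))
    if reverse_output:
        output_positions = list(reversed(output_positions))  # e.g. [3, 2, 1, 0]
    return input_positions + output_positions

print(build_position_ids(3, 4))                        # [0, 1, 2, 0, 1, 2, 3]
print(build_position_ids(3, 4, reverse_output=True))   # [0, 1, 2, 3, 2, 1, 0]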
This commit is contained in:
parent 423cc2330f
commit b0a0398576
Pipfile (2 changed lines)
@@ -18,7 +18,7 @@ pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.3.0"
+transformers = "==2.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
 sentencepiece = ">=0.1.83,<0.2.0"
@@ -30,6 +30,9 @@ import pickle
 import re
 import shutil
 import torch
+import math
+import csv
 import numpy as np
 from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data.distributed import DistributedSampler
@@ -47,7 +50,7 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)

-from genienlp.util import set_seed
+from genienlp.util import set_seed, get_number_of_lines


 logger = logging.getLogger(__name__)
@@ -64,10 +67,12 @@ MODEL_CLASSES = {


 class TextDataset(Dataset):
-    def __init__(self, tokenizer, args, file_path=None, block_size=512, prompt_token='<paraphrase>', evaluate=None):
+    def __init__(self, tokenizer, args, file_path=None, block_size=512, evaluate=None):
+        self.tokenizer = tokenizer
+        self.block_size = block_size
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(block_size) + '_' + filename)
+        cached_features_file = os.path.join(directory, os.path.basename(os.path.normpath(args.model_name_or_path)) + '_cached_lm_' + str(self.block_size) + '_' + filename)

         if os.path.exists(cached_features_file) and not args.overwrite_cache:
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -76,42 +81,84 @@ class TextDataset(Dataset):
         else:
             logger.info("Creating features from dataset file at %s", file_path)

-            prompt_token_id = tokenizer.convert_tokens_to_ids(prompt_token)
-            segment1_id = tokenizer.convert_tokens_to_ids(args.start_special_token)
-            segment2_id = tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.prompt_token_id = self.tokenizer.convert_tokens_to_ids(args.start_special_token)
+            self.end_token_id = self.tokenizer.convert_tokens_to_ids(args.end_special_token)
+            self.segment1_id = 0
+            self.segment2_id = 1
+            if args.model_type == 'gpt2':
+                self.segment1_id = self.prompt_token_id
+                self.segment2_id = self.end_token_id
             # print('prompt_token_id = ', prompt_token_id)
             self.examples = []
             self.labels = []
             self.position_ids = []
             self.segment_ids = []
-            max_input_length = 0
+            self.max_input_length = 0

+            if not evaluate and args.aux_train_data_file is not None:
+                number_of_lines = get_number_of_lines(args.aux_train_data_file)
+                with open(args.aux_train_data_file, encoding="utf-8") as f:
+                    reader = csv.reader(f, delimiter='\t')
+                    for row in tqdm(reader, desc='Tokenizing Auxiliary File', total=number_of_lines):
+                        self._add_example(row[0], None, args)

+            number_of_lines = get_number_of_lines(file_path)
             with open(file_path, encoding="utf-8") as f:
-                for line in tqdm(f, desc='Tokenizing'):
-                    tokens = tokenizer.tokenize(line)
-                    tokenized_text = tokenizer.convert_tokens_to_ids(tokens)
-                    tokenized_text = tokenized_text[0:block_size] # truncate longer sequences
-                    # print(tokenized_text)
-                    example = tokenizer.build_inputs_with_special_tokens(tokenized_text)
-                    max_input_length = max(max_input_length, len(example))
-                    try:
-                        prompt_token_location = tokenized_text.index(prompt_token_id)
-                    except ValueError:
-                        logger.warning('Prompt token not found after truncating the input. Dropping the example.')
-                        continue
+                reader = csv.reader(f, delimiter='\t')
+                for row in tqdm(reader, desc='Tokenizing', total=number_of_lines):
+                    self._add_example(row[0], row[1], args)

-                    self.examples.append(example)
-                    if args.train_all_tokens and not evaluate:
-                        self.labels.append(example)
-                    else: # During evaluation, we only care about the output sequence so we mask the input
-                        self.labels.append([-1]*(prompt_token_location+1)+example[prompt_token_location+1:])
-                    self.position_ids.append([pos for pos in range(prompt_token_location+1)]+[pos for pos in range(len(example)-prompt_token_location-1)])
-                    self.segment_ids.append([segment1_id]*(prompt_token_location+1)+[segment2_id]*(len(example)-prompt_token_location-1))


-            logger.info('Maximum input length: %d', max_input_length)
+            logger.info('Maximum input length: %d', self.max_input_length)
             logger.info("Saving features into cached file %s", cached_features_file)
             with open(cached_features_file, 'wb') as handle:
                 pickle.dump((self.examples, self.labels, self.position_ids, self.segment_ids), handle, protocol=pickle.HIGHEST_PROTOCOL)

+    def _add_example(self, input_sequence, output_sequence, args):
+        """
+        Args:
+            input_sequence: if None, a corrupted version of the output_sequence will be used
+        """
+        # TODO we should make use of tokenizer.build_inputs_with_special_tokens(sequence1, sequence2). Add special tokens manualy only if our model does not support two sequences (like GPT2).

+        input_token_ids = self.tokenizer.encode(input_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.start_special_token)]
+        if output_sequence is None:
+            output_token_ids = []
+        else:
+            output_token_ids = self.tokenizer.encode(output_sequence, add_special_tokens=False) + [self.tokenizer.convert_tokens_to_ids(args.end_special_token)]
+        tokenized_text = input_token_ids + output_token_ids

+        tokenized_text = tokenized_text[0:self.block_size] # truncate longer sequences
+        # print('tokenized_text = ', tokenized_text)

+        example = self.tokenizer.build_inputs_with_special_tokens(tokenized_text)
+        # Remove duplicate end_token for models like BERT and RoBERTa that already add it
+        if example[-2] == self.end_token_id:
+            example = example[:-1]
+        # print('example = ', example)
+        self.max_input_length = max(self.max_input_length, len(example))
+        try:
+            prompt_token_location = example.index(self.prompt_token_id)
+        except ValueError:
+            logger.warning('Prompt token not found after truncating the input. Dropping the example.')
+            return

+        self.examples.append(example)
+        if args.train_all_tokens and not evaluate or output_sequence is None:
+            self.labels.append(example)
+        else: # During evaluation, we only care about the output_sequence so we mask the input
+            self.labels.append([-100]*(prompt_token_location+1)+example[prompt_token_location+1:])

+        position_ids2 = range(len(example)-prompt_token_location-1)
+        if args.reverse_position_ids:
+            position_ids2 = reversed(position_ids2)
+        self.position_ids.append(list(range(prompt_token_location+1)) + list(position_ids2))
+        self.segment_ids.append([self.segment1_id]*(prompt_token_location+1) + [self.segment2_id]*(len(example)-prompt_token_location-1))

+        # print('position_ids = ', self.position_ids[-1])
+        # print('segment_ids = ', self.segment_ids[-1])

     def __len__(self):
         return len(self.examples)

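A toy walk-through of the bookkeeping done in _add_example may help; the token ids below are made up rather than produced by a real tokenizer:

# Made-up ids for "input tokens <paraphrase> output tokens </paraphrase>";
# 50 stands for <paraphrase>, 51 for </paraphrase>.
example = [11, 12, 13, 50, 21, 22, 23, 24, 51]
prompt_token_location = example.index(50)          # 3

# Input tokens (up to and including the prompt token) are masked out of the
# loss with -100; only output tokens contribute.
labels = [-100] * (prompt_token_location + 1) + example[prompt_token_location + 1:]
# -> [-100, -100, -100, -100, 21, 22, 23, 24, 51]

# Position ids restart at 0 for the output segment (and would count down
# instead if --reverse_position_ids were set).
position_ids = list(range(prompt_token_location + 1)) + \
               list(range(len(example) - prompt_token_location - 1))
# -> [0, 1, 2, 3, 0, 1, 2, 3, 4]

# Segment ids mark input vs. output tokens (0/1 here; GPT-2 uses the special
# token ids instead).
segment_ids = [0] * (prompt_token_location + 1) + [1] * (len(example) - prompt_token_location - 1)
# -> [0, 0, 0, 0, 1, 1, 1, 1, 1]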
@@ -119,8 +166,24 @@ class TextDataset(Dataset):
         return torch.tensor(self.examples[item]), torch.tensor(self.labels[item]), torch.tensor(self.position_ids[item]), torch.tensor(self.segment_ids[item])


-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size, evaluate=evaluate)
+def get_transformer_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, dimension):
+    num_warmup_steps = max(1, num_warmup_steps)
+
+    def lr_lambda(current_step):
+        current_step += 1
+        return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step), current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))
+
+    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, aux=False):
+    if evaluate:
+        if aux:
+            file_path = args.aux_eval_data_file
+        else:
+            file_path = args.eval_data_file
+    else:
+        file_path = args.train_data_file
+    dataset = TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size, evaluate=evaluate)
     return dataset


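The new 'transformer' scheduler follows the inverse-square-root (Noam) shape from "Attention Is All You Need": linear warmup, then decay proportional to 1/sqrt(step), both scaled by 1/sqrt(model dimension). A rough usage sketch with a stand-in model and made-up settings:

import math
import torch

model = torch.nn.Linear(10, 10)            # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
dimension, num_warmup_steps = 768, 4000    # assumed values for illustration

def lr_lambda(current_step):
    current_step += 1
    # warmup: step / warmup^1.5; afterwards: 1 / sqrt(step)
    return 1. / math.sqrt(dimension) * min(1 / math.sqrt(current_step),
                                           current_step / (num_warmup_steps * math.sqrt(num_warmup_steps)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
for step in range(10):
    optimizer.step()
    scheduler.step()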
@@ -153,15 +216,18 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
         shutil.rmtree(checkpoint)


-def mask_tokens(inputs, tokenizer, args):
-    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
-    labels = inputs.clone()
+def mask_tokens(inputs, labels, tokenizer, args):
+    """
+    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+    """
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
     special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+    # print('labels.tolist() = ', labels.tolist())
+    # print('special_tokens_mask = ', special_tokens_mask)
     probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -1 # We only compute loss on masked tokens
+    labels[~masked_indices] = -100 # We only compute loss on masked tokens

     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
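For context, the 80% / 10% / 10% recipe that mask_tokens implements (the hunk above only shows its beginning) can be sketched in a self-contained form; the mask token id and vocabulary size below are placeholders, not values from a real tokenizer:

import torch

def mask_tokens_sketch(inputs, mlm_probability=0.15, mask_token_id=103, vocab_size=1000):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # loss is only computed on masked positions

    # 80% of masked positions become the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id

    # 10% become a random token (half of the remaining 20%)
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    inputs[indices_random] = torch.randint(vocab_size, labels.shape, dtype=torch.long)[indices_random]

    # the final 10% keep their original token
    return inputs, labels

inputs, labels = mask_tokens_sketch(torch.randint(5, 1000, (2, 16)))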
@@ -178,7 +244,7 @@ def mask_tokens(inputs, tokenizer, args):
 def pad_collate(batch, pad_token_id):
     (inputs, labels, position_ids, segment_ids) = zip(*batch)
     inputs_pad = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id)
-    labels_pad = pad_sequence(labels, batch_first=True, padding_value=-1)
+    labels_pad = pad_sequence(labels, batch_first=True, padding_value=-100)
     position_ids = pad_sequence(position_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter
     segment_ids = pad_sequence(segment_ids, batch_first=True, padding_value=0) # will be ignored in the loss function, so its value does not matter

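Padding labels with -100 matters because -100 is the ignore_index of the cross-entropy loss in PyTorch and in the transformers 2.5.1 loss heads (the switch from -1 above tracks that change). A minimal illustration:

import torch
from torch.nn.utils.rnn import pad_sequence

labels = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
labels_pad = pad_sequence(labels, batch_first=True, padding_value=-100)

logits = torch.randn(2, 3, 11)  # batch of 2, padded length 3, vocab of 11
# Positions holding -100 contribute nothing to the loss.
loss = torch.nn.functional.cross_entropy(logits.view(-1, 11), labels_pad.view(-1),
                                         ignore_index=-100)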
@@ -227,7 +293,19 @@ def train(args, train_dataset, model, tokenizer):
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+
+    if args.scheduler == 'linear':
+        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    elif args.scheduler == 'transformer':
+        if args.model_type == 'bert':
+            dimension = model.config.hidden_size
+        elif args.model_type == 'gpt2':
+            dimension = model.config.n_embd
+        else:
+            logger.error('Cannot detect hidden size dimensions in this model type. Config: %s', model.config)
+        scheduler = get_transformer_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, dimension=dimension)
+    else:
+        logger.error('Unknown scheduler type.')

     # Check if saved optimizer or scheduler states exist
     if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
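The model_type branch above exists because the two config classes expose the hidden dimension under different attribute names; with default configurations both happen to be 768:

from transformers import BertConfig, GPT2Config

print(BertConfig().hidden_size)  # 768
print(GPT2Config().n_embd)       # 768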
@@ -270,7 +348,7 @@ def train(args, train_dataset, model, tokenizer):
         # set global_step to gobal_step of last saved checkpoint from model path
         global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = (global_step % (len(train_dataloader) // args.gradient_accumulation_steps)) * args.gradient_accumulation_steps

         logger.info(" Continuing training from checkpoint, will skip to saved global_step")
         logger.info(" Continuing training from epoch %d", epochs_trained)
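The extra multiplication accounts for gradient accumulation: global_step counts optimizer updates, while the inner loop iterates over micro-batches, so the number of batches to skip when resuming has to be scaled back up. A worked example with made-up numbers:

global_step = 250                 # parsed from the checkpoint directory name
batches_per_epoch = 400           # len(train_dataloader)
gradient_accumulation_steps = 4

updates_per_epoch = batches_per_epoch // gradient_accumulation_steps             # 100
epochs_trained = global_step // updates_per_epoch                                # 2
steps_to_skip = (global_step % updates_per_epoch) * gradient_accumulation_steps  # 50 * 4 = 200 micro-batches
print(epochs_trained, steps_to_skip)  # 2 200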
@@ -299,13 +377,22 @@ def train(args, train_dataset, model, tokenizer):
                 steps_trained_in_current_epoch -= 1
                 continue

-            inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            if args.mlm:
+                inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
             inputs = inputs.to(args.device)
             labels = labels.to(args.device)
             position_ids = position_ids.to(args.device)
             segment_ids = segment_ids.to(args.device)
             model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            # print('inputs', inputs)
+            # print('labels', labels)
+            # print('position_ids', position_ids.shape)
+            # print('segment_ids', segment_ids.shape)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             loss = outputs[0] # model outputs are always tuple in transformers (see doc)

             if args.n_gpu > 1:
@@ -334,6 +421,10 @@ def train(args, train_dataset, model, tokenizer):
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
                         results = evaluate(args, model, tokenizer)
+                        if args.aux_eval_data_file is not None:
+                            aux_results = evaluate(args, model, tokenizer, aux=True)
+                            for key, value in aux_results.items():
+                                tb_writer.add_scalar('auxiliary_eval_{}'.format(key), value, global_step)
                         if best_eval_perplexity > results['perplexity']:
                             best_eval_perplexity = results['perplexity']
                             if not os.path.exists(args.output_dir):
@@ -356,7 +447,7 @@ def train(args, train_dataset, model, tokenizer):
                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                     logging_loss = tr_loss

-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 and args.save_total_limit > 0:
                     checkpoint_prefix = 'checkpoint'
                     # Save model checkpoint
                     output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
@@ -388,11 +479,11 @@ def train(args, train_dataset, model, tokenizer):
     return global_step, tr_loss / global_step


-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, prefix="", aux=False):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_output_dir = args.output_dir

-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, aux=aux)

     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
@@ -416,14 +507,19 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()

     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        inputs, labels, position_ids, segment_ids = mask_tokens(batch, tokenizer, args) if args.mlm else batch
+        inputs, labels, position_ids, segment_ids = batch
+        if args.mlm:
+            inputs, labels = mask_tokens(inputs, labels, tokenizer, args)
         inputs = inputs.to(args.device)
         labels = labels.to(args.device)
         position_ids = position_ids.to(args.device)
         segment_ids = segment_ids.to(args.device)

         with torch.no_grad():
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            if args.mlm:
+                outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
+            else:
+                outputs = model(inputs, labels=labels, position_ids=position_ids, token_type_ids=segment_ids)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
@@ -444,10 +540,11 @@ def evaluate(args, model, tokenizer, prefix=""):

     return result

-def add_special_tokens(model, tokenizer, additional_special_tokens):
+def add_special_tokens(model, tokenizer, additional_special_tokens, pad_token=None):
     """ Add special tokens to the tokenizer and the model if they have not already been added. """
-    ATTR_TO_SPECIAL_TOKEN = {'pad_token': '<pad>',
-                             'additional_special_tokens': additional_special_tokens}
+    ATTR_TO_SPECIAL_TOKEN = {'additional_special_tokens': additional_special_tokens}
+    if pad_token is not None:
+        ATTR_TO_SPECIAL_TOKEN['pad_token'] = pad_token
     orig_num_tokens = len(tokenizer)
     num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
     if num_added_tokens > 0:
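Roughly, the effect of the updated helper is the standard add-then-resize pattern from the transformers 2.5.1 API; a usage sketch (model name and tokens chosen for illustration, weights are downloaded on first use):

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

num_added = tokenizer.add_special_tokens({
    'pad_token': '<pad>',
    'additional_special_tokens': ['<paraphrase>', '</paraphrase>'],
})
if num_added > 0:
    # give the newly added ids embedding vectors
    model.resize_token_embeddings(len(tokenizer))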
@@ -456,25 +553,33 @@ def add_special_tokens(model, tokenizer, additional_special_tokens):

 def parse_argv(parser):
     ## Required parameters
-    parser.add_argument("--train_data_file", default=None, type=str, required=True,
-                        help="The input training data file (a text file).")
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument("--tensorboard_dir", default=None, type=str, required=True,
+
+    ## Other parameters
+    parser.add_argument("--tensorboard_dir", default=None, type=str,
                         help="The output directory where the tensorboard files will be written.")


+    parser.add_argument("--train_data_file", default=None, type=str,
+                        help="The input training data file.")
+    parser.add_argument("--aux_train_data_file", default=None, type=str,
+                        help="An input training data file for the target domain.")
     parser.add_argument('--start_special_token', type=str, default='<paraphrase>',
                         help='The special token for the start of paraphrases.')
     parser.add_argument('--end_special_token', type=str, default='</paraphrase>',
                         help='The special token for the end of paraphrases.')
+    parser.add_argument('--pad_token', type=str, default='<pad>',
+                        help='The special token for padding..')
     parser.add_argument('--add_inbetween_as_special_tokens', action='store_true',
                         help='The space-separated tokens between --start_special_token and --end_special_token will be added as special tokens. Useful for ThingTalk code.')
     parser.add_argument('--train_all_tokens', action='store_true',
                         help='If True, the model will be trained on input and output sequences, as opposed to only tokens of the output sequence')
-    ## Other parameters
+    parser.add_argument("--reverse_position_ids", action='store_true',
+                        help='If we assume we know the length of the output sequence beforehand, we can do a better job at generation.')

     parser.add_argument("--eval_data_file", default=None, type=str,
                         help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
+    parser.add_argument("--aux_eval_data_file", default=None, type=str,
+                        help="An additional input evaluation data file to evaluate the perplexity on (a text file).")

     parser.add_argument("--model_type", default="bert", type=str,
                         help="The model architecture to be fine-tuned.")
@@ -525,6 +630,8 @@ def parse_argv(parser):
                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
     parser.add_argument("--warmup_steps", default=0, type=int,
                         help="Linear warmup over warmup_steps.")
+    parser.add_argument("--scheduler", default='linear', type=str, choices=['linear', 'transformer'],
+                        help="The type of learning rate scheduler to use.")

     parser.add_argument('--logging_steps', type=int, default=50,
                         help="Log every X updates steps.")
@@ -556,6 +663,15 @@ def main(args):
     if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
         raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                          "flag (masked language modeling).")
+    if args.model_type in ['bert'] and (args.pad_token != '[PAD]' or args.start_special_token != '[SEP]' or args.end_special_token != '[SEP]'):
+        raise ValueError("BERT already has its own special tokens [PAD] and [SEP]. You should use them for better results.")
+    if args.do_train:
+        if args.train_data_file is None:
+            raise ValueError("Cannot do training without a training data file. Either supply a file to --train_data_file "
+                             "or remove the --do_train argument.")
+        if args.tensorboard_dir is None:
+            raise ValueError("Cannot do training without specifying --tensorboard_dir")
+
     if args.eval_data_file is None and args.do_eval:
         raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                          "or remove the --do_eval argument.")
@@ -601,7 +717,7 @@ def main(args):
                                         from_tf=bool('.ckpt' in args.model_name_or_path),
                                         config=config,
                                         cache_dir=args.cache_dir if args.cache_dir else None)
-    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token])
+    add_special_tokens(model, tokenizer, additional_special_tokens=[args.start_special_token, args.end_special_token], pad_token=args.pad_token)
     if args.add_inbetween_as_special_tokens:
         new_tokens = get_inbetween_tokens(args.train_data_file, start_token=args.start_special_token, end_token=args.end_special_token)
         logger.info('Detected %d new tokens', len(new_tokens))