bump transformers version to 3.5.1

mehrad 2020-11-13 15:11:42 -08:00
parent 653424214c
commit 8b644cca02
12 changed files with 281 additions and 1119 deletions

View File

@@ -19,9 +19,9 @@ pyrouge = ">=0.1.3"
sacrebleu = "~=1.0"
tensorboardX = "==2.0.*"
requests = "~=2.22"
transformers = "==2.11"
transformers = "==3.5.1"
radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
sentencepiece = ">=0.1.83,<0.2.0"
sentencepiece = "==0.1.91"
mosestokenizer = '~=1.1'
matplotlib = '~=3.1'
seaborn = '~=0.9'
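As a quick sanity check after `pipenv install`, the resolved environment should report exactly the new pins (a minimal sketch, not part of this commit):

import transformers, sentencepiece
assert transformers.__version__ == '3.5.1', transformers.__version__
assert sentencepiece.__version__ == '0.1.91', sentencepiece.__version__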

View File

@@ -1,6 +1,8 @@
from typing import List
from transformers import GPT2LMHeadModel
import torch
from transformers.modeling_gpt2 import GPT2LMHeadModel
class GPT2Seq2Seq(GPT2LMHeadModel):
def __init__(self, config):
@@ -23,7 +25,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
return copy_input_sequences
#TODO check if this function is used
def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
""" repetition penalty from CTRL (https://arxiv.org/abs/1909.05858), but much faster on GPU
"""
@@ -37,15 +39,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
need_divide = need_change > 0
need_multiply = need_change < 0
lprobs = need_divide * lprobs / repetition_penalty + need_multiply * lprobs * repetition_penalty + (1-m) * lprobs
# old, slow implementation
# if repetition_penalty != 1.0:
# for i in range(context.shape[0]):
# for previous_token in set(generated[i].tolist()):
# if lprobs[i, previous_token] > 0:
# lprobs[i, previous_token] /= repetition_penalty
# else:
# lprobs[i, previous_token] *= repetition_penalty
def generate(self, **kwargs):
# change arguments so that they have the same meaning as seq2seq models
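The vectorized penalty kept above is equivalent to the deleted per-token loop; a self-contained sketch that checks the two against each other (shapes and the construction of the mask m are assumptions, since m is built earlier in the real function):

import torch

def penalize_loop(lprobs, prev_tokens, penalty):
    # the deleted implementation: per-row, per-token Python loop
    out = lprobs.clone()
    for i in range(prev_tokens.shape[0]):
        for t in set(prev_tokens[i].tolist()):
            if out[i, t] > 0:
                out[i, t] /= penalty
            else:
                out[i, t] *= penalty
    return out

def penalize_vectorized(lprobs, prev_tokens, penalty):
    # m marks vocabulary entries that already appeared in the output
    m = torch.zeros_like(lprobs).scatter_(1, prev_tokens, 1.0)
    need_change = m * lprobs
    need_divide = (need_change > 0).float()
    need_multiply = (need_change < 0).float()
    return need_divide * lprobs / penalty + need_multiply * lprobs * penalty + (1 - m) * lprobs

lprobs = torch.randn(2, 10)
prev_tokens = torch.tensor([[1, 3], [2, 7]])
assert torch.allclose(penalize_loop(lprobs, prev_tokens, 1.2),
                      penalize_vectorized(lprobs, prev_tokens, 1.2))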
@@ -68,18 +62,20 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
return outputs
def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
sep_token_position = (input_ids==self.sep_token_id).to(torch.long)
assert (torch.sum(sep_token_position, dim=1)==1).all(), 'All input_ids must contain exactly one sep_token. sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
sep_token_position = (input_ids == self.sep_token_id).to(torch.long)
assert (torch.sum(sep_token_position, dim=1) == 1).all(), 'All input_ids must contain exactly one sep_token.' \
' sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
token_type_ids = torch.cumsum(sep_token_position, dim=1) - sep_token_position
attention_mask = (input_ids!=self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
position_ids = ((torch.cumsum(attention_mask, dim=1)-1)*(1-token_type_ids)+(torch.cumsum(token_type_ids, dim=1)-1)*token_type_ids).clamp(min=0)
token_type_ids = self.sep_token_id * (1-token_type_ids) + self.eos_token_id * token_type_ids
attention_mask = (input_ids != self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
position_ids = ((torch.cumsum(attention_mask, dim=1) - 1) * (1 - token_type_ids) +
(torch.cumsum(token_type_ids, dim=1) - 1) * token_type_ids).clamp(min=0)
token_type_ids = self.sep_token_id * (1 - token_type_ids) + self.eos_token_id * token_type_ids
if past:
input_ids = input_ids[:, -1].unsqueeze(-1)
position_ids = position_ids[:, -1].unsqueeze(-1)
token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past": past}
inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past_key_values": past}
return inputs
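Besides the `past` to `past_key_values` rename required by transformers 3.5, the index bookkeeping here is worth unpacking. A standalone illustration with made-up token ids (pad=0, sep=5): token types are 0 up to and including the sep token and 1 after it, and positions restart from zero for the generated segment, with padding clamped to 0 (the real method then remaps the 0/1 types to sep/eos token ids):

import torch

pad_token_id, sep_token_id = 0, 5
# left-padded context, the sep token, then two generated tokens
input_ids = torch.tensor([[0, 0, 11, 12, 5, 21, 22]])

sep_token_position = (input_ids == sep_token_id).long()
token_type_ids = torch.cumsum(sep_token_position, dim=1) - sep_token_position
attention_mask = (input_ids != pad_token_id).long()
position_ids = ((torch.cumsum(attention_mask, dim=1) - 1) * (1 - token_type_ids) +
                (torch.cumsum(token_type_ids, dim=1) - 1) * token_type_ids).clamp(min=0)

print(token_type_ids)  # tensor([[0, 0, 0, 0, 0, 1, 1]])
print(position_ids)    # tensor([[0, 0, 0, 1, 2, 0, 1]])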

View File

@@ -308,7 +308,7 @@ def create_features_from_tsv_file(file_path, tokenizer, input_column, gold_colum
def is_question(sentence: str):
question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am', \
question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am',
'can', 'could', 'would', 'will', 'have', 'did', 'do', 'does', 'no is', 'yes is']
for w in question_words:
if sentence.startswith(w+' '):

View File

@@ -168,11 +168,11 @@ class TextDataset(Dataset):
def _add_marian_example(self, input_sequence, output_sequence):
model_inputs = self.tokenizer.prepare_translation_batch([input_sequence], [output_sequence])
model_inputs = self.tokenizer.prepare_seq2seq_batch([input_sequence], [output_sequence])
encoded_input_ids = model_inputs['input_ids'].tolist()[0]
encoded_attention_mask = model_inputs['attention_mask'].tolist()[0]
encoded_output_ids = model_inputs['decoder_input_ids'].tolist()[0]
encoded_output_ids = model_inputs['labels'].tolist()[0]
self._update_seq2seq_example(encoded_input_ids, encoded_attention_mask, encoded_output_ids)
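For reference on the API swap above: in transformers 3.5 the Marian tokenizer's prepare_seq2seq_batch returns the encoded target side under 'labels', where the removed prepare_translation_batch returned 'decoder_input_ids'. A hedged sketch (the model name is only an example):

from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
model_inputs = tokenizer.prepare_seq2seq_batch(['find hotels with 2 star ratings .'],
                                               ['Hotels mit 2 Sterne Bewertungen finden.'])
print(list(model_inputs.keys()))  # ['input_ids', 'attention_mask', 'labels']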

View File

@@ -67,12 +67,12 @@ def check_args(args):
if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
logger.warning('Target language should not be provided when using models with single language pairs,'
'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
' otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
args.tgt_lang = None
if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 2)[1] not in MARIAN_GROUP_MEMBERS and args.src_lang:
logger.warning('Source language should not be provided when using models with single language pairs,'
'otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
' otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
args.src_lang = None
if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
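The two rsplit calls above extract the language pair from Marian model names; with an example name:

name = 'Helsinki-NLP/opus-mt-en-de'
name.rsplit('-', 1)[1]  # 'de' -- target-language suffix, checked against MARIAN_GROUP_MEMBERS
name.rsplit('-', 2)[1]  # 'en' -- source-language segment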

View File

@@ -45,11 +45,10 @@ import torch
from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
from .transformers_utils import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer
from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer, BartTokenizer
from transformers import BartForConditionalGeneration
from .transformers_utils import MarianMTModel, T5ForConditionalGeneration, BartForConditionalGeneration as MBartForConditionalGeneration
from .transformers_utils import BartTokenizer, MBartTokenizer
from .transformers_utils import GenieMarianMTModel, GenieT5ForConditionalGeneration, GenieBartForConditionalGeneration, GenieMBartForConditionalGeneration
from .transformers_utils import GenieMBartTokenizer
from transformers import PretrainedConfig
@@ -69,10 +68,10 @@ ALL_MODELS = sum((tuple(map.keys()) for map in (GPT2_PRETRAINED_CONFIG_ARCHIVE_M
MODEL_CLASSES = {
'gpt2': (GPT2Seq2Seq, GPT2Tokenizer, {'bos_token': '<unk>', 'sep_token': '<paraphrase>', 'eos_token': '</paraphrase>'}),
't5': (T5ForConditionalGeneration, T5Tokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'bart': (BartForConditionalGeneration, BartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'mbart': (MBartForConditionalGeneration, MBartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'marian': (MarianMTModel, MarianTokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
't5': (GenieT5ForConditionalGeneration, T5Tokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'bart': (GenieBartForConditionalGeneration, BartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'mbart': (GenieMBartForConditionalGeneration, GenieMBartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
'marian': (GenieMarianMTModel, MarianTokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
}
@@ -126,14 +125,17 @@ def parse_argv(parser):
parser.add_argument('--batch_size', type=int, default=4,
help="Batch size for text generation for each GPU.")
parser.add_argument('--pad_token', type=str, default='<pad>',
help='The special token for padding, if tokenizer does not have that')
parser.add_argument('--cache_dir', default='.embeddings', type=str, help='where to save cached transformers models, configs, and tokenizers.')
parser.add_argument('--trained_model_type', type=str, help='if provided we make sure the loaded model matches the model_type')
parser.add_argument('--src_lang', type=str, default='en', help='source language used for translation task')
parser.add_argument('--src_lang', type=str, help='source language used for translation task')
parser.add_argument('--tgt_lang', type=str, help='target language used for translation task')
parser.add_argument('--return_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
parser.add_argument('--return_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
parser.add_argument('--output_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
parser.add_argument('--output_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
parser.add_argument('--att_pooling', type=str, default='max', help='pooling used to calculate decoder-encoder attention values across different heads')
parser.add_argument('--plot_heatmaps', action='store_true', help='whether to plot decoder-encoder attention heatmaps')
@@ -277,12 +279,12 @@ def run_multi_process_generation(args):
def run_single_process_generation(args, config):
model_class, tokenizer_class, special_tokens = MODEL_CLASSES[args.model_type]
return_attentions = args.return_attentions
return_hidden_states = args.return_hidden_states
output_attentions = args.output_attentions
output_hidden_states = args.output_hidden_states
model = model_class.from_pretrained(args.model_name_or_path,
output_attentions=return_attentions,
output_hidden_states=return_hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
cache_dir=args.cache_dir)
model.to(args.device)
@@ -297,6 +299,11 @@ def run_single_process_generation(args, config):
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
eos_token_id = tokenizer.convert_tokens_to_ids(special_tokens['eos_token'])
sep_token_id = tokenizer.convert_tokens_to_ids(special_tokens['sep_token'])
if tokenizer.pad_token is None:
# this assigns pad token but doesn't add it to the vocabulary
tokenizer.pad_token = args.pad_token
pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
if pad_token_id is None:
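To make the new comment concrete: assigning tokenizer.pad_token only records the string, so if the token is not in the vocabulary it resolves to an existing id instead of allocating a new one. A hedged sketch (gpt2 is only an example):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = '<pad>'  # '<pad>' is not in gpt2's vocabulary...
print(tokenizer.pad_token_id == tokenizer.unk_token_id)  # ...so it falls back to the unk id
# tokenizer.add_special_tokens({'pad_token': '<pad>'}) would instead grow the vocabulary,
# which then requires model.resize_token_embeddings(len(tokenizer)).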
@@ -403,13 +410,12 @@ def run_single_process_generation(args, config):
temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, # if temperature==0, we do not sample
eos_token_id=eos_token_id,
pad_token_id=pad_token_id,
return_attentions=return_attentions,
return_hidden_states=return_hidden_states,
use_cache=True,
output_attentions=output_attentions
)
# TODO fix the way output attention is handled. Some models do not support it.
if return_attentions:
if output_attentions:
decoded, all_encoder_attentions = outputs
else:
decoded = outputs
@@ -434,7 +440,7 @@ def run_single_process_generation(args, config):
min_index = min_index + 1
out_cropped = out[:min_index]
if args.task == 'translate':
if args.task == 'translate' and output_attentions:
src_tokens = tokenizer.convert_ids_to_tokens(batch_context_tensor[sample_index])
tgt_tokens = tokenizer.convert_ids_to_tokens(out_cropped)

View File

@@ -43,10 +43,10 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
BartConfig, BartForConditionalGeneration, BartTokenizer,
MarianConfig, MarianTokenizer)
MBartConfig, MBartForConditionalGeneration,
MarianConfig, MarianMTModel, MarianTokenizer)
from .transformers_utils import BartForConditionalGeneration as MBartForConditionalGeneration
from .transformers_utils import MBartTokenizer, MarianMTModel
from .transformers_utils import GenieMBartTokenizer
from genienlp.util import set_seed, split_file_on_disk
from genienlp.paraphrase.data_utils import mask_tokens, add_special_tokens
@@ -66,7 +66,7 @@ MODEL_CLASSES = {
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
'bart': (BartConfig, BartForConditionalGeneration, BartTokenizer),
'mbart': (BartConfig, MBartForConditionalGeneration, MBartTokenizer),
'mbart': (MBartConfig, MBartForConditionalGeneration, GenieMBartTokenizer),
'marian': (MarianConfig, MarianMTModel, MarianTokenizer)
}
@@ -184,7 +184,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
steps_trained_in_current_epoch -= 1
continue
inputs, attention_mask, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
inputs, attention_mask, labels, position_ids, segment_ids = batch
if args.mlm:
inputs, labels = mask_tokens(inputs, labels, tokenizer, args.mlm_probability, args.mlm_ignore_index)
@@ -195,7 +195,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
segment_ids = segment_ids.to(args.device)
model.train()
model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
# prepare inputs for mbart, and marian
if args.model_type in ['mbart', 'marian']:
@@ -349,7 +349,7 @@ def evaluate(args, model, tokenizer, prefix="", aux=False):
segment_ids = segment_ids.to(args.device)
with torch.no_grad():
model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
if args.model_type in ['mbart', 'marian']:
model_inputs['attention_mask'] = attention_mask
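The new use_cache=False in these training/eval forward passes is deliberate: in transformers 3.5 the models build and return past_key_values by default, which only helps incremental generation and costs memory here. A minimal sketch of the intent (names are placeholders):

model_inputs = {'input_ids': inputs, 'attention_mask': attention_mask, 'use_cache': False}
outputs = model(**model_inputs, labels=labels)  # no past key/value tensors are returned
loss = outputs[0]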

File diff suppressed because it is too large

View File

@@ -56,8 +56,8 @@ setuptools.setup(
'pyrouge>=0.1.3',
'sacrebleu~=1.0',
'requests~=2.22',
'transformers==2.11',
'sentencepiece>=0.1.83,<0.2.0',
'transformers==3.5.1',
'sentencepiece==0.1.91',
'mosestokenizer~=1.1',
]
)

View File

@@ -3,4 +3,4 @@ show me nearby hotels with both a " catalan " and " sauna " zeigen Sie mir in de
find people graduate of Stanford. Leute finden, die Stanford graduieren.
what is the highest rated hotel ? was ist das am höchsten bewertete Hotel ?
find hotels with 2 star ratings . Hotels mit 2 Sterne Bewertungen finden.
what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von „ Rosedon " in „ Glenorchy " ?
what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von " rosedon " " in " glenorchy " " ?


View File

@@ -1,6 +1,6 @@
who has a 8 star rating with over 8 reviews in " fonte " ? wer hat eine 8 Sterne Bewertung mit über 8 Bewertungen in " fonte " ?
show me nearby hotels with both a " catalan " and " sauna " ich sah mich in der Nähe von Hotels mit sowohl " Katalanen " als auch " Sauna " zeigen, " sowohl " Katalanen
find people graduate of Stanford. - es gibt Leute, die an der Stanford University studieren.
what is the highest rated hotel ? Was ist das Hotel mit dem höchsten Preis ?
show me nearby hotels with both a " catalan " and " sauna " ich sah in der Nähe Hotels mit " catalan " und " sauna " .
find people graduate of Stanford. finden Menschen Absolventen von Stanford.
what is the highest rated hotel ? Was ist das höchst bewertete Hotel ?
find hotels with 2 star ratings . finden Sie Hotels mit 2 Sternenbewertungen .
what is the rating of " rosedon " in " glenorchy " ? Was ist die Bewertung von " rosedon " in " glenorchy " ?


View File

@@ -125,7 +125,7 @@ done
# masked paraphrasing tests
cp -r $SRCDIR/dataset/paraphrasing/ $workdir/masked_paraphrasing/
for model in "sshleifer/bart-tiny-random" ; do
for model in "sshleifer/bart-tiny-random" "sshleifer/tiny-mbart" ; do
if [[ $model == *mbart* ]] ; then
model_type="mbart"
@@ -160,7 +160,7 @@ for model in "t5-small" "Helsinki-NLP/opus-mt-en-de" ; do
fi
# use a pre-trained model
pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --return_attentions
pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --force_replace_qp --return_attentions
# check if result file exists and exact match accuracy is 100%
cut -f2 $workdir/translation/en-de/dev_"$base_model"_aligned.tsv | diff -u - $workdir/generated_"$base_model"_aligned.tsv