bump transformers version to 3.5.1
parent 653424214c
commit 8b644cca02
Pipfile
@@ -19,9 +19,9 @@ pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.11"
+transformers = "==3.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
-sentencepiece = ">=0.1.83,<0.2.0"
+sentencepiece = "==0.1.91"
 mosestokenizer = '~=1.1'
 matplotlib = '~=3.1'
 seaborn = '~=0.9'
@@ -1,6 +1,8 @@
 from typing import List
-from transformers import GPT2LMHeadModel
+
+import torch
+from transformers.modeling_gpt2 import GPT2LMHeadModel
 
 
 class GPT2Seq2Seq(GPT2LMHeadModel):
     def __init__(self, config):
@@ -23,7 +25,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
 
         return copy_input_sequences
 
-
+    #TODO check if this function is used
     def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
         """ repetition penalty from CTRL (https://arxiv.org/abs/1909.05858), but much faster on GPU
         """
@@ -37,15 +39,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
         need_divide = need_change > 0
         need_multiply = need_change < 0
         lprobs = need_divide * lprobs / repetition_penalty + need_multiply * lprobs * repetition_penalty + (1-m) * lprobs
 
-        # old, slow implementation
-        # if repetition_penalty != 1.0:
-            # for i in range(context.shape[0]):
-                # for previous_token in set(generated[i].tolist()):
-                    # if lprobs[i, previous_token] > 0:
-                        # lprobs[i, previous_token] /= repetition_penalty
-                    # else:
-                        # lprobs[i, previous_token] *= repetition_penalty
 
     def generate(self, **kwargs):
         # change arguments so that they have the same meaning as seq2seq models
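Side note on the surviving vectorized penalty: it reproduces the CTRL rule (divide positive scores by the penalty, multiply negative ones) for every token already present in the output, without the Python loops deleted above. A self-contained sketch of the same idea, with a hypothetical mask construction and toy values (not the exact code of this class):

    import torch

    def apply_repetition_penalty(lprobs, prev_output_tokens, repetition_penalty):
        # m is a 0/1 mask over vocabulary entries that already occur in the output
        m = torch.zeros_like(lprobs).scatter_(1, prev_output_tokens, 1.0)
        need_change = m * lprobs
        need_divide = (need_change > 0).to(lprobs.dtype)
        need_multiply = (need_change < 0).to(lprobs.dtype)
        # positive scores shrink, negative scores become more negative, the rest pass through
        return need_divide * lprobs / repetition_penalty + need_multiply * lprobs * repetition_penalty + (1 - m) * lprobs

    scores = torch.tensor([[1.0, -2.0, 3.0, -4.0]])     # batch of 1, vocabulary of 4
    prev = torch.tensor([[2]])                          # token 2 was generated before
    print(apply_repetition_penalty(scores, prev, 2.0))  # tensor([[ 1.0, -2.0, 1.5, -4.0]])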
@@ -68,18 +62,20 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
         return outputs
 
 
-    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
-        sep_token_position = (input_ids==self.sep_token_id).to(torch.long)
-        assert (torch.sum(sep_token_position, dim=1)==1).all(), 'All input_ids must contain exactly one sep_token. sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
+    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
+        sep_token_position = (input_ids == self.sep_token_id).to(torch.long)
+        assert (torch.sum(sep_token_position, dim=1) == 1).all(), 'All input_ids must contain exactly one sep_token.' \
+               ' sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
         token_type_ids = torch.cumsum(sep_token_position, dim=1) - sep_token_position
-        attention_mask = (input_ids!=self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
-        position_ids = ((torch.cumsum(attention_mask, dim=1)-1)*(1-token_type_ids)+(torch.cumsum(token_type_ids, dim=1)-1)*token_type_ids).clamp(min=0)
-        token_type_ids = self.sep_token_id * (1-token_type_ids) + self.eos_token_id * token_type_ids
+        attention_mask = (input_ids != self.pad_token_id).to(torch.long)  # 0 means mask, 1 means no mask
+        position_ids = ((torch.cumsum(attention_mask, dim=1) - 1) * (1 - token_type_ids) +
+                        (torch.cumsum(token_type_ids, dim=1) - 1) * token_type_ids).clamp(min=0)
+        token_type_ids = self.sep_token_id * (1 - token_type_ids) + self.eos_token_id * token_type_ids
 
         if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
             position_ids = position_ids[:, -1].unsqueeze(-1)
             token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
 
-        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past": past}
+        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past_key_values": past}
         return inputs
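Context for the two renames in this hunk (a hedged, standalone sketch, not this repository's code): transformers 3.5 passes the generation cache into prepare_inputs_for_generation under the keyword past, may call the method before any cache exists, and unpacks the returned dict straight into forward(), where GPT-2 now names that argument past_key_values. A toy round trip showing the renamed argument:

    import torch
    from transformers import GPT2Config, GPT2LMHeadModel

    # tiny randomly-initialized model, purely for illustration
    model = GPT2LMHeadModel(GPT2Config(n_layer=2, n_head=2, n_embd=64, vocab_size=100))
    ids = torch.randint(0, 100, (1, 5))
    out = model(input_ids=ids, use_cache=True, return_dict=True)
    # feed only the newest token back, together with the cache, under its 3.5 name
    step = model(input_ids=ids[:, -1:], past_key_values=out.past_key_values, return_dict=True)
    print(step.logits.shape)  # torch.Size([1, 1, 100])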
@@ -308,7 +308,7 @@ def create_features_from_tsv_file(file_path, tokenizer, input_column, gold_colum
 
 
 def is_question(sentence: str):
-    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am', \
+    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am',
                       'can', 'could', 'would', 'will', 'have', 'did', 'do', 'does', 'no is', 'yes is']
     for w in question_words:
         if sentence.startswith(w+' '):
@@ -168,11 +168,11 @@ class TextDataset(Dataset):
 
     def _add_marian_example(self, input_sequence, output_sequence):
 
-        model_inputs = self.tokenizer.prepare_translation_batch([input_sequence], [output_sequence])
+        model_inputs = self.tokenizer.prepare_seq2seq_batch([input_sequence], [output_sequence])
 
         encoded_input_ids = model_inputs['input_ids'].tolist()[0]
         encoded_attention_mask = model_inputs['attention_mask'].tolist()[0]
-        encoded_output_ids = model_inputs['decoder_input_ids'].tolist()[0]
+        encoded_output_ids = model_inputs['labels'].tolist()[0]
 
         self._update_seq2seq_example(encoded_input_ids, encoded_attention_mask, encoded_output_ids)
 
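For reference, the replacement tokenizer API can be exercised on its own roughly as below (a hedged sketch using an illustrative Helsinki-NLP checkpoint, not this class); in transformers 3.5 the target side comes back under labels instead of decoder_input_ids, which is why the dict key above changes as well:

    from transformers import MarianTokenizer

    tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')  # illustrative model
    batch = tokenizer.prepare_seq2seq_batch(['how are you ?'], ['wie geht es dir ?'], return_tensors='pt')
    print(sorted(batch.keys()))         # ['attention_mask', 'input_ids', 'labels']
    print(batch['labels'].tolist()[0])  # token ids of the German target sentence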
@@ -67,12 +67,12 @@ def check_args(args):
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
         logger.warning('Target language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
         args.tgt_lang = None
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 2)[1] not in MARIAN_GROUP_MEMBERS and args.src_lang:
         logger.warning('Source language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
         args.src_lang = None
 
     if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
@@ -45,11 +45,10 @@ import torch
 from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .transformers_utils import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer
+from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer, BartTokenizer
 
-from transformers import BartForConditionalGeneration
-from .transformers_utils import MarianMTModel, T5ForConditionalGeneration, BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import BartTokenizer, MBartTokenizer
+from .transformers_utils import GenieMarianMTModel, GenieT5ForConditionalGeneration, GenieBartForConditionalGeneration, GenieMBartForConditionalGeneration
+from .transformers_utils import GenieMBartTokenizer
 
 
 from transformers import PretrainedConfig
@@ -69,10 +68,10 @@ ALL_MODELS = sum((tuple(map.keys()) for map in (GPT2_PRETRAINED_CONFIG_ARCHIVE_M
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Seq2Seq, GPT2Tokenizer, {'bos_token': '<unk>', 'sep_token': '<paraphrase>', 'eos_token': '</paraphrase>'}),
-    't5': (T5ForConditionalGeneration, T5Tokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
-    'bart': (BartForConditionalGeneration, BartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
-    'mbart': (MBartForConditionalGeneration, MBartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
-    'marian': (MarianMTModel, MarianTokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
+    't5': (GenieT5ForConditionalGeneration, T5Tokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
+    'bart': (GenieBartForConditionalGeneration, BartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
+    'mbart': (GenieMBartForConditionalGeneration, GenieMBartTokenizer, {'bos_token': '<s>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
+    'marian': (GenieMarianMTModel, MarianTokenizer, {'bos_token': '<unk>', 'sep_token': '<unk>', 'eos_token': '</s>'}),
 }
 
 
@@ -126,14 +125,17 @@ def parse_argv(parser):
     parser.add_argument('--batch_size', type=int, default=4,
                         help="Batch size for text generation for each GPU.")
 
     parser.add_argument('--pad_token', type=str, default='<pad>',
                         help='The special token for padding, if tokenizer does not have that')
 
     parser.add_argument('--cache_dir', default='.embeddings', type=str, help='where to save transforemrs cached models, configs, and tokenizers.')
 
     parser.add_argument('--trained_model_type', type=str, help='if provided we make sure the loaded model matches the model_type')
 
-    parser.add_argument('--src_lang', type=str, default='en', help='source language used for translation task')
+    parser.add_argument('--src_lang', type=str, help='source language used for translation task')
     parser.add_argument('--tgt_lang', type=str, help='target language used for translation task')
-    parser.add_argument('--return_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
-    parser.add_argument('--return_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
+    parser.add_argument('--output_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
+    parser.add_argument('--output_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
+
+    parser.add_argument('--att_pooling', type=str, default='max', help='pooling used to calculate decoder-encoder attention values across different heads')
+    parser.add_argument('--plot_heatmaps', action='store_true', help='whether to plot decoder-encoder attention heatmaps')
@@ -277,12 +279,12 @@ def run_multi_process_generation(args):
 def run_single_process_generation(args, config):
     model_class, tokenizer_class, special_tokens = MODEL_CLASSES[args.model_type]
 
-    return_attentions = args.return_attentions
-    return_hidden_states = args.return_hidden_states
+    output_attentions = args.output_attentions
+    output_hidden_states = args.output_hidden_states
 
     model = model_class.from_pretrained(args.model_name_or_path,
-                                        output_attentions=return_attentions,
-                                        output_hidden_states=return_hidden_states,
+                                        output_attentions=output_attentions,
+                                        output_hidden_states=output_hidden_states,
                                         cache_dir=args.cache_dir)
     model.to(args.device)
 
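Background on the two flags forwarded here (a hedged sketch with a toy config rather than this script's checkpoint): from_pretrained folds extra keyword arguments into the model config, so output_attentions and output_hidden_states make every forward pass return per-layer attention maps and hidden states:

    import torch
    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config(n_layer=2, n_head=2, n_embd=64, vocab_size=100,
                        output_attentions=True, output_hidden_states=True)
    model = GPT2LMHeadModel(config)
    out = model(input_ids=torch.randint(0, 100, (1, 5)), return_dict=True)
    print(len(out.attentions), len(out.hidden_states))  # 2 layers -> 2 attention maps, 3 hidden states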
@@ -297,6 +299,11 @@ def run_single_process_generation(args, config):
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
     eos_token_id = tokenizer.convert_tokens_to_ids(special_tokens['eos_token'])
     sep_token_id = tokenizer.convert_tokens_to_ids(special_tokens['sep_token'])
 
+    if tokenizer.pad_token is None:
+        # this assigns pad token but doesn't add it to the vocabulary
+        tokenizer.pad_token = args.pad_token
+
+    pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
     if pad_token_id is None:
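The comment in the added block points at a real distinction in the tokenizer API; a hedged standalone illustration (gpt2 and the token strings are placeholders, not this script's configuration):

    from transformers import GPT2Tokenizer

    tok = GPT2Tokenizer.from_pretrained('gpt2')
    # plain assignment only reuses an existing mapping; nothing is appended to the vocabulary
    tok.pad_token = '<|endoftext|>'
    print(tok.pad_token_id)         # same id as the eos token
    # add_special_tokens() grows the vocabulary instead
    added = tok.add_special_tokens({'pad_token': '<pad>'})
    print(added, tok.pad_token_id)  # 1 new token, id == len(tok) - 1
    # a model using the enlarged vocabulary would then need model.resize_token_embeddings(len(tok))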
@@ -403,13 +410,12 @@ def run_single_process_generation(args, config):
                 temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, # if temperature==0, we do not sample
                 eos_token_id=eos_token_id,
                 pad_token_id=pad_token_id,
-                return_attentions=return_attentions,
-                return_hidden_states=return_hidden_states,
                 use_cache=True,
+                output_attentions=output_attentions
                 )
 
             # TODO fix the way output attention is handled. Some models do not support it.
-            if return_attentions:
+            if output_attentions:
                 decoded, all_encoder_attentions = outputs
             else:
                 decoded = outputs
@@ -434,7 +440,7 @@ def run_single_process_generation(args, config):
                 min_index = min_index + 1
             out_cropped = out[:min_index]
 
-            if args.task == 'translate':
+            if args.task == 'translate' and output_attentions:
                 src_tokens = tokenizer.convert_ids_to_tokens(batch_context_tensor[sample_index])
                 tgt_tokens = tokenizer.convert_ids_to_tokens(out_cropped)
 
@@ -43,10 +43,10 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
                           BartConfig, BartForConditionalGeneration, BartTokenizer,
-                          MarianConfig, MarianTokenizer)
+                          MBartConfig, MBartForConditionalGeneration,
+                          MarianConfig, MarianMTModel, MarianTokenizer)
 
-from .transformers_utils import BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import MBartTokenizer, MarianMTModel
+from .transformers_utils import GenieMBartTokenizer
 
 from genienlp.util import set_seed, split_file_on_disk
 from genienlp.paraphrase.data_utils import mask_tokens, add_special_tokens
@@ -66,7 +66,7 @@ MODEL_CLASSES = {
     'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
     'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
     'bart': (BartConfig, BartForConditionalGeneration, BartTokenizer),
-    'mbart': (BartConfig, MBartForConditionalGeneration, MBartTokenizer),
+    'mbart': (MBartConfig, MBartForConditionalGeneration, GenieMBartTokenizer),
     'marian': (MarianConfig, MarianMTModel, MarianTokenizer)
 }
 
@@ -184,7 +184,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
                 steps_trained_in_current_epoch -= 1
                 continue
 
-            inputs, attention_mask, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, attention_mask, labels, position_ids, segment_ids = batch
 
             if args.mlm:
                 inputs, labels = mask_tokens(inputs, labels, tokenizer, args.mlm_probability, args.mlm_ignore_index)
@@ -195,7 +195,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
             segment_ids = segment_ids.to(args.device)
             model.train()
 
-            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
             # prepare inputs for mbart, and marian
             if args.model_type in ['mbart', 'marian']:
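Background for the new use_cache entry (a hedged sketch with a toy config, not this training loop): since transformers 3.x the forward pass builds and returns decoder key/value caches by default, which only helps generation and wastes memory during training, so the flag is switched off here and in evaluate() below:

    import torch
    from transformers import GPT2Config, GPT2LMHeadModel

    model = GPT2LMHeadModel(GPT2Config(n_layer=2, n_head=2, n_embd=64, vocab_size=100))
    x = torch.randint(0, 100, (2, 8))
    out = model(input_ids=x, labels=x, use_cache=False, return_dict=True)
    print(out.loss.item())        # language-modeling loss as usual
    print(out.past_key_values)    # None: no key/value cache was materialized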
@@ -349,7 +349,7 @@ def evaluate(args, model, tokenizer, prefix="", aux=False):
             segment_ids = segment_ids.to(args.device)
 
             with torch.no_grad():
-                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
                 if args.model_type in ['mbart', 'marian']:
                     model_inputs['attention_mask'] = attention_mask
File diff suppressed because it is too large
setup.py
@@ -56,8 +56,8 @@ setuptools.setup(
         'pyrouge>=0.1.3',
         'sacrebleu~=1.0',
         'requests~=2.22',
-        'transformers==2.11',
-        'sentencepiece>=0.1.83,<0.2.0',
+        'transformers==3.5.1',
+        'sentencepiece==0.1.91',
         'mosestokenizer~=1.1',
     ]
 )
@@ -3,4 +3,4 @@ show me nearby hotels with both a " catalan " and " sauna " zeigen Sie mir in de
 find people graduate of Stanford. Leute finden, die Stanford graduieren.
 what is the highest rated hotel ? was ist das am höchsten bewertete Hotel ?
 find hotels with 2 star ratings . Hotels mit 2 Sterne Bewertungen finden.
-what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von „ Rosedon " in „ Glenorchy " ?
+what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von " rosedon " " in " glenorchy " " ?
@@ -1,6 +1,6 @@
 who has a 8 star rating with over 8 reviews in " fonte " ? wer hat eine 8 Sterne Bewertung mit über 8 Bewertungen in " fonte " ?
-show me nearby hotels with both a " catalan " and " sauna " ich sah mich in der Nähe von Hotels mit sowohl " Katalanen " als auch " Sauna " zeigen, " sowohl " Katalanen
-find people graduate of Stanford. - es gibt Leute, die an der Stanford University studieren.
-what is the highest rated hotel ? Was ist das Hotel mit dem höchsten Preis ?
+show me nearby hotels with both a " catalan " and " sauna " ich sah in der Nähe Hotels mit " catalan " und " sauna " .
+find people graduate of Stanford. finden Menschen Absolventen von Stanford.
+what is the highest rated hotel ? Was ist das höchst bewertete Hotel ?
 find hotels with 2 star ratings . finden Sie Hotels mit 2 Sternenbewertungen .
 what is the rating of " rosedon " in " glenorchy " ? Was ist die Bewertung von " rosedon " in " glenorchy " ?
@@ -125,7 +125,7 @@ done
 # masked paraphrasing tests
 cp -r $SRCDIR/dataset/paraphrasing/ $workdir/masked_paraphrasing/
 
-for model in "sshleifer/bart-tiny-random" ; do
+for model in "sshleifer/bart-tiny-random" "sshleifer/tiny-mbart" ; do
 
     if [[ $model == *mbart* ]] ; then
         model_type="mbart"
@@ -160,7 +160,7 @@ for model in "t5-small" "Helsinki-NLP/opus-mt-en-de" ; do
     fi
 
     # use a pre-trained model
-    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --return_attentions
+    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --force_replace_qp --return_attentions
 
     # check if result file exists and exact match accuracy is 100%
     cut -f2 $workdir/translation/en-de/dev_"$base_model"_aligned.tsv | diff -u - $workdir/generated_"$base_model"_aligned.tsv