diff --git a/Pipfile b/Pipfile
index 013badbe..9722feec 100644
--- a/Pipfile
+++ b/Pipfile
@@ -19,9 +19,9 @@ pyrouge = ">=0.1.3"
 sacrebleu = "~=1.0"
 tensorboardX = "==2.0.*"
 requests = "~=2.22"
-transformers = "==2.11"
+transformers = "==3.5.1"
 radam = {git = "https://github.com/LiyuanLucasLiu/RAdam"}
-sentencepiece = ">=0.1.83,<0.2.0"
+sentencepiece = "==0.1.91"
 mosestokenizer = '~=1.1'
 matplotlib = '~=3.1'
 seaborn = '~=0.9'
diff --git a/genienlp/paraphrase/GPT2Seq2Seq.py b/genienlp/paraphrase/GPT2Seq2Seq.py
index 8a496036..71ab4b01 100644
--- a/genienlp/paraphrase/GPT2Seq2Seq.py
+++ b/genienlp/paraphrase/GPT2Seq2Seq.py
@@ -1,6 +1,8 @@
 from typing import List
-from transformers import GPT2LMHeadModel
+
 import torch
+from transformers.modeling_gpt2 import GPT2LMHeadModel
+
 
 class GPT2Seq2Seq(GPT2LMHeadModel):
     def __init__(self, config):
@@ -23,7 +25,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
 
         return copy_input_sequences
 
-
+    #TODO check if this function is used
     def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
         """ repetition penalty from CTRL (https://arxiv.org/abs/1909.05858), but much faster on GPU """
@@ -37,15 +39,7 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
         need_divide = need_change > 0
         need_multiply = need_change < 0
         lprobs = need_divide * lprobs / repetition_penalty + need_multiply * lprobs * repetition_penalty + (1-m) * lprobs
-
-        # old, slow implementation
-        # if repetition_penalty != 1.0:
-        #     for i in range(context.shape[0]):
-        #         for previous_token in set(generated[i].tolist()):
-        #             if lprobs[i, previous_token] > 0:
-        #                 lprobs[i, previous_token] /= repetition_penalty
-        #             else:
-        #                 lprobs[i, previous_token] *= repetition_penalty
+
 
     def generate(self, **kwargs):
         # change arguments so that they have the same meaning as seq2seq models
@@ -68,18 +62,20 @@ class GPT2Seq2Seq(GPT2LMHeadModel):
 
         return outputs
 
-    def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
-        sep_token_position = (input_ids==self.sep_token_id).to(torch.long)
-        assert (torch.sum(sep_token_position, dim=1)==1).all(), 'All input_ids must contain exactly one sep_token. sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
+    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
+        sep_token_position = (input_ids == self.sep_token_id).to(torch.long)
+        assert (torch.sum(sep_token_position, dim=1) == 1).all(), 'All input_ids must contain exactly one sep_token.' \
+            ' sep_token_position = %s\nsep_token_id = %d' % (str(sep_token_position), self.sep_token_id)
         token_type_ids = torch.cumsum(sep_token_position, dim=1) - sep_token_position
-        attention_mask = (input_ids!=self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
-        position_ids = ((torch.cumsum(attention_mask, dim=1)-1)*(1-token_type_ids)+(torch.cumsum(token_type_ids, dim=1)-1)*token_type_ids).clamp(min=0)
-        token_type_ids = self.sep_token_id * (1-token_type_ids) + self.eos_token_id * token_type_ids
+        attention_mask = (input_ids != self.pad_token_id).to(torch.long) # 0 means mask, 1 means no mask
+        position_ids = ((torch.cumsum(attention_mask, dim=1) - 1) * (1 - token_type_ids) +
+                        (torch.cumsum(token_type_ids, dim=1) - 1) * token_type_ids).clamp(min=0)
+        token_type_ids = self.sep_token_id * (1 - token_type_ids) + self.eos_token_id * token_type_ids
         if past:
             input_ids = input_ids[:, -1].unsqueeze(-1)
             position_ids = position_ids[:, -1].unsqueeze(-1)
             token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
-        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past": past}
+        inputs = {"input_ids": input_ids, "position_ids": position_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask, "past_key_values": past}
         return inputs
\ No newline at end of file
diff --git a/genienlp/paraphrase/data_utils.py b/genienlp/paraphrase/data_utils.py
index c32366f2..4478a534 100644
--- a/genienlp/paraphrase/data_utils.py
+++ b/genienlp/paraphrase/data_utils.py
@@ -308,7 +308,7 @@ def create_features_from_tsv_file(file_path, tokenizer, input_column, gold_colum
 
 
 def is_question(sentence: str):
-    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am', \
+    question_words = ['which', 'what', 'where', 'how', 'who', 'when', 'is', 'are', 'am',
                       'can', 'could', 'would', 'will', 'have', 'did', 'do', 'does', 'no is', 'yes is']
     for w in question_words:
         if sentence.startswith(w+' '):
diff --git a/genienlp/paraphrase/dataset.py b/genienlp/paraphrase/dataset.py
index 93197c78..8927ae5e 100644
--- a/genienlp/paraphrase/dataset.py
+++ b/genienlp/paraphrase/dataset.py
@@ -168,11 +168,11 @@ class TextDataset(Dataset):
 
 
     def _add_marian_example(self, input_sequence, output_sequence):
-        model_inputs = self.tokenizer.prepare_translation_batch([input_sequence], [output_sequence])
+        model_inputs = self.tokenizer.prepare_seq2seq_batch([input_sequence], [output_sequence])
 
         encoded_input_ids = model_inputs['input_ids'].tolist()[0]
         encoded_attention_mask = model_inputs['attention_mask'].tolist()[0]
-        encoded_output_ids = model_inputs['decoder_input_ids'].tolist()[0]
+        encoded_output_ids = model_inputs['labels'].tolist()[0]
 
         self._update_seq2seq_example(encoded_input_ids, encoded_attention_mask, encoded_output_ids)
 
diff --git a/genienlp/paraphrase/model_utils.py b/genienlp/paraphrase/model_utils.py
index ce863af1..5af51efa 100644
--- a/genienlp/paraphrase/model_utils.py
+++ b/genienlp/paraphrase/model_utils.py
@@ -67,12 +67,12 @@ def check_args(args):
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 1)[1] not in MARIAN_GROUP_MEMBERS and args.tgt_lang:
         logger.warning('Target language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the target language you provided...')
         args.tgt_lang = None
 
     if args.model_type == 'marian' and args.model_name_or_path.rsplit('-', 2)[1] not in MARIAN_GROUP_MEMBERS and args.src_lang:
         logger.warning('Source language should not be provided when using models with single language pairs,'
-                       'otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
+                       ' otherwise the translation outputs will be incorrect; thus we ignore the source language you provided...')
         args.src_lang = None
 
     if args.model_type == 'mbart' and not (args.tgt_lang and args.src_lang):
diff --git a/genienlp/paraphrase/run_generation.py b/genienlp/paraphrase/run_generation.py
index cc1c87b2..a650b195 100644
--- a/genienlp/paraphrase/run_generation.py
+++ b/genienlp/paraphrase/run_generation.py
@@ -45,11 +45,10 @@ import torch
 
 from transformers import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .transformers_utils import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP
-from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer
+from transformers import GPT2Tokenizer, T5Tokenizer, MarianTokenizer, BartTokenizer
 
-from transformers import BartForConditionalGeneration
-from .transformers_utils import MarianMTModel, T5ForConditionalGeneration, BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import BartTokenizer, MBartTokenizer
+from .transformers_utils import GenieMarianMTModel, GenieT5ForConditionalGeneration, GenieBartForConditionalGeneration, GenieMBartForConditionalGeneration
+from .transformers_utils import GenieMBartTokenizer
 
 from transformers import PretrainedConfig
 
@@ -69,10 +68,10 @@ ALL_MODELS = sum((tuple(map.keys()) for map in (GPT2_PRETRAINED_CONFIG_ARCHIVE_M
 
 MODEL_CLASSES = {
     'gpt2': (GPT2Seq2Seq, GPT2Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    't5': (T5ForConditionalGeneration, T5Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'bart': (BartForConditionalGeneration, BartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'mbart': (MBartForConditionalGeneration, MBartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
-    'marian': (MarianMTModel, MarianTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    't5': (GenieT5ForConditionalGeneration, T5Tokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'bart': (GenieBartForConditionalGeneration, BartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'mbart': (GenieMBartForConditionalGeneration, GenieMBartTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
+    'marian': (GenieMarianMTModel, MarianTokenizer, {'bos_token': '', 'sep_token': '', 'eos_token': ''}),
 }
 
@@ -126,14 +125,17 @@ def parse_argv(parser):
 
     parser.add_argument('--batch_size', type=int, default=4,
                         help="Batch size for text generation for each GPU.")
 
+    parser.add_argument('--pad_token', type=str, default='',
+                        help='The special token for padding, if tokenizer does not have that')
+
     parser.add_argument('--cache_dir', default='.embeddings', type=str, help='where to save transforemrs cached models, configs, and tokenizers.')
 
     parser.add_argument('--trained_model_type', type=str, help='if provided we make sure the loaded model matches the model_type')
 
-    parser.add_argument('--src_lang', type=str, default='en', help='source language used for translation task')
+    parser.add_argument('--src_lang', type=str, help='source language used for translation task')
     parser.add_argument('--tgt_lang', type=str, help='target language used for translation task')
 
-    parser.add_argument('--return_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
-    parser.add_argument('--return_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
+    parser.add_argument('--output_attentions', action='store_true', help='return self and cross attention weights for seq2seq models')
+    parser.add_argument('--output_hidden_states', action='store_true', help='return all hidden states for seq2seq models')
 
     parser.add_argument('--att_pooling', type=str, default='max', help='pooling used to calculate decoder-encoder attention values across different heads')
     parser.add_argument('--plot_heatmaps', action='store_true', help='whether to plot decoder-encoder attention heatmaps')
@@ -277,12 +279,12 @@ def run_multi_process_generation(args):
 def run_single_process_generation(args, config):
     model_class, tokenizer_class, special_tokens = MODEL_CLASSES[args.model_type]
 
-    return_attentions = args.return_attentions
-    return_hidden_states = args.return_hidden_states
+    output_attentions = args.output_attentions
+    output_hidden_states = args.output_hidden_states
 
     model = model_class.from_pretrained(args.model_name_or_path,
-                                        output_attentions=return_attentions,
-                                        output_hidden_states=return_hidden_states,
+                                        output_attentions=output_attentions,
+                                        output_hidden_states=output_hidden_states,
                                         cache_dir=args.cache_dir)
 
     model.to(args.device)
@@ -297,6 +299,11 @@ def run_single_process_generation(args, config):
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
     eos_token_id = tokenizer.convert_tokens_to_ids(special_tokens['eos_token'])
     sep_token_id = tokenizer.convert_tokens_to_ids(special_tokens['sep_token'])
+
+    if tokenizer.pad_token is None:
+        # this assigns pad token but doesn't add it to the vocabulary
+        tokenizer.pad_token = args.pad_token
+
     pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
     if pad_token_id is None:
@@ -403,13 +410,12 @@ def run_single_process_generation(args, config):
                                      temperature=args.temperature[hyperparameter_idx] if args.temperature[hyperparameter_idx] > 0 else 1.0, # if temperature==0, we do not sample
                                      eos_token_id=eos_token_id,
                                      pad_token_id=pad_token_id,
-                                     return_attentions=return_attentions,
-                                     return_hidden_states=return_hidden_states,
                                      use_cache=True,
+                                     output_attentions=output_attentions
                                      )
 
            # TODO fix the way output attention is handled. Some models do not support it.
-            if return_attentions:
+            if output_attentions:
                 decoded, all_encoder_attentions = outputs
             else:
                 decoded = outputs
@@ -434,7 +440,7 @@
                     min_index = min_index + 1
                 out_cropped = out[:min_index]
 
-                if args.task == 'translate':
+                if args.task == 'translate' and output_attentions:
                     src_tokens = tokenizer.convert_ids_to_tokens(batch_context_tensor[sample_index])
                     tgt_tokens = tokenizer.convert_ids_to_tokens(out_cropped)
 
diff --git a/genienlp/paraphrase/run_lm_finetuning.py b/genienlp/paraphrase/run_lm_finetuning.py
index fd8613e4..7f03fb93 100644
--- a/genienlp/paraphrase/run_lm_finetuning.py
+++ b/genienlp/paraphrase/run_lm_finetuning.py
@@ -43,10 +43,10 @@ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
                           CamembertConfig, CamembertForMaskedLM, CamembertTokenizer,
                           BartConfig, BartForConditionalGeneration, BartTokenizer,
-                          MarianConfig, MarianTokenizer)
+                          MBartConfig, MBartForConditionalGeneration,
+                          MarianConfig, MarianMTModel, MarianTokenizer)
 
-from .transformers_utils import BartForConditionalGeneration as MBartForConditionalGeneration
-from .transformers_utils import MBartTokenizer, MarianMTModel
+from .transformers_utils import GenieMBartTokenizer
 
 from genienlp.util import set_seed, split_file_on_disk
 from genienlp.paraphrase.data_utils import mask_tokens, add_special_tokens
@@ -66,7 +66,7 @@ MODEL_CLASSES = {
     'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
     'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
     'bart': (BartConfig, BartForConditionalGeneration, BartTokenizer),
-    'mbart': (BartConfig, MBartForConditionalGeneration, MBartTokenizer),
+    'mbart': (MBartConfig, MBartForConditionalGeneration, GenieMBartTokenizer),
     'marian': (MarianConfig, MarianMTModel, MarianTokenizer)
 }
 
@@ -184,7 +184,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
                     steps_trained_in_current_epoch -= 1
                     continue
 
-            inputs, attention_mask, labels, position_ids, segment_ids = batch # batch is a tuple (input, labels, position_ids, segment_ids)
+            inputs, attention_mask, labels, position_ids, segment_ids = batch
 
             if args.mlm:
                 inputs, labels = mask_tokens(inputs, labels, tokenizer, args.mlm_probability, args.mlm_ignore_index)
@@ -195,7 +195,7 @@ def train(args, train_dataset, model, tokenizer, input_file_name=None, multiple_
             segment_ids = segment_ids.to(args.device)
             model.train()
 
-            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+            model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
             # prepare inputs for mbart, and marian
             if args.model_type in ['mbart', 'marian']:
@@ -349,7 +349,7 @@ def evaluate(args, model, tokenizer, prefix="", aux=False):
             segment_ids = segment_ids.to(args.device)
 
             with torch.no_grad():
-                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids}
+                model_inputs = {'input_ids': inputs, 'position_ids': position_ids, 'token_type_ids': segment_ids, 'use_cache': False}
 
                 if args.model_type in ['mbart', 'marian']:
                     model_inputs['attention_mask'] = attention_mask
diff --git a/genienlp/paraphrase/transformers_utils.py b/genienlp/paraphrase/transformers_utils.py
index 7378c917..09d6e9e5 100644
--- a/genienlp/paraphrase/transformers_utils.py
+++ b/genienlp/paraphrase/transformers_utils.py
@@ -1,10 +1,16 @@
-import copy
 import re
+import torch
+import torch.nn.functional as F +from typing import List, Optional -from transformers.modeling_bart import LayerNorm, LearnedPositionalEmbedding, BartEncoder, SelfAttention, invert_mask, \ - SinusoidalPositionalEmbedding, BartModel, BartForConditionalGeneration +from transformers import LogitsProcessorList +from transformers.modeling_marian import MarianMTModel +from transformers.modeling_bart import BartForConditionalGeneration +from transformers.modeling_mbart import MBartForConditionalGeneration +from transformers.modeling_t5 import T5ForConditionalGeneration +from transformers.modeling_utils import PreTrainedModel -from transformers.modeling_t5 import T5ForConditionalGeneration, T5PreTrainedModel, T5LayerNorm, T5Block +from transformers.tokenization_mbart import MBartTokenizer, _all_mbart_models, SPM_URL SPIECE_UNDERLINE = "▁" @@ -63,606 +69,10 @@ MARIAN_GROUP_MEMBERS = { "yue", "yue_Hans", "yue_Hant", "zho", "zho_Hans", "zho_Hant", "zlm_Latn", "zsm_Latn", "zul", "zza"] } -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BART model, ported from the fairseq repo.""" -import math -import random -from typing import List, Optional -import torch -import torch.nn as nn -import torch.nn.functional as F +############### -from transformers.activations import ACT2FN -from transformers.configuration_bart import BartConfig -from transformers.modeling_utils import calc_banned_ngram_tokens, calc_banned_bad_words_ids, top_k_top_p_filtering - - -class DecoderLayer(nn.Module): - def __init__(self, config: BartConfig): - super().__init__() - self.embed_dim = config.d_model - self.output_attentions = config.output_attentions - self.self_attn = SelfAttention( - embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.normalize_before = config.normalize_before - - self.self_attn_layer_norm = LayerNorm(self.embed_dim) - self.encoder_attn = SelfAttention( - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout, - encoder_decoder_attention=True, - ) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = LayerNorm(self.embed_dim) - - def forward( - self, - x, - encoder_hidden_states, - encoder_attn_mask=None, - layer_state=None, - causal_mask=None, - decoder_padding_mask=None, - ): - residual = x - - if layer_state is None: - layer_state = {} - if self.normalize_before: - x = self.self_attn_layer_norm(x) - - # Self Attention - x, self_attn_weights = self.self_attn( - query=x, - key=x, - layer_state=layer_state, # adds keys to layer state - key_padding_mask=decoder_padding_mask, - attn_mask=causal_mask, - 
need_weights=self.output_attentions, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.self_attn_layer_norm(x) - - # Cross attention - residual = x - assert self.encoder_attn.cache_key != self.self_attn.cache_key - if self.normalize_before: - x = self.encoder_attn_layer_norm(x) - x, cross_attn_weights = self.encoder_attn( - query=x, - key=encoder_hidden_states, - key_padding_mask=encoder_attn_mask, - layer_state=layer_state, # mutates layer state - need_weights=self.output_attentions, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.encoder_attn_layer_norm(x) - - # Fully Connected - residual = x - if self.normalize_before: - x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - if not self.normalize_before: - x = self.final_layer_norm(x) - return ( - x, - self_attn_weights, - cross_attn_weights, - layer_state, - ) # both self_attn and cross-attn weights, following t5, layer_state = cache for decoding - # attention weight has size (bsz, num_heads, tgt_len, src_len) - - -class BartDecoder(nn.Module): - """ - Transformer decoder consisting of *config.decoder_layers* layers. Each layer - is a :class:`DecoderLayer`. - Args: - config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding - """ - - def __init__(self, config: BartConfig, embed_tokens: nn.Embedding): - super().__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.dropout = config.dropout - self.layerdrop = config.decoder_layerdrop - self.padding_idx = embed_tokens.padding_idx - self.max_target_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - self.embed_tokens = embed_tokens - if config.static_position_embeddings: - self.embed_positions = SinusoidalPositionalEmbedding( - config.max_position_embeddings, config.d_model, config.pad_token_id - ) - else: - self.embed_positions = LearnedPositionalEmbedding( - config.max_position_embeddings, config.d_model, self.padding_idx, - ) - self.layers = nn.ModuleList( - [DecoderLayer(config) for _ in range(config.decoder_layers)] - ) # type: List[DecoderLayer] - self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() - self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None - - def forward( - self, - input_ids, - encoder_hidden_states, - encoder_padding_mask, - decoder_padding_mask, - decoder_causal_mask, - decoder_cached_states=None, - use_cache=False, - **unused - ): - """ - Includes several features from "Jointly Learning to Align and - Translate with Transformer Models" (Garg et al., EMNLP 2019). 
- - Args: - input_ids (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_hidden_states: output from the encoder, used for - encoder-side attention - encoder_padding_mask: for ignoring pad tokens - decoder_cached_states (dict or None): dictionary used for storing state during generation - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - hidden states - - attentions - """ - # check attention mask and invert - if encoder_padding_mask is not None: - encoder_padding_mask = invert_mask(encoder_padding_mask) - - # embed positions - positions = self.embed_positions(input_ids, use_cache=use_cache) - - if use_cache: - input_ids = input_ids[:, -1:] - positions = positions[:, -1:] # happens after we embed them - # assert input_ids.ne(self.padding_idx).any() - - x = self.embed_tokens(input_ids) * self.embed_scale - x += positions - x = self.layernorm_embedding(x) - x = F.dropout(x, p=self.dropout, training=self.training) - - # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) - x = x.transpose(0, 1) - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) - - # decoder layers - all_hidden_states = () - all_self_attns = () - all_cross_attns = () - next_decoder_cache = [] - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if self.output_hidden_states: - all_hidden_states += (x,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - layer_state = decoder_cached_states[idx] if decoder_cached_states is not None else None - - x, layer_self_attn, layer_cross_attention, layer_past = decoder_layer( - x, - encoder_hidden_states, - encoder_attn_mask=encoder_padding_mask, - decoder_padding_mask=decoder_padding_mask, - layer_state=layer_state, - causal_mask=decoder_causal_mask, - ) - - if use_cache: - next_decoder_cache.append(layer_past.copy()) - - if self.layer_norm and (idx == len(self.layers) - 1): # last layer of mbart - x = self.layer_norm(x) - if self.output_attentions: - all_self_attns += (layer_self_attn,) - all_cross_attns += (layer_cross_attention,) - - # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) - all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states] - x = x.transpose(0, 1) - encoder_hidden_states = encoder_hidden_states.transpose(0, 1) - - if use_cache: - next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache) - else: - next_cache = None - return x, next_cache, all_hidden_states, list(all_self_attns), list(all_cross_attns) - - -class BartModel(BartModel): - def __init__(self, config: BartConfig): - super().__init__(config) - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - - padding_idx, vocab_size = config.pad_token_id, config.vocab_size - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) - - self.encoder = BartEncoder(config, self.shared) - self.decoder = BartDecoder(config, self.shared) - - self.init_weights() - - -class BartForConditionalGeneration(BartForConditionalGeneration): - base_model_prefix = "model" - - def __init__(self, config: BartConfig): - super().__init__(config) - base_model = BartModel(config) - self.model = base_model - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) - - def 
prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): - assert past is not None, "past has to be defined for encoder_outputs" - - # first step, decoder_cached_states are empty - # first step - if kwargs['cur_len'] == 1: - encoder_outputs, decoder_cached_states = past[0], None - else: - if use_cache: - if len(past) < 2: - encoder_outputs, decoder_cached_states = past[0], None - else: - encoder_outputs, decoder_cached_states = past[0], past[1] - else: - encoder_outputs, decoder_cached_states = past[0], None - - if not isinstance(encoder_outputs, tuple): - encoder_outputs = (encoder_outputs, ) - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "decoder_cached_states": decoder_cached_states, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - encoder_outputs, - attention_mask, - use_cache, - model_specific_kwargs, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - # length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) - sent_lengths = input_ids.new(batch_size).fill_(max_length) - - if getattr(self.config, 'encoder_layers', None): - num_layers = self.config.encoder_layers - else: - num_layers = self.config.num_layers - - if getattr(self.config, 'encoder_attention_heads', None): - num_heads = self.config.encoder_attention_heads - else: - num_heads = self.config.num_heads - - all_encoder_attentions = [input_ids.new_full([batch_size, num_heads, max_length, - encoder_outputs[0].size(1)], dtype=torch.float32, - fill_value=-1000000) for _ in range(num_layers)] - - # encoder outputs for Bart and models inheriting from BartModel encoder is (encoder hidden outputs of last layer, all_hidden_states, all_attention_weights ) - # it always outputs all_hidden_states and all_attention_weights and then filters empty ones out when passed through BartModel - - # on the other hand, T5 encoder outputs (last-layer hidden state, presents, all_hidden_states, all_attention_weights) only if returning them is requested - # otherwise it just returns last-layer hidden states - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - if not isinstance(past, tuple): - past = (past,) - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, cur_len=cur_len, **model_specific_kwargs - ) - - outputs = self(**model_inputs) - # decoder_outputs = x, next_cache, all_hidden_states, all_self_attns, all_cross_attns - # encoder_outputs = encoder_hidden_last_layer + all_hidden_states + all_self_attns - # outputs = decoder_outputs + encoder_outputs - - # outputs is then filtered if attention weights, hidden states, or cached_decoding_values are empty - # so the index below is adjusted - # remember we always return attention weights - - next_token_logits = outputs[0][:, -1, :] - - index = 2 + 
int(model_specific_kwargs['return_hidden_states']) + int(use_cache) - for i in range(num_layers): - all_encoder_attentions[i][:, :, [cur_len - 1], :] = outputs[index][i] - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) - - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - # #TODO added and modified by mehrad from transformers modeling_utils.py - # if self.config.is_encoder_decoder: - # if cur_len == 1: - # self._force_token_ids_generation(next_token_logits, model_specific_kwargs['tgt_lang_id']) - # if cur_len == max_length - 1 and self.config.eos_token_id is not None: - # self._force_token_ids_generation(next_token_logits, self.config.eos_token_id) - - # set bos token prob to zero - # if bos_token_id is not None: - # next_token_logits[:, bos_token_id] = -float("inf") - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - next_token_logits[:, eos_token_id] = -float("inf") - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - # Sample - probs = F.softmax(next_token_logits, dim=-1) - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - # add token and increase length by one - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - cur_len = cur_len + 1 - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents.mul_((~eos_in_sents).long()) - - # stop when there is a in each sentence, or if we exceed the maximum length - if unfinished_sents.max() == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask 
= torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # if there are different sentences lengths in the batch, some batches have to be padded - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" - # finished sents are filled with pad_token - decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) - else: - decoded = input_ids - - for hypo_idx, hypo in enumerate(input_ids): - decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] - - # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len) - all_encoder_attentions = [layer_all_encoder_attentions[:, :, :sent_lengths.max().item(), :] for layer_all_encoder_attentions in all_encoder_attentions] - - return decoded, all_encoder_attentions - - - - -# coding=utf-8 -# Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch MarianMTModel model, ported from the Marian C++ repo.""" - - -# from transformers.modeling_bart import BartForConditionalGeneration - - -MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP -] - - -class MarianMTModel(BartForConditionalGeneration): - r""" - Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. - Model API is identical to BartForConditionalGeneration. - Available models are listed at `Model List `__ - - Examples:: - - from transformers import MarianTokenizer, MarianMTModel - from typing import List - src = 'fr' # source language - trg = 'en' # target language - sample_text = "où est l'arrêt de bus ?" - mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' - - model = MarianMTModel.from_pretrained(mname) - tok = MarianTokenizer.from_pretrained(mname) - batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference - gen = model.generate(**batch) # for forward pass: model(**batch) - words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" - - """ - - def prepare_logits_for_generation(self, logits, cur_len, max_length): - logits[:, self.config.pad_token_id] = float("-inf") - if cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_ids_generation(logits, self.config.eos_token_id) - return logits - - -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers.tokenization_roberta import RobertaTokenizer -from transformers.tokenization_utils import BatchEncoding -from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer - - -# vocab and merges same as roberta -vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" -merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" -_all_bart_models = [ - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", -] - - -class BartTokenizer(RobertaTokenizer): - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - } - -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" - -FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", -] - - -class MBartTokenizer(XLMRobertaTokenizer): +class GenieMBartTokenizer(MBartTokenizer): vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} max_model_input_sizes = {m: 1024 for m in _all_mbart_models} pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} @@ -670,236 +80,9 @@ class MBartTokenizer(XLMRobertaTokenizer): prefix_tokens: List[int] = [] suffix_tokens: List[int] = [] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) - self.sp_model_size = len(self.sp_model) - self.lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES) - } - self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()} - self.cur_lang_code = self.lang_code_to_id["en_XX"] - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - - self.fairseq_tokens_to_ids.update(self.lang_code_to_id) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) - self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) - - @property - def vocab_size(self): - return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence 
if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - # The __call__ was implemented only in transformers >=3.0.0 - def __call__( - self, - text, - text_pair=None, - add_special_tokens: bool = True, - padding=False, - truncation=False, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors=None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - **kwargs - ) -> BatchEncoding: - """ - Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of - sequences. - - Args: - text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): - The sequence or batch of sequences to be encoded. - Each sequence can be a string or a list of strings (pretokenized string). - If the sequences are provided as list of strings (pretokenized), you must set - :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): - The sequence or batch of sequences to be encoded. - Each sequence can be a string or a list of strings (pretokenized string). - If the sequences are provided as list of strings (pretokenized), you must set - :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - """ - # Input type checking for clearer error - assert isinstance(text, str) or ( - isinstance(text, (list, tuple)) - and ( - len(text) == 0 - or ( - isinstance(text[0], str) - or (isinstance(text[0], (list, tuple)) and ( - len(text[0]) == 0 or isinstance(text[0][0], str))) - ) - ) - ), ( - "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " - "or `List[List[str]]` (batch of pretokenized examples)." - ) - - assert ( - text_pair is None - or isinstance(text_pair, str) - or ( - isinstance(text_pair, (list, tuple)) - and ( - len(text_pair) == 0 - or ( - isinstance(text_pair[0], str) - or ( - isinstance(text_pair[0], (list, tuple)) - and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) - ) - ) - ) - ) - ), ( - "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " - "or `List[List[str]]` (batch of pretokenized examples)." 
- ) - - is_batched = bool( - (not is_split_into_words and isinstance(text, (list, tuple))) - or ( - is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], - (list, tuple)) - ) - ) - - if is_batched: - batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text - return self.batch_encode_plus( - batch_text_or_text_pairs=batch_text_or_text_pairs, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - else: - return self.encode_plus( - text=text, - text_pair=text_pair, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - **kwargs, - ) - - def prepare_seq2seq_batch( - self, - src_texts: List[str], - src_lang: str = "en_XX", - tgt_texts: Optional[List[str]] = None, - tgt_lang: str = "ro_RO", - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - truncation: bool = True, - padding: str = "longest", - return_tensors: str = "pt", - add_prefix_space: bool = False, # ignored - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.max_len - self.set_src_lang_special_tokens(src_lang) - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - self.set_tgt_lang_special_tokens(tgt_lang) - - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=True, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - self.set_src_lang_special_tokens(src_lang) # sets to src_lang - return model_inputs - - ##TODO comment out to use original mbart tokenization used by huggingface - # def set_src_lang_special_tokens(self, src_lang) -> None: - # """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code].""" - # self.cur_lang_code = self.lang_code_to_id[src_lang] - # self.prefix_tokens = [] - # self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - # - # def set_tgt_lang_special_tokens(self, lang: str) -> None: - # """Reset the special tokens to the target language setting. 
Prefix [tgt_lang_code], suffix =[eos].""" - # self.cur_lang_code = self.lang_code_to_id[lang] - # self.prefix_tokens = [] - # self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - def set_src_lang_special_tokens(self, src_lang) -> None: """Reset the special tokens to the source lang setting. Prefix [bos_token_id], suffix =[eos_token_id].""" self.cur_lang_code = self.lang_code_to_id[src_lang] @@ -911,248 +94,225 @@ class MBartTokenizer(XLMRobertaTokenizer): self.cur_lang_code = self.lang_code_to_id[lang] self.prefix_tokens = [self.cur_lang_code] self.suffix_tokens = [self.eos_token_id] + +############### -class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None): - super().__init__(config) - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - self.init_weights() - - def get_input_embeddings(self): - return self.embed_tokens - - def get_output_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - inputs_embeds=None, - head_mask=None, - past_key_value_states=None, - use_cache=False, +class GeniePreTrainedModel(PreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + **model_kwargs ): - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - if self.is_decoder: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + max_length = max_length if max_length is not None else self.config.max_length + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + # init sequence length tensors + sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( + input_ids, max_length + ) + + output_attentions = model_kwargs.get('output_attentions', None) + + if output_attentions: + batch_size = input_ids.size(0) + if getattr(self.config, 'encoder_layers', None): + num_layers = self.config.encoder_layers else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings" - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - - if 
past_key_value_states is not None: - assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_sates".format( - input_shape, (batch_size, 1) + num_layers = self.config.num_layers + + if getattr(self.config, 'encoder_attention_heads', None): + num_heads = self.config.encoder_attention_heads + else: + num_heads = self.config.num_heads + + if model_kwargs.get('encoder_outputs', None): + seq_length = model_kwargs['encoder_outputs'][0].size(1) + else: + seq_length = max_length + + all_cross_attentions = [input_ids.new_full([batch_size, num_heads, max_length, seq_length], + dtype=torch.float32, + fill_value=-1000000) + for _ in range(num_layers)] + + while cur_len < max_length: + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self(**model_inputs, return_dict=True) + next_token_logits = outputs.logits[:, -1, :] + + if output_attentions: + for i in range(num_layers): + all_cross_attentions[i][:, :, [cur_len - 1], :] = outputs.cross_attentions[i] + + # pre-process distribution + scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = torch.argmax(scores, dim=-1) + + # add code that transfomers next_tokens to tokens_to_add + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + + # add token and increase length by one + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + + # update sequence length + if eos_token_id is not None: + sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation( + sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id + ) + + # update model kwargs + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - # required mask seq length can be calculated via length of past - # key value states and seq_length = 1 for the last token - mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length + + # stop when there is a in each sentence, or if we exceed the maximum length + if unfinished_sequences.max() == 0: + break + + # increase cur_len + cur_len = cur_len + 1 + + if output_attentions: + # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len) + all_cross_attentions = [layer_all_cross_attentions[:, :, :sequence_lengths.max().item(), :] for + layer_all_cross_attentions in all_cross_attentions] + + return input_ids, all_cross_attentions else: - mask_seq_length = seq_length + return input_ids + + + def sample( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + **model_kwargs + ): + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + max_length = max_length if max_length is not None else self.config.max_length + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - if attention_mask 
is None: - attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + # init sequence length tensors + sequence_lengths, unfinished_sequences, cur_len = self._init_sequence_length_for_generation( + input_ids, max_length + ) + + output_attentions = model_kwargs.get('output_attentions', None) + + if output_attentions: + batch_size = input_ids.size(0) + if getattr(self.config, 'encoder_layers', None): + num_layers = self.config.encoder_layers + else: + num_layers = self.config.num_layers + + if getattr(self.config, 'encoder_attention_heads', None): + num_heads = self.config.encoder_attention_heads + else: + num_heads = self.config.num_heads + + if model_kwargs.get('encoder_outputs', None): + seq_length = model_kwargs['encoder_outputs'][0].size(1) + else: + seq_length = max_length + + all_cross_attentions = [input_ids.new_full([batch_size, num_heads, max_length, seq_length], + dtype=torch.float32, + fill_value=-1000000) + for _ in range(num_layers)] + + # auto-regressive generation + while cur_len < max_length: + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self(**model_inputs, return_dict=True) + next_token_logits = outputs.logits[:, -1, :] + + if output_attentions: + for i in range(num_layers): + all_cross_attentions[i][:, :, [cur_len - 1], :] = outputs.cross_attentions[i] + + # pre-process distribution + scores = logits_processor(input_ids, next_token_logits) + scores = logits_warper(input_ids, scores) + + # sample + probs = F.softmax(scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # add code that transfomers next_tokens to tokens_to_add + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + (pad_token_id) * (1 - unfinished_sequences) + + # add token and increase length by one + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + cur_len = cur_len + 1 + + # update sequence length + if eos_token_id is not None: + sequence_lengths, unfinished_sequences = self._update_seq_length_for_generation( + sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id + ) + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sequences.max() == 0: + break + + # update model kwargs + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder ) - - # initialize past_key_value_states with `None` if past does not exist - if past_key_value_states is None: - past_key_value_states = [None] * len(self.block) - - # ourselves in which case we just need to make it broadcastable to all heads. 
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)
-
-        if self.is_decoder and encoder_attention_mask is not None:
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+
+        if output_attentions:
+            # List of each encoder layer cross-attention values each with size (bsz, num_heads, tgt_len, src_len)
+            all_cross_attentions = [layer_all_cross_attentions[:, :, :sequence_lengths.max().item(), :] for
+                                    layer_all_cross_attentions in all_cross_attentions]
+
+            return input_ids, all_cross_attentions
         else:
-            encoder_extended_attention_mask = None
-
-        # Prepare head mask if needed
-        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
-        present_key_value_states = ()
-        all_hidden_states = ()
-        all_attentions = ()
-        cross_attentions = ()
-        position_bias = None
-        encoder_decoder_position_bias = None
-
-        hidden_states = self.dropout(inputs_embeds)
-
-        for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(
-                hidden_states,
-                attention_mask=extended_attention_mask,
-                position_bias=position_bias,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_extended_attention_mask,
-                encoder_decoder_position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask[i],
-                past_key_value_state=past_key_value_state,
-                use_cache=use_cache,
-            )
-            # layer_outputs is a tuple with:
-            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-            hidden_states, present_key_value_state = layer_outputs[:2]
-
-            if i == 0:
-                # We share the position biases between the layers - the first layer store them
-                # layer_outputs = hidden-states, key-value-states (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-                position_bias = layer_outputs[3 if self.output_attentions else 2]
-                if self.is_decoder and encoder_hidden_states is not None:
-                    encoder_decoder_position_bias = layer_outputs[5 if self.output_attentions else 3]
-            # append next layer key value states
-            present_key_value_states = present_key_value_states + (present_key_value_state,)
-
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[2],)  # add self-attention
-                if self.is_decoder and encoder_hidden_states is not None:
-                    if i==0:
-                        cross_attentions = cross_attentions + (layer_outputs[4],)  # add cross-attention
-                    else:
-                        cross_attentions = cross_attentions + (layer_outputs[3],)  # add cross-attention
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if use_cache is True:
-            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
-            outputs = outputs + (present_key_value_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,) + (cross_attentions,)
-        return outputs  # last-layer hidden state, (presents,) (all hidden states), (all self_attentions), (all cross_attentions)
+            return input_ids
-class T5ForConditionalGeneration(T5ForConditionalGeneration):
+class GenieMarianMTModel(MarianMTModel, GeniePreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.model_dim = config.d_model
-
-        self.shared = nn.Embedding(config.vocab_size, config.d_model)
-
-        encoder_config = copy.deepcopy(config)
-        self.encoder = T5Stack(encoder_config, self.shared)
-
-        decoder_config = copy.deepcopy(config)
-        decoder_config.is_decoder = True
-        self.decoder = T5Stack(decoder_config, self.shared)
-
-        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
-
-        self.init_weights()
+class GenieBartForConditionalGeneration(BartForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
-    def _generate_no_beam_search(
-        self,
-        input_ids,
-        cur_len,
-        max_length,
-        min_length,
-        do_sample,
-        temperature,
-        top_k,
-        top_p,
-        repetition_penalty,
-        no_repeat_ngram_size,
-        bad_words_ids,
-        bos_token_id,
-        pad_token_id,
-        eos_token_id,
-        decoder_start_token_id,
-        batch_size,
-        encoder_outputs,
-        attention_mask,
-        use_cache,
-        model_specific_kwargs,
-    ):
-        return BartForConditionalGeneration._generate_no_beam_search(
-            self,
-            input_ids,
-            cur_len,
-            max_length,
-            min_length,
-            do_sample,
-            temperature,
-            top_k,
-            top_p,
-            repetition_penalty,
-            no_repeat_ngram_size,
-            bad_words_ids,
-            bos_token_id,
-            pad_token_id,
-            eos_token_id,
-            decoder_start_token_id,
-            batch_size,
-            encoder_outputs,
-            attention_mask,
-            use_cache,
-            model_specific_kwargs,)
-
-    def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs):
-        assert past is not None, "past has to be defined for encoder_outputs"
-
-        # first step
-        if kwargs['cur_len'] == 1:
-            encoder_outputs, decoder_past_key_value_states = past[0], None
-        else:
-            if use_cache:
-                if len(past) < 2:
-                    encoder_outputs, decoder_past_key_value_states = past[0], None
-                else:
-                    encoder_outputs, decoder_past_key_value_states = past[0], past[1]
-            else:
-                encoder_outputs, decoder_past_key_value_states = past[0], None
-
-        if not isinstance(encoder_outputs, tuple):
-            encoder_outputs = (encoder_outputs, )
-
-        return {
-            "decoder_input_ids": input_ids,
-            "decoder_past_key_value_states": decoder_past_key_value_states,
-            "encoder_outputs": encoder_outputs,
-            "attention_mask": attention_mask,
-            "use_cache": use_cache,
-        }
+class GenieMBartForConditionalGeneration(MBartForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+class GenieT5ForConditionalGeneration(T5ForConditionalGeneration, GeniePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
diff --git a/setup.py b/setup.py
index c3ec9941..343b1c7e 100644
--- a/setup.py
+++ b/setup.py
@@ -56,8 +56,8 @@ setuptools.setup(
         'pyrouge>=0.1.3',
         'sacrebleu~=1.0',
         'requests~=2.22',
-        'transformers==2.11',
-        'sentencepiece>=0.1.83,<0.2.0',
+        'transformers==3.5.1',
+        'sentencepiece==0.1.91',
         'mosestokenizer~=1.1',
     ]
 )
diff --git a/tests/dataset/translation/en-de/dev_marian_aligned.tsv b/tests/dataset/translation/en-de/dev_marian_aligned.tsv
index 5af1f0ef..d5fe1b50 100644
--- a/tests/dataset/translation/en-de/dev_marian_aligned.tsv
+++ b/tests/dataset/translation/en-de/dev_marian_aligned.tsv
@@ -3,4 +3,4 @@ show me nearby hotels with both a " catalan " and " sauna " zeigen Sie mir in de
 find people graduate of Stanford. Leute finden, die Stanford graduieren.
 what is the highest rated hotel ? was ist das am höchsten bewertete Hotel ?
 find hotels with 2 star ratings . Hotels mit 2 Sterne Bewertungen finden.
-what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von „ Rosedon " in „ Glenorchy " ?
+what is the rating of " rosedon " in " glenorchy " ? Wie hoch ist die Bewertung von " rosedon " " in " glenorchy " " ?
diff --git a/tests/dataset/translation/en-de/dev_t5_aligned.tsv b/tests/dataset/translation/en-de/dev_t5_aligned.tsv
index 2a3cf83c..5336fc8d 100644
--- a/tests/dataset/translation/en-de/dev_t5_aligned.tsv
+++ b/tests/dataset/translation/en-de/dev_t5_aligned.tsv
@@ -1,6 +1,6 @@
 who has a 8 star rating with over 8 reviews in " fonte " ? wer hat eine 8 Sterne Bewertung mit über 8 Bewertungen in " fonte " ?
-show me nearby hotels with both a " catalan " and " sauna " ich sah mich in der Nähe von Hotels mit sowohl " Katalanen " als auch " Sauna " zeigen, " sowohl " Katalanen
-find people graduate of Stanford. - es gibt Leute, die an der Stanford University studieren.
-what is the highest rated hotel ? Was ist das Hotel mit dem höchsten Preis ?
+show me nearby hotels with both a " catalan " and " sauna " ich sah in der Nähe Hotels mit " catalan " und " sauna " .
+find people graduate of Stanford. finden Menschen Absolventen von Stanford.
+what is the highest rated hotel ? Was ist das höchst bewertete Hotel ?
 find hotels with 2 star ratings . finden Sie Hotels mit 2 Sternenbewertungen .
 what is the rating of " rosedon " in " glenorchy " ? Was ist die Bewertung von " rosedon " in " glenorchy " ?
diff --git a/tests/test.sh b/tests/test.sh
index 41994b86..20b2fd3d 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -125,7 +125,7 @@ done
 # masked paraphrasing tests
 cp -r $SRCDIR/dataset/paraphrasing/ $workdir/masked_paraphrasing/
-for model in "sshleifer/bart-tiny-random" ; do
+for model in "sshleifer/bart-tiny-random" "sshleifer/tiny-mbart" ; do
     if [[ $model == *mbart* ]] ; then
         model_type="mbart"
@@ -160,7 +160,7 @@ for model in "t5-small" "Helsinki-NLP/opus-mt-en-de" ; do
     fi
     # use a pre-trained model
-    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --return_attentions
+    pipenv run python3 -m genienlp run-paraphrase --model_name_or_path $model --length 15 --temperature 0 --repetition_penalty 1.0 --num_samples 1 --batch_size 3 --input_file $workdir/translation/en-de/dev_"$base_model"_aligned.tsv --input_column 0 --gold_column 1 --output_file $workdir/generated_"$base_model"_aligned.tsv --skip_heuristics --att_pooling mean --task translate --tgt_lang de --replace_qp --force_replace_qp --return_attentions
     # check if result file exists and exact match accuracy is 100%
     cut -f2 $workdir/translation/en-de/dev_"$base_model"_aligned.tsv | diff -u - $workdir/generated_"$base_model"_aligned.tsv
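
Note (illustrative sketch, not part of the patch): the --replace_qp / --att_pooling options exercised in the test above consume the per-layer cross-attention tensors collected by the new sample() loop, each shaped (batch_size, num_heads, tgt_len, src_len) as in the code above. The sketch below shows one plausible way such tensors could be pooled to pick, for every generated token, the source token it attends to most. The function name align_tokens_by_attention, the mean/max pooling choices, and the random example inputs are hypothetical and are not the genienlp implementation.

    # minimal sketch, assuming `cross_attentions` is a list (one entry per layer)
    # of tensors shaped (batch_size, num_heads, tgt_len, src_len)
    import torch

    def align_tokens_by_attention(cross_attentions, pooling='mean'):
        # stack layers: (num_layers, batch_size, num_heads, tgt_len, src_len)
        stacked = torch.stack(cross_attentions, dim=0)
        if pooling == 'mean':
            # average over layers, then over heads -> (batch_size, tgt_len, src_len)
            pooled = stacked.mean(dim=0).mean(dim=1)
        else:
            # 'max' pooling over layers and heads
            pooled = stacked.max(dim=0).values.max(dim=1).values
        # for each target position, the index of the most-attended source token
        return pooled.argmax(dim=-1)  # (batch_size, tgt_len)

    # hypothetical usage: batch of 2, 3 layers, 8 heads, 5 target and 7 source tokens
    fake_attentions = [torch.rand(2, 8, 5, 7) for _ in range(3)]
    print(align_tokens_by_attention(fake_attentions, pooling='mean'))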