diff --git a/genienlp/data_utils/numericalizer.py b/genienlp/data_utils/numericalizer.py
index f1af1273..d9aea351 100644
--- a/genienlp/data_utils/numericalizer.py
+++ b/genienlp/data_utils/numericalizer.py
@@ -326,7 +326,7 @@ class TransformerNumericalizer(object):
         sentence = self._apply_special_token_preprocessing(sentence)
 
         encoded = self._tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=None,
-                                              return_length=True, padding=False)
+                                              return_length=True, padding=False, return_attention_mask=False)
         numerical = encoded.data['input_ids']
         length = encoded.data['length']
 
diff --git a/genienlp/models/transformer_seq2seq.py b/genienlp/models/transformer_seq2seq.py
index 03e231a4..c8775cac 100644
--- a/genienlp/models/transformer_seq2seq.py
+++ b/genienlp/models/transformer_seq2seq.py
@@ -71,7 +71,10 @@ class TransformerSeq2Seq(GenieModel):
             # with a lowercase letter, so we leave it
             answer = answer[:, 1:].contiguous()
 
-            return self.model(batch.context.value, labels=answer)
+            # setting pad output tokens to -100 means they will be ignored in calculating loss
+            answer[answer == self.numericalizer.pad_id] = -100
+
+            return self.model(batch.context.value, labels=answer, attention_mask=(batch.context.value != self.numericalizer.pad_id))
         else:
             return self.model(**kwargs)
 
@@ -91,10 +94,10 @@ class TransformerSeq2Seq(GenieModel):
     ):
         input_ids = batch.context.value
 
-        # TODO attention_mask
+        # when attention_mask is not provided to generate(), it will default to masking pad tokens, which is the correct thing
         generated = self.model.generate(input_ids=input_ids,
                                         max_length=max_output_length,
-                                        min_length=2, # generate at least one token after BOS
+                                        min_length=2,  # generate at least one token after BOS
                                         bos_token_id=self.numericalizer._tokenizer.bos_token_id,
                                         pad_token_id=self.numericalizer._tokenizer.pad_token_id,
                                         early_stopping=True,
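
For reference, a minimal sketch (not part of the patch) of the two Hugging Face conventions this change relies on: label positions set to -100 are ignored by the cross-entropy loss, and attention_mask should be 0 exactly at the pad positions of the encoder input. The model name facebook/bart-base and the example sentences are arbitrary stand-ins for the Genie model and data, not taken from the repository.

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
    model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

    # pad the batch so both conventions actually come into play
    inputs = tokenizer(['show me nearby restaurants', 'play jazz'],
                       padding=True, return_tensors='pt')
    labels = tokenizer(['restaurants nearby', 'jazz playlist please'],
                       padding=True, return_tensors='pt').input_ids

    # label positions equal to -100 are skipped when the loss is computed
    labels[labels == tokenizer.pad_token_id] = -100

    outputs = model(input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,  # 0 at pad positions
                    labels=labels)
    print(outputs.loss)

    # generate() builds the same pad-based attention mask itself when none is
    # passed, provided pad_token_id is set, which is why the TODO could be dropped
    generated = model.generate(input_ids=inputs.input_ids, max_length=20)
    print(tokenizer.batch_decode(generated, skip_special_tokens=True))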