#!/usr/bin/env bash

. ./tests/lib.sh
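# lib.sh is expected to provide the $SRCDIR, $workdir, $EMBEDDING_DIR and
# $SHARED_TRAIN_HPARAMS variables used below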

i=0

# test almond_natural_seq2seq and almond_paraphrase tasks
for model in \
    "sshleifer/bart-tiny-random"; do

    # train
    genienlp train \
        $SHARED_TRAIN_HPARAMS \
        --train_tasks almond_natural_seq2seq \
        --train_batch_tokens 100 \
        --val_batch_size 100 \
        --train_iterations 6 \
        --save $workdir/model_$i \
        --data $SRCDIR/dataset/ \
        --model TransformerSeq2Seq \
        --pretrained_model $model
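
    # retraining for 0 iterations presumably just re-saves the pretrained weights (plus the
    # --override_question setting) without any parameter updates, so the saved checkpoint can
    # later be compared against running the raw HuggingFace model directly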
    # train for 0 iterations
    genienlp train \
        $SHARED_TRAIN_HPARAMS \
        --train_tasks almond_natural_seq2seq \
        --train_batch_tokens 100 \
        --val_batch_size 100 \
        --train_iterations 0 \
        --save $workdir/model_$i \
        --data $SRCDIR/dataset/ \
        --model TransformerSeq2Seq \
        --override_question "" \
        --pretrained_model $model

    # greedy prediction
    genienlp predict \
        --tasks almond_paraphrase \
        --evaluate test \
        --path $workdir/model_$i \
        --overwrite \
        --eval_dir $workdir/model_$i/eval_results/ \
        --data $SRCDIR/dataset/ \
        --embeddings $EMBEDDING_DIR \
        --extra_metrics rouge1 rougeL
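
    # --is_hf_model presumably makes predict load $model directly as a HuggingFace checkpoint
    # rather than a genienlp-trained model, so hf_results can be diffed against eval_results below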
    # use as a HuggingFace model directly in genienlp predict
    genienlp predict \
        --tasks almond_paraphrase \
        --evaluate test \
        --path $model \
        --overwrite \
        --eval_dir $workdir/model_$i/hf_results/ \
        --data $SRCDIR/dataset/ \
        --embeddings $EMBEDDING_DIR \
        --pred_languages en \
        --model TransformerSeq2Seq \
        --min_output_length 1 \
        --max_output_length 150 \
        --val_batch_size 100 \
        --is_hf_model

    # check that the result files exist
    if test ! -f $workdir/model_$i/eval_results/test/almond_paraphrase.tsv || \
       test ! -f $workdir/model_$i/eval_results/test/almond_paraphrase.results.json || \
       test ! -f $workdir/model_$i/hf_results/test/almond_paraphrase.tsv || \
       test ! -f $workdir/model_$i/hf_results/test/almond_paraphrase.results.json; then
        echo "File not found!"
        exit 1
    fi

    # check that eval_results matches hf_results
    diff -u $workdir/model_$i/hf_results/test/almond_paraphrase.tsv $workdir/model_$i/eval_results/test/almond_paraphrase.tsv

    rm -rf $workdir/model_$i
    i=$((i+1))
done


# tests for the old paraphrasing code
cp -r $SRCDIR/dataset/paraphrasing/ $workdir/paraphrasing/
for model in "sshleifer/bart-tiny-random" ; do
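
    # derive a short suffix for the output file names from the model name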
    if [[ $model == *gpt2* ]] ; then
        model_type="gpt2"
    elif [[ $model == */bart* ]] ; then
        model_type="bart"
    fi

    # use a pre-trained model to paraphrase almond's train set
    genienlp run-paraphrase \
        --model_name_or_path $model \
        --length 15 \
        --temperature 0.4 \
        --repetition_penalty 1.0 \
        --num_samples 4 \
        --input_file $SRCDIR/dataset/almond/train.tsv \
        --input_column 1 \
        --output_file $workdir/generated_"$model_type".tsv \
        --task paraphrase

    # check if result file exists
    if test ! -f $workdir/generated_"$model_type".tsv ; then
        echo "File not found!"
        exit 1
    fi
    rm -rf $workdir/generated_"$model_type".tsv
    rm -rf $workdir/"$model_type"

done


# masked paraphrasing tests
cp -r $SRCDIR/dataset/paraphrasing/ $workdir/masked_paraphrasing/

for model in "sshleifer/bart-tiny-random" "sshleifer/tiny-mbart" ; do

    if [[ $model == *mbart* ]] ; then
        model_type="mbart"
    elif [[ $model == *bart* ]] ; then
        model_type="bart"
    fi

    # use a pre-trained model to infill masked text spans in the dev set
    genienlp run-paraphrase \
        --model_name_or_path $model \
        --length 15 \
        --temperature 0 \
        --repetition_penalty 1.0 \
        --num_samples 1 \
        --batch_size 3 \
        --input_file $workdir/masked_paraphrasing/dev.tsv \
        --input_column 0 \
        --gold_column 1 \
        --output_file $workdir/generated_"$model_type".tsv \
        --skip_heuristics \
        --task paraphrase \
        --infill_text \
        --num_text_spans 1 \
        --src_lang en \
        --tgt_lang en
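
    # the sts input pastes columns 1-2 of dev.tsv, the generated paraphrase (column 2 of the
    # run-paraphrase output), and column 3 of dev.tsv side by side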
    # create input file for sts filtering
    paste <(cut -f1-2 $workdir/masked_paraphrasing/dev.tsv) <(cut -f2 $workdir/generated_"$model_type".tsv) <(cut -f3 $workdir/masked_paraphrasing/dev.tsv) > $workdir/sts_input_"$model_type".tsv

    # calculate sts score for paraphrases
    genienlp sts-calculate-scores \
        --input_file $workdir/sts_input_"$model_type".tsv \
        --output_file $workdir/sts_output_score_"$model_type".tsv
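
    # with a constant filtering metric, the 0.98 threshold presumably keeps only paraphrases
    # whose sts score is at least 0.98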
    # filter paraphrases based on sts score
    genienlp sts-filter \
        --input_file $workdir/sts_output_score_"$model_type".tsv \
        --output_file $workdir/sts_output_"$model_type".tsv \
        --filtering_metric constant \
        --filtering_threshold 0.98

    if test ! -f $workdir/generated_"$model_type".tsv || test ! -f $workdir/sts_output_"$model_type".tsv ; then
        echo "File not found!"
        exit 1
    fi

done

rm -fr $workdir
rm -rf $SRCDIR/torch-shm-fi