From 25ebee39648a3b643a4ac4e2a2ce61af1b7b1a8d Mon Sep 17 00:00:00 2001 From: Sina Date: Wed, 24 Aug 2022 21:20:05 -0700 Subject: [PATCH] Factor out some shared parameters in test cases --- tests/lib.sh | 6 ++-- tests/test_NED.sh | 8 +---- tests/test_calibration.sh | 8 +---- tests/test_cuda.sh | 8 +---- tests/test_e2e_dialogues.sh | 10 ++---- tests/test_kfserver.sh | 8 +---- tests/test_main_almond.sh | 7 +---- tests/test_paraphrasing.sh | 10 ++---- tests/test_sequence_classification.sh | 45 ++++++++++++++++++++++++--- tests/test_token_classification.sh | 24 +++++--------- tests/test_translation.sh | 20 ++++-------- 11 files changed, 66 insertions(+), 88 deletions(-) diff --git a/tests/lib.sh b/tests/lib.sh index 978f080e..86b69f4e 100755 --- a/tests/lib.sh +++ b/tests/lib.sh @@ -4,8 +4,8 @@ SRCDIR=`dirname $0` on_error () { - rm -fr $workdir - rm -rf $SRCDIR/torch-shm-file-* + rm -fr $workdir + rm -rf $SRCDIR/torch-shm-file-* } # allow faster local testing @@ -17,6 +17,8 @@ else fi export SENTENCE_TRANSFORMERS_HOME="$EMBEDDING_DIR" +# parameters that are commonly passed to `genienlp train` test cases +export SHARED_TRAIN_HPARAMS="--embeddings $EMBEDDING_DIR --exist_ok --no_commit --preserve_case --save_every 2 --log_every 2 --val_every 2" TMPDIR=`pwd` workdir=`mktemp -d $TMPDIR/genieNLP-tests-XXXXXX` diff --git a/tests/test_NED.sh b/tests/test_NED.sh index af1507e9..03db0615 100755 --- a/tests/test_NED.sh +++ b/tests/test_NED.sh @@ -16,21 +16,15 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond_dialogue_nlu \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 6 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ --database_dir $SRCDIR/database/ \ --data $SRCDIR/dataset/thingpedia_99/ \ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit \ --do_ned \ --min_entity_len 2 \ --max_entity_len 4 \ diff 
--git a/tests/test_calibration.sh b/tests/test_calibration.sh index a3e07106..849b435b 100644 --- a/tests/test_calibration.sh +++ b/tests/test_calibration.sh @@ -10,19 +10,13 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 6 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ --data $SRCDIR/dataset/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit \ $hparams # greedy prediction diff --git a/tests/test_cuda.sh b/tests/test_cuda.sh index 5c5e4b1e..5850ab71 100755 --- a/tests/test_cuda.sh +++ b/tests/test_cuda.sh @@ -10,19 +10,13 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 2 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ --data $SRCDIR/dataset/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit \ $hparams # generate a long sequence diff --git a/tests/test_e2e_dialogues.sh b/tests/test_e2e_dialogues.sh index 3d4ecb98..619079ff 100755 --- a/tests/test_e2e_dialogues.sh +++ b/tests/test_e2e_dialogues.sh @@ -19,20 +19,15 @@ for i in ${!hparams[*]}; do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks ${tasks[i]} \ --train_batch_tokens 100 \ --val_batch_size 300 \ --train_iterations 4 \ --min_output_length 2 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ - --data $SRCDIR/dataset/bitod \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit ${hparams[i]} + --data $SRCDIR/dataset/bitod \ + ${hparams[i]} # greedy prediction genienlp predict \ diff --git a/tests/test_kfserver.sh b/tests/test_kfserver.sh index b9a1ddfd..f821a528 100644 --- a/tests/test_kfserver.sh +++ b/tests/test_kfserver.sh @@ -10,19 +10,13 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks
almond \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 6 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ --data $SRCDIR/dataset/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit \ $hparams # run kfserver in background diff --git a/tests/test_main_almond.sh b/tests/test_main_almond.sh index 89614463..5c669afc 100755 --- a/tests/test_main_almond.sh +++ b/tests/test_main_almond.sh @@ -15,18 +15,13 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 4 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 --val_every 2 \ --save $workdir/model_$i \ --data $SRCDIR/dataset/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit \ $hparams # greedy prediction diff --git a/tests/test_paraphrasing.sh b/tests/test_paraphrasing.sh index 74c9474d..1ebd6333 100755 --- a/tests/test_paraphrasing.sh +++ b/tests/test_paraphrasing.sh @@ -10,21 +10,15 @@ for model in \ # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond_natural_seq2seq \ --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 6 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ --data $SRCDIR/dataset/ \ --model TransformerSeq2Seq \ - --pretrained_model $model \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit + --pretrained_model $model # greedy prediction genienlp predict \ diff --git a/tests/test_sequence_classification.sh b/tests/test_sequence_classification.sh index 06a0e235..d3e9a75a 100755 --- a/tests/test_sequence_classification.sh +++ b/tests/test_sequence_classification.sh @@ -4,7 +4,20 @@ # Test ood task # train -genienlp train --train_tasks ood_task --model TransformerForSequenceClassification --pretrained_model distilbert-base-uncased --min_output_length 1 --save $workdir/model --train_iterations 20 --save_every 
10 --log_every 10 --val_every 10 --data $SRCDIR/dataset/ood/ --force_fast_tokenizer --train_batch_tokens 200 --num_print 0 +genienlp train \ + --train_tasks ood_task \ + --model TransformerForSequenceClassification \ + --pretrained_model distilbert-base-uncased \ + --min_output_length 1 \ + --save $workdir/model \ + --train_iterations 20 \ + --save_every 10 \ + --log_every 10 \ + --val_every 10 \ + --data $SRCDIR/dataset/ood/ \ + --force_fast_tokenizer \ + --train_batch_tokens 200 \ + --num_print 0 # greedy prediction genienlp predict --tasks ood_task --evaluate valid --pred_set_name eval --path $workdir/model --overwrite --eval_dir $workdir/model/eval_results/ --data $SRCDIR/dataset/ood/ --embeddings $EMBEDDING_DIR --val_batch_size 200 @@ -23,15 +36,37 @@ rm -rf $workdir/model # Test bitod_error_cls task # train -genienlp train --train_tasks bitod_error_cls --model TransformerForSequenceClassification --pretrained_model distilbert-base-uncased --min_output_length 1 --save $workdir/model_error/ --train_iterations 100 --save_every 50 --log_every 50 --val_every 50 --data $SRCDIR/dataset/bitod_error/ --force_fast_tokenizer --train_batch_tokens 200 --num_print 0 +genienlp train \ + --train_tasks bitod_error_cls \ + --model TransformerForSequenceClassification \ + --pretrained_model distilbert-base-uncased \ + --min_output_length 1 \ + --save $workdir/model_error/ \ + --train_iterations 100 \ + --save_every 50 \ + --log_every 50 \ + --val_every 50 \ + --data $SRCDIR/dataset/bitod_error/ \ + --force_fast_tokenizer \ + --train_batch_tokens 200 \ + --num_print 0 # greedy prediction -genienlp predict --tasks bitod_error_cls --evaluate valid --pred_set_name valid --path $workdir/model_error --overwrite --eval_dir $workdir/model_error/eval_results/ --data $SRCDIR/dataset/bitod_error/ --embeddings $EMBEDDING_DIR --val_batch_size 200 +genienlp predict \ + --tasks bitod_error_cls \ + --evaluate valid \ + --pred_set_name valid \ + --path $workdir/model_error \ + --overwrite \ + 
--eval_dir $workdir/model_error/eval_results/ \ + --data $SRCDIR/dataset/bitod_error/ \ + --embeddings $EMBEDDING_DIR \ + --val_batch_size 200 # check if result file exists if test ! -f $workdir/model_error/eval_results/valid/bitod_error_cls.tsv ; then - echo "File not found!" - exit 1 + echo "File not found!" + exit 1 fi # check if predictions matches expected_results diff --git a/tests/test_token_classification.sh b/tests/test_token_classification.sh index 8a5c95bc..06515730 100755 --- a/tests/test_token_classification.sh +++ b/tests/test_token_classification.sh @@ -11,20 +11,17 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks cross_ner \ --model TransformerForTokenClassification \ --pretrained_model bert-base-cased \ - --force_fast_tokenizer --train_batch_tokens 200 \ + --force_fast_tokenizer \ + --train_batch_tokens 200 \ --val_batch_size 200 \ - --train_iterations 4 --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ + --train_iterations 4 \ --save $workdir/model_$i \ - --data $SRCDIR/dataset/cross_ner/ \ - --embeddings $EMBEDDING_DIR $hparams \ - --exist_ok \ - --no_commit + --data $SRCDIR/dataset/cross_ner/ \ + $hparams # greedy prediction genienlp predict \ @@ -59,6 +56,7 @@ do # train genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks conll2003 \ --crossner_domains music \ --model TransformerForTokenClassification \ @@ -68,15 +66,9 @@ do --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 4 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ - --data $SRCDIR/dataset/cross_ner/ \ - --embeddings $EMBEDDING_DIR $hparams \ - --exist_ok \ - --no_commit + --data $SRCDIR/dataset/cross_ner/ \ + $hparams # greedy prediction genienlp predict \ diff --git a/tests/test_translation.sh b/tests/test_translation.sh index 23a6e0e6..6edad81f 100755 --- a/tests/test_translation.sh +++ b/tests/test_translation.sh @@ -21,6 +21,7 @@ for model in "Helsinki-NLP/opus-mt-en-de" ; do # save
model genienlp train \ + $SHARED_TRAIN_HPARAMS \ --train_tasks almond_translate \ --train_languages en \ --train_tgt_languages de \ @@ -31,11 +32,7 @@ for model in "Helsinki-NLP/opus-mt-en-de" ; do --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 0 \ - --preserve_case \ - --save $workdir/model_$i \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit + --save $workdir/model_$i # translate entities genienlp predict \ @@ -89,7 +86,9 @@ for model in "Helsinki-NLP/opus-mt-en-de" "sshleifer/tiny-mbart" ; do cp $workdir/translation/almond/train.tsv $workdir/translation/almond/eval.tsv # train - genienlp train --train_tasks almond_translate \ + genienlp train \ + $SHARED_TRAIN_HPARAMS \ + --train_tasks almond_translate \ --do_alignment \ --train_languages en \ --train_tgt_languages de \ @@ -100,15 +99,8 @@ for model in "Helsinki-NLP/opus-mt-en-de" "sshleifer/tiny-mbart" ; do --train_batch_tokens 100 \ --val_batch_size 100 \ --train_iterations 6 \ - --preserve_case \ - --save_every 2 \ - --log_every 2 \ - --val_every 2 \ --save $workdir/model_$i \ - --data $workdir/translation/ \ - --exist_ok \ - --embeddings $EMBEDDING_DIR \ - --no_commit + --data $workdir/translation/ # greedy prediction genienlp predict \