From b3bbad24c5030fadb01871be2d50ae839ad85194 Mon Sep 17 00:00:00 2001
From: mehrad
Date: Mon, 4 Mar 2019 15:08:56 -0800
Subject: [PATCH] update tests

---
 .travis.yml                                  |  3 ++-
 decanlp/arguments.py                         |  1 +
 decanlp/predict.py                           | 10 +++++---
 decanlp/server.py                            |  9 +++++---
 decanlp/tests/dataset/{ => almond}/eval.tsv  |  0
 decanlp/tests/dataset/{ => almond}/test.tsv  |  0
 decanlp/tests/dataset/{ => almond}/train.tsv |  0
 decanlp/tests/test.sh                        | 24 +++++++++++++-------
 decanlp/train.py                             |  5 +++-
 9 files changed, 36 insertions(+), 16 deletions(-)
 rename decanlp/tests/dataset/{ => almond}/eval.tsv (100%)
 rename decanlp/tests/dataset/{ => almond}/test.tsv (100%)
 rename decanlp/tests/dataset/{ => almond}/train.tsv (100%)

diff --git a/.travis.yml b/.travis.yml
index f823fbf9..f80faca1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,13 +17,14 @@ install:
   - pip install flake8 # pytest # add another testing frameworks later
   - pip install pipenv
   - pipenv install --dev
+  - python3 ./setup.py install
 before_script:
   # stop the build if there are Python syntax errors or undefined names
   - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics
   # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
   - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 script:
-  - "./tests/test.sh" # pytest --capture=sys # add other tests here
+  - "./decanlp/tests/test.sh" # pytest --capture=sys # add other tests here
 notifications:
   on_success: change
   on_failure: change # `always` will be the setting once code changes slow down
diff --git a/decanlp/arguments.py b/decanlp/arguments.py
index 20cf81a1..5facaa91 100644
--- a/decanlp/arguments.py
+++ b/decanlp/arguments.py
@@ -127,6 +127,7 @@ def parse(argv):
     parser.add_argument('--use_bleu_loss', action='store_true', help='whether to use differentiable BLEU loss or not')
     parser.add_argument('--use_maxmargin_loss', action='store_true', help='whether to use max-margin loss or not')
     parser.add_argument('--loss_switch', default=0.666, type=float, help='switch to BLEU loss after certain iterations controlled by this ratio')
+    parser.add_argument('--small_glove', action='store_true', help='Use glove.6B.50d instead of glove.840B.300d')
 
     args = parser.parse_args(argv[1:])
 
diff --git a/decanlp/predict.py b/decanlp/predict.py
index c6b85db3..97a161d5 100644
--- a/decanlp/predict.py
+++ b/decanlp/predict.py
@@ -71,9 +71,12 @@ def prepare_data(args, FIELD):
         args.max_generative_vocab = min(len(FIELD.vocab), args.max_generative_vocab)
         FIELD.append_vocab(new_vocab)
     logger.info(f'Vocabulary has expanded to {len(FIELD.vocab)} tokens')
-
+    logger.info(f'Getting pretrained word vectors')
     char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
-    glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
+    if args.small_glove:
+        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings, name="6B", dim=50)
+    else:
+        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
     vectors = [char_vectors, glove_vectors]
     FIELD.vocab.load_vectors(vectors, True)
     FIELD.decoder_to_vocab = {idx: FIELD.vocab.stoi[word] for idx, word in enumerate(FIELD.decoder_itos)}
@@ -268,7 +271,8 @@ def get_args(argv):
                 'transformer_layers', 'rnn_layers', 'transformer_hidden',
                 'dimension', 'load', 'max_val_context_length', 'val_batch_size',
                 'transformer_heads', 'max_output_length', 'max_generative_vocab',
-                'lower', 'cove', 'intermediate_cove', 'elmo', 'glove_and_char', 'use_maxmargin_loss']
+                'lower', 'cove', 'intermediate_cove', 'elmo', 'glove_and_char',
+                'use_maxmargin_loss', 'small_glove']
     for r in retrieve:
         if r in config:
             setattr(args, r, config[r])
diff --git a/decanlp/server.py b/decanlp/server.py
index 1b80e905..9317752d 100644
--- a/decanlp/server.py
+++ b/decanlp/server.py
@@ -68,7 +68,10 @@ class Server():
         logger.info(f'Vocabulary has {len(self.field.vocab)} tokens from training')
 
         char_vectors = torchtext.vocab.CharNGram(cache=self.args.embeddings)
-        glove_vectors = torchtext.vocab.GloVe(cache=self.args.embeddings)
+        if self.args.small_glove:
+            glove_vectors = torchtext.vocab.GloVe(cache=self.args.embeddings, name="6B", dim=50)
+        else:
+            glove_vectors = torchtext.vocab.GloVe(cache=self.args.embeddings)
         self._vector_collections = [char_vectors, glove_vectors]
 
         self._limited_idx_to_full_idx = deepcopy(self.field.decoder_to_vocab) # should avoid this with a conditional in map to full
@@ -222,8 +225,8 @@ def get_args(argv):
                 'transformer_layers', 'rnn_layers', 'transformer_hidden',
                 'dimension', 'load', 'max_val_context_length', 'val_batch_size',
                 'transformer_heads', 'max_output_length', 'max_generative_vocab',
-                'lower', 'cove', 'intermediate_cove', 'elmo', 'glove_and_char', 'use_maxmargin_loss',
-                'reverse_task_bool']
+                'lower', 'cove', 'intermediate_cove', 'elmo', 'glove_and_char',
+                'use_maxmargin_loss', 'reverse_task_bool', 'small_glove']
     for r in retrieve:
         if r in config:
             setattr(args, r, config[r])
diff --git a/decanlp/tests/dataset/eval.tsv b/decanlp/tests/dataset/almond/eval.tsv
similarity index 100%
rename from decanlp/tests/dataset/eval.tsv
rename to decanlp/tests/dataset/almond/eval.tsv
diff --git a/decanlp/tests/dataset/test.tsv b/decanlp/tests/dataset/almond/test.tsv
similarity index 100%
rename from decanlp/tests/dataset/test.tsv
rename to decanlp/tests/dataset/almond/test.tsv
diff --git a/decanlp/tests/dataset/train.tsv b/decanlp/tests/dataset/almond/train.tsv
similarity index 100%
rename from decanlp/tests/dataset/train.tsv
rename to decanlp/tests/dataset/almond/train.tsv
diff --git a/decanlp/tests/test.sh b/decanlp/tests/test.sh
index 48c48820..7747d7e7 100755
--- a/decanlp/tests/test.sh
+++ b/decanlp/tests/test.sh
@@ -6,10 +6,12 @@ SRCDIR=`dirname $0`
 
 
 # functional tests
-
-#mkdir ./embeddings
-#wget --no-verbose http://nlp.stanford.edu/data/glove.840B.300d.zip ; unzip glove.840B.300d.zip ; mv glove.840B.300d.zip embeddings/ ; rm glove.42B.300d.zip
-#wget --no-verbose http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/jmt_pre-trained_embeddings.tar.gz ; tar -xzvf jmt_pre-trained_embeddings.tar.gz; mv jmt_pre-trained_embeddings embeddings/; rm jmt_pre-trained_embeddings.tar.gz
+function delete {
+    rm -rf $1
+}
+mkdir -p $SRCDIR/embeddings
+curl -O "https://parmesan.stanford.edu/glove/glove.6B.50d.txt.pt" ; mv glove.6B.50d.txt.pt $SRCDIR/embeddings/
+curl -O "https://parmesan.stanford.edu/glove/charNgram.txt.pt" ; mv charNgram.txt.pt $SRCDIR/embeddings/
 
 TMPDIR=`pwd`
 workdir=`mktemp -d $TMPDIR/decaNLP-tests-XXXXXX`
@@ -19,15 +21,21 @@ SRCDIR=`dirname $0`
 for hparams in "" ; do
 
     # train
-    pipenv run python3 $SRCDIR/../train.py --train_tasks almond --train_iterations 4 --preserve_case --save_every 2--log_every 2 --val_every 2 --save $workdir/model_$i --data dataset/ $hparams --exist_ok --skip_cache --no_glove_and_char --elmo 0
+    pipenv run decanlp train --train_tasks almond --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --root "" --embeddings $SRCDIR/embeddings --small_glove
 
     # greedy decode
-    pipenv run python3 $SRCDIR/../predict.py --tasks almond --evaluate test --path ~/$workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data dataset/ --no_glove_and_char --elmo 0
+    pipenv run decanlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $SRCDIR/embeddings
 
     # export prediction results
-    pipenv run python3 $SRCDIR/../utils/post_process_decoded_results.py --original_data dataset/test.tsv --gold_program $workdir/model_$i/eval_results/almond.gold.txt --predicted_program $workdir/model_$i/eval_results/almond.txt --output_file $workdir/model_$i/results.tsv
+    pipenv run python3 $SRCDIR/../utils/post_process_decoded_results.py --original_data $SRCDIR/dataset/almond/test.tsv --gold_program $workdir/model_$i/best/test/almond.gold.txt --predicted_program $workdir/model_$i/best/test/almond.txt --output_file $workdir/model_$i/results.tsv
+
+    # check if result files exist
+    if [ ! -f $workdir/model_$i/results.tsv ] && [ ! -f $workdir/model_$i/results_raw.tsv ]; then
+        echo "File not found!"
+        exit 1
+    fi
 
     i=$((i+1))
 done
 
-trap { rm -rf $workdir } EXIT
\ No newline at end of file
+trap "delete $workdir" EXIT
\ No newline at end of file
diff --git a/decanlp/train.py b/decanlp/train.py
index 8f837ea1..6061d6cf 100644
--- a/decanlp/train.py
+++ b/decanlp/train.py
@@ -115,7 +115,10 @@ def prepare_data(args, field, logger):
     if args.load is None:
         logger.info(f'Getting pretrained word vectors')
         char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
-        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
+        if args.small_glove:
+            glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings, name="6B", dim=50)
+        else:
+            glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
         vectors = [char_vectors, glove_vectors]
         vocab_sets = (train_sets + val_sets) if len(vocab_sets) == 0 else vocab_sets
         logger.info(f'Building vocabulary')
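-- 
Note for reviewers (below the signature delimiter, so not part of the commit):
a minimal sketch of the two torchtext calls the new --small_glove flag
switches between, assuming the torchtext version this repo pins; the cache
directory is illustrative, not taken from the repo:

    import torchtext

    # --small_glove set: load the small 50-dimensional glove.6B vectors,
    # i.e. the glove.6B.50d.txt.pt file test.sh now downloads for CI.
    glove_vectors = torchtext.vocab.GloVe(cache='./embeddings', name='6B', dim=50)

    # --small_glove unset: torchtext's default glove.840B.300d vectors,
    # which are far larger and much slower to download and load.
    glove_vectors = torchtext.vocab.GloVe(cache='./embeddings')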