#!/bin/bash
# Run this script and fix *any* error before sending PR.
# For repeated runs, set the environment variables
# SKIP_DOWNLOADS=1 If files/uncompressed dirs exist don't download (if compressed files exist, just untar).
# SKIP_FULL_CLEAN=1 Don't remove anything downloaded/uncompressed.
LOG_FILE=/tmp/$$_pull_request_chk.log
echo > ${LOG_FILE} # Empty the log file.
PROJECT_ROOT="$(dirname "$0")/../../"
DATA_DIR="$PROJECT_ROOT/data"
TEST_DIR="$PROJECT_ROOT/onmt/tests"
PYTHON="python3"
TMP_OUT_DIR="/tmp/onmt_prchk"
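# Make sure the scratch dir exists before anything writes into it
# (defensive; the tools below may or may not create it themselves).
mkdir -p ${TMP_OUT_DIR}
# NOTE: train.py / translate.py are invoked via relative paths, so this
# script is expected to run from the repository root.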
clean_up()
{
    if [[ "$1" != "error" ]]; then
        rm -f ${LOG_FILE}
    fi
    if [[ "${SKIP_FULL_CLEAN}" == "1" ]]; then
        # delete only the generated .pt's; keep anything downloaded/uncompressed
        rm -f $TMP_OUT_DIR/*.pt
    else
        # delete all .pt's and the scratch dir itself
        rm -f $TMP_OUT_DIR/*.pt
        rm -rf $TMP_OUT_DIR/sample
        rm -f $TMP_OUT_DIR/onmt.vocab*
        rm -d $TMP_OUT_DIR
    fi
}
# SIGKILL cannot be trapped, so clean up on INT/QUIT/TERM instead.
trap clean_up SIGINT SIGQUIT SIGTERM
error_exit()
{
    echo "Failed!" | tee -a ${LOG_FILE}
    echo "[!] Check ${LOG_FILE} for details."
    clean_up error
    exit 1
}
# environment_prepare()
# {
# }
# flake8 check
echo -n "[+] Doing flake8 check..."
${PYTHON} -m flake8 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
# unittest
echo -n "[+] Doing unittest check..."
${PYTHON} -m unittest discover >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
#
# Get Vocabulary test
#
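# build_vocab reads the corpora declared in data.yaml; -n_sample 5000 caps
# how many lines are sampled, while -n_sample -1 (in the features run below)
# goes through the full corpus.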
echo -n "[+] Testing vocabulary building..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} onmt/bin/build_vocab.py \
-config ${DATA_DIR}/data.yaml \
-save_data $TMP_OUT_DIR/onmt \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-n_sample 5000 -overwrite >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -f -r $TMP_OUT_DIR/sample
echo -n "[+] Testing vocabulary building with features..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} onmt/bin/build_vocab.py \
-config ${DATA_DIR}/features_data.yaml \
-save_data $TMP_OUT_DIR/onmt_feat \
-src_vocab $TMP_OUT_DIR/onmt_feat.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt_feat.vocab.tgt \
-src_feats_vocab "{\"feat0\": \"${TMP_OUT_DIR}/onmt_feat.vocab.feat0\"}" \
-n_sample -1 -overwrite >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -f -r $TMP_OUT_DIR/sample
#
# Training test
#
echo -n "[+] Testing NMT fields/transforms preparation..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-save_data $TMP_OUT_DIR/onmt.train.check \
-dump_fields -dump_transforms -n_sample 30 \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
# rm $TMP_OUT_DIR/onmt.train.check* # used in tool testing
echo "[+] Doing Training test..."
echo -n " [+] Testing NMT training..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -report_every 5 \
-rnn_size 10 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing NMT training w/ copy..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -report_every 5 \
-rnn_size 10 -train_steps 10 \
-copy_attn >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing NMT training w/ align..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/align_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-max_generator_batches 0 \
-encoder_type transformer -decoder_type transformer \
-layers 4 -word_vec_size 16 -rnn_size 16 -heads 2 -transformer_ff 64 \
-lambda_align 0.05 -alignment_layer 2 -alignment_heads 0 \
-report_every 5 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing NMT training w/ coverage..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -report_every 5 \
-coverage_attn true -lambda_coverage 0.1 \
-rnn_size 10 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing LM training..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing LM training w/ copy..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10 \
-copy_attn >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
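# Checkpoint vocab update: train 10 steps and save a checkpoint, prepend a
# new token to the saved src vocab, then resume with -update_vocab so the
# checkpoint's vocabulary and embeddings are updated to match.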
echo -n " [+] Testing Checkpoint Vocabulary Update..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -rnn_size 10 \
-report_every 5 -train_steps 10 \
-save_model $TMP_OUT_DIR/onmt.model \
-save_checkpoint_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
sed -i '1s/^/new_tok\t100000000\n/' $TMP_OUT_DIR/onmt.vocab.src >> ${LOG_FILE} 2>&1
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.tgt \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -rnn_size 10 \
-report_every 5 -train_steps 20 \
-update_vocab -reset_optim "states" \
-train_from $TMP_OUT_DIR/onmt.model_step_10.pt >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
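# Same vocab-update flow as above, but resuming a transformer LM checkpoint.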
echo -n " [+] Testing Checkpoint Vocabulary Update with LM..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-save_model $TMP_OUT_DIR/lm.onmt.model \
-save_checkpoint_steps 10 \
-rnn_size 16 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
sed -i '1s/^/new_tok\t100000000\n/' $TMP_OUT_DIR/onmt.vocab.src >> ${LOG_FILE} 2>&1
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 20 \
-update_vocab -reset_optim "states" \
-train_from $TMP_OUT_DIR/lm.onmt.model_step_10.pt >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
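# The GGNN test uses its own graph dataset (ggnn_data.yaml) and a ggnn
# encoder feeding an RNN decoder; -n_edge_types, -state_dim and -n_node are
# graph-encoder-specific hyperparameters.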
echo -n " [+] Testing Graph Neural Network training..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/ggnn_data.yaml \
-src_seq_length 1000 -tgt_seq_length 30 \
-encoder_type ggnn -layers 2 \
-decoder_type rnn -rnn_size 256 \
-learning_rate 0.1 -learning_rate_decay 0.8 \
-global_attention general -batch_size 32 -word_vec_size 256 \
-bridge -train_steps 10 -n_edge_types 9 -state_dim 256 \
-n_steps 10 -n_node 64 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
echo -n " [+] Testing training with features..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/features_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt_feat.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt_feat.vocab.tgt \
-src_feats_vocab "{\"feat0\": \"${TMP_OUT_DIR}/onmt_feat.vocab.feat0\"}" \
-src_vocab_size 1000 -tgt_vocab_size 1000 \
-batch_size 10 \
-word_vec_size 5 -rnn_size 10 \
-report_every 5 -train_steps 10 \
-save_model $TMP_OUT_DIR/onmt.features.model \
-save_checkpoint_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -f $TMP_OUT_DIR/onmt.vocab*
rm -f $TMP_OUT_DIR/onmt.model*
rm -f $TMP_OUT_DIR/onmt_feat.vocab.*
#
# Translation test
#
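# These tests decode with the small fixed checkpoints shipped in onmt/tests
# (test_model.pt, test_model2.pt, test_model_lm.pt).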
echo "[+] Doing translation test..."
echo -n " [+] Testing NMT translation..."
head ${DATA_DIR}/src-test.txt > $TMP_OUT_DIR/src-test.txt
${PYTHON} translate.py -model ${TEST_DIR}/test_model.pt -src $TMP_OUT_DIR/src-test.txt -verbose >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/src-test.txt
echo -n " [+] Testing NMT translation with features..."
${PYTHON} translate.py \
-model ${TMP_OUT_DIR}/onmt.features.model_step_10.pt \
-src ${DATA_DIR}/data_features/src-test.txt \
-src_feats "{'feat0': '${DATA_DIR}/data_features/src-test.feat0'}" \
-verbose >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm -f $TMP_OUT_DIR/onmt.features.model*
echo -n " [+] Testing NMT ensemble translation..."
head ${DATA_DIR}/src-test.txt > $TMP_OUT_DIR/src-test.txt
${PYTHON} translate.py -model ${TEST_DIR}/test_model.pt ${TEST_DIR}/test_model.pt \
-src $TMP_OUT_DIR/src-test.txt -verbose >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/src-test.txt
echo -n " [+] Testing NMT translation w/ Beam search..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model2.pt \
-src ${DATA_DIR}/morph/src.valid \
-verbose -batch_size 10 \
-beam_size 10 \
-tgt ${DATA_DIR}/morph/tgt.valid \
-out $TMP_OUT_DIR/trans_beam >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/morph/tgt.valid $TMP_OUT_DIR/trans_beam
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/trans_beam
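# With -random_sampling_topk -1 and a near-zero temperature, sampling is
# effectively greedy, so the output should match tgt.valid exactly.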
echo -n " [+] Testing NMT translation w/ Random Sampling..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model2.pt \
-src ${DATA_DIR}/morph/src.valid \
-verbose -batch_size 10 \
-beam_size 1 \
-seed 1 \
-random_sampling_topk -1 \
-random_sampling_temp 0.0001 \
-tgt ${DATA_DIR}/morph/tgt.valid \
-out $TMP_OUT_DIR/trans_sampling >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/morph/tgt.valid $TMP_OUT_DIR/trans_sampling
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/trans_sampling
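# LM generation output is diffed against stored solution files
# (gen-*-sol.txt); the fixed -seed values keep the sampling runs reproducible.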
echo -n " [+] Testing LM generation..."
head ${DATA_DIR}/src-test.txt > $TMP_OUT_DIR/src-test.txt
${PYTHON} translate.py -model ${TEST_DIR}/test_model_lm.pt -src $TMP_OUT_DIR/src-test.txt -verbose >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/src-test.txt
echo -n " [+] Testing LM generation w/ Beam search..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model_lm.pt \
-src ${DATA_DIR}/data_lm/src-gen.txt \
-verbose -batch_size 10 \
-beam_size 10 \
-ban_unk_token \
-out $TMP_OUT_DIR/gen_beam >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/data_lm/gen-beam-sol.txt $TMP_OUT_DIR/gen_beam
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/gen_beam
echo -n " [+] Testing LM generation w/ Random Sampling..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model_lm.pt \
-src ${DATA_DIR}/data_lm/src-gen.txt \
-verbose -batch_size 10 \
-beam_size 1 \
-seed 1 \
-random_sampling_topk -1 \
-random_sampling_temp 0.0001 \
-ban_unk_token \
-out $TMP_OUT_DIR/gen_sampling >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/data_lm/gen-sampling-sol.txt $TMP_OUT_DIR/gen_sampling
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/gen_sampling
echo -n " [+] Testing LM generation w/ Random Top-k/Nucleus Sampling..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model_lm.pt \
-src ${DATA_DIR}/data_lm/src-gen.txt \
-verbose -batch_size 10 \
-beam_size 1 \
-seed 3 \
-random_sampling_topk -1 \
-random_sampling_topp 0.95 \
-random_sampling_temp 1 \
-ban_unk_token \
-out $TMP_OUT_DIR/gen_sampling >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/data_lm/gen-nucleus-sampling-sol.txt $TMP_OUT_DIR/gen_sampling
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/gen_sampling
echo -n " [+] Testing LM generation w/ Random Top-k/Nucleus Sampling and multi beams..."
${PYTHON} translate.py -model ${TEST_DIR}/test_model_lm.pt \
-src ${DATA_DIR}/data_lm/src-gen.txt \
-verbose -batch_size 10 \
-beam_size 10 \
-seed 2 \
-random_sampling_topk 50 \
-random_sampling_topp 0.95 \
-random_sampling_temp 1 \
-length_penalty avg \
-ban_unk_token \
-min_length 5 \
-out $TMP_OUT_DIR/gen_sampling >> ${LOG_FILE} 2>&1
diff ${DATA_DIR}/data_lm/gen-sampling-beams-sol.txt $TMP_OUT_DIR/gen_sampling
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/gen_sampling
#
# Tools test
#
echo "[+] Doing tools test..."
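# These reuse the onmt.train.check* artifacts dumped by the fields/transforms
# test above (which is why they were not removed there).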
echo -n " [+] Doing extract vocabulary test..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} ./tools/extract_vocabulary.py \
-file $TMP_OUT_DIR/onmt.train.check.vocab.pt -file_type field -side src \
-out_file $TMP_OUT_DIR/vocab.txt >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
# 1000 vocabulary words plus special tokens should yield 1002 lines.
if ! wc -l $TMP_OUT_DIR/vocab.txt | grep -qF "1002"; then
    echo "wrong word count" >> ${LOG_FILE}
    wc -l $TMP_OUT_DIR/vocab.txt >> ${LOG_FILE}
    error_exit
fi
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/vocab.txt
echo -n " [+] Doing embeddings to torch test..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} ./tools/embeddings_to_torch.py \
-emb_file_enc ${TEST_DIR}/sample_glove.txt \
-emb_file_dec ${TEST_DIR}/sample_glove.txt \
-dict_file $TMP_OUT_DIR/onmt.train.check.vocab.pt \
-output_file $TMP_OUT_DIR/q_gloveembeddings >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
rm $TMP_OUT_DIR/q_gloveembeddings*
echo -n " [+] Doing extract embeddings test..."
PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH} ${PYTHON} tools/extract_embeddings.py \
-model onmt/tests/test_model.pt >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}
# Finally, clean up
clean_up