File size: 5,361 Bytes
9e826e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
#!/usr/bin/env bash
# repo_dir: root directory of the project (the directory that contains this script)
repo_dir="$( cd "$( dirname "$0" )" && pwd )"
echo "==== Working directory: ====" >&2
echo "${repo_dir}" >&2
echo "============================" >&2

# Positional arguments:
#   $1 - test configuration, handed to scripts/load_config.sh
#   $2 - model directory (see NOTE below)
#   $3 - which checkpoints to evaluate: all|best|last
test_config=$1
# ${test_config} is left unquoted on purpose: when it is empty it must vanish
# instead of turning into an empty first argument for the sourced script.
source "${repo_dir}/scripts/load_config.sh" ${test_config} "${repo_dir}"
model_dir=$2
choice=$3 # all|best|last
# NOTE(review): the value taken from $2 is replaced right away by the fixed
# location below, so the second argument currently has no effect.
model_dir=${repo_dir}/model
data_dir=${repo_dir}/data
res_path=${model_dir}/results
mkdir -p "${model_dir}" "${data_dir}" "${res_path}"

#######################################
# Copy every configured test set into ${data_dir} and record it.
# Test sets are read through the variables
#   data_testset_<i>_name, data_testset_<i>_path,
#   data_testset_<i>_ref,  data_testset_<i>_direction
# for i = 1, 2, ... and the scan stops at the first index whose path or
# direction is empty.
# Globals:
#   data_dir      (read)
#   testsets      (written) ':'-joined list of <direction>/<name>
#   testset_list  (written) the same entries as an array
#######################################
collect_testsets () {
  local idx=1
  local name_var path_var ref_var direc_var
  local dataname direc

  testsets=""
  while :; do
    # Every index uses the same data_ prefix; ${!var} reads the variable
    # whose name is stored in var.
    name_var=data_testset_${idx}_name
    path_var=data_testset_${idx}_path
    ref_var=data_testset_${idx}_ref
    direc_var=data_testset_${idx}_direction
    [[ -n ${!path_var} && -n ${!direc_var} ]] || break

    dataname=${!name_var}
    direc=${!direc_var}
    mkdir -p "${data_dir}/${direc}/${dataname}" "${data_dir}/ref/${direc}/${dataname}"
    cp "${!path_var}"/* "${data_dir}/${direc}/${dataname}/"
    if [[ -n ${!ref_var} ]]; then
      cp "${!ref_var}"/* "${data_dir}/ref/${direc}/${dataname}/"
    else
      # An empty reference directory would make the glob expand to /*.
      echo "${ref_var} is empty, no references copied for ${direc}/${dataname}" >&2
    fi

    # Append with a ':' separator, except for the first entry.
    testsets=${testsets:+${testsets}:}${direc}/${dataname}
    idx=$((idx+1))
  done
  IFS=':' read -r -a testset_list <<< "${testsets}"
}
collect_testsets
#######################################
# Post-process one generation result file and print its BLEU score.
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - generation result file
#   $4 - reference file (passed on to scripts/utils.py)
# Globals:
#   repo_dir  (read)
#   tokenizer (read) a non-empty value already set by the caller is kept
#             unless the target language is zh or ja
# Outputs:
#   stdout: third whitespace-separated field of sacrebleu's --short output
#   stderr: diagnostics
# Exit:
#   Calls `exit 1` (not `return`) on failure, so it is meant to be invoked
#   through command substitution.
#######################################
bleu () {
  local src=$1
  local tgt=$2
  local res_file=$3
  local ref_file=$4
  local tokenizer=${tokenizer:-}
  local f_dirname input_file output_file lang_token

  if [[ ! -f ${res_file} ]]; then
    echo "${res_file} not exist!" >&2
    exit 1
  fi

  f_dirname=$(dirname "${res_file}")
  python3 "${repo_dir}/scripts/utils.py" "${res_file}" "${ref_file}" || exit 1
  # hypo.out.nobpe and ref.out are read from the result file's directory;
  # presumably utils.py writes them there -- confirm against utils.py.
  input_file="${f_dirname}/hypo.out.nobpe"
  output_file="${f_dirname}/hypo.out.nobpe.final"
  lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"

  case "${tgt}" in
    zh) tokenizer="zh" ;;
    ja) tokenizer="ja-mecab" ;;
  esac
  [[ -n ${tokenizer} ]] || tokenizer="none"

  # French: turn "quoted" spans into « quoted ». Every language: drop the
  # language token together with the blank(s) that follow it.
  {
    if [[ ${tgt} == "fr" ]]; then
      sed -Ee 's/"([^"]*)"/« \1 »/g' "${input_file}"
    else
      cat "${input_file}"
    fi
  } | sed -e "s|${lang_token}  *||g" > "${output_file}" \
    || { echo "post-processing ${input_file} FAILED !" >&2; exit 1; }

  sacrebleu -l "${src}-${tgt}" -tok "${tokenizer}" --short "${f_dirname}/ref.out" \
    < "${output_file}" | awk '{print $3}'
}
# monitor
# Result files are expected at
#   ${res_path}/${ckptname}/${direction}/${testname}/orig.txt
# A background subshell watches ${res_path} recursively. Each time a file
# whose name ends in "txt" has been written and closed, it is scored with
# bleu, and one tab-separated line (timestamp, checkpoint, test set, score)
# is printed and appended to ${model_dir}/summary.log.
(inotifywait -r -m -e close_write "${res_path}" |
  while read -r path action file; do
    if [[ "$file" =~ .*txt$ ]]; then
      # ${path} ends with '/'; peel the last three directory names off it.
      tmp_str="${path%/*}"
      testname="${tmp_str##*/}"
      tmp_str="${tmp_str%/*}"
      direction="${tmp_str##*/}"
      tmp_str="${tmp_str%/*}"
      ckptname="${tmp_str##*/}"
      # direction has the form <src>2<tgt>
      src_lang="${direction%2*}"
      tgt_lang="${direction##*2}"
      res_file=$path$file
      ref_file=${data_dir}/ref/${direction}/${testname}/dev.${tgt_lang}
      bleuscore=$(bleu "${src_lang}" "${tgt_lang}" "${res_file}" "${ref_file}")
      printf -v bleu_str '%s\t%s\t%s\t%s' \
        "$(date "+%Y-%m-%d %H:%M:%S")" "${ckptname}" "${direction}/${testname}" "${bleuscore}"
      printf '%s\n' "${bleu_str}" # to stdout
      printf '%s\n' "${bleu_str}" >> "${model_dir}/summary.log"
    fi
  done) &
# Pick the checkpoint file(s) to evaluate according to ${choice}.
# filelist is a whitespace-separated string that is word-split by the
# inference loop, so checkpoint names must not contain whitespace.
case "${choice}" in
  all)
    # Every entry of the model directory whose name ends in ".pt", newest
    # first (ls -t orders by modification time).
    filelist=$(ls -t -- "${model_dir}" | grep '\.pt$' | tr '\n' ' ')
    ;;
  best)
    filelist="${model_dir}/checkpoint_best.pt"
    ;;
  last)
    filelist="${model_dir}/checkpoint_last.pt"
    ;;
  *)
    echo "invalid choice!" >&2
    exit 2
    ;;
esac
# N is the modulus of the job counter in the inference loop; fall back to 1
# when NUM_GPU is unset or empty so that the modulo stays well-formed.
N=${NUM_GPU:-1}
#export CUDA_VISIBLE_DEVICES=$(seq -s ',' 0 $(($N - 1)) )
#######################################
# Build (but do not run) the command line that generates output for one test
# set with one checkpoint and keeps the lines matching [SHPT]-<number>.
# Arguments:
#   $1 - test data directory given to fairseq-generate
#   $2 - checkpoint path (--path)
#   $3 - value for CUDA_VISIBLE_DEVICES
#   $4 - file that receives the filtered output
#   $5 - source language code
#   $6 - target language code
# Globals (read only):
#   repo_dir, options,
#   max_source_positions, max_target_positions (1024 each when empty)
# Outputs:
#   The command as a single line on stdout, meant to be passed to eval.
#######################################
infer_test () {
  local test_path=$1
  local ckpts=$2
  local gpu=$3
  local final_res_file=$4
  local src=$5
  local tgt=$6
  local IFS=' ' # "${parts[*]}" below joins on the first character of IFS
  local lang_token
  lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"

  local -a parts=(
    "CUDA_VISIBLE_DEVICES=${gpu}"
    fairseq-generate "${test_path}"
    --user-dir "${repo_dir}/mcolt"
    -s "${src}"
    -t "${tgt}"
    --skip-invalid-size-inputs-valid-test
    --path "${ckpts}"
    --max-tokens 1024
    --task translation_w_langtok
    ${options:+"${options}"}
    --lang-prefix-tok "${lang_token}"
    --max-source-positions "${max_source_positions:-1024}"
    --max-target-positions "${max_target_positions:-1024}"
    --nbest 1
    # A bracket expression lists single characters, so no '|' between them.
    "| grep -E '[SHPT]-[0-9]+' > ${final_res_file}"
  )
  echo "${parts[*]}"
}
# Make infer_test available to child bash processes.
export -f infer_test
i=0
# Run inference for every (checkpoint, test set) pair in batches of N
# background jobs. The loops run inside their own subshell so that `wait`
# only blocks on the inference jobs started here, not on the background
# monitor launched earlier by the parent shell.
(
  # ${filelist} is split on whitespace on purpose: it is a plain string list.
  for ckpt in ${filelist}; do
    ckptbase=$(basename "$ckpt")
    ckptname="${ckptbase%.*}"
    for testset in "${testset_list[@]}"; do
      direction="${testset%/*}"
      testname="${testset##*/}"
      # direction has the form <src>2<tgt>
      src_lang="${direction%2*}"
      tgt_lang="${direction##*2}"
      # i is reduced modulo N; each time it comes back to 0 the previous
      # batch is awaited. After the increment, i-1 (0..N-1) is the GPU index.
      ((i=i%N)); ((i++==0)) && wait
      test_path=${data_dir}/${testset}
      echo "-----> ${ckptname} | ${direction}/${testname} <-----" >&2
      out_dir="${res_path}/${ckptname}/${direction}/${testname}"
      mkdir -p "${out_dir}"
      final_res_file="${out_dir}/orig.txt"
      command=$(infer_test "${test_path}" "${model_dir}/${ckptname}.pt" $((i-1)) \
        "${final_res_file}" "${src_lang}" "${tgt_lang}")
      echo "${command}"
      # Quoted so the command text reaches eval without prior word
      # splitting or pathname expansion.
      eval "${command}" &
    done
  done
  # Let the final batch finish before the subshell ends.
  wait
)
|