#!/usr/bin/env bash
#
# Runs checkpoint inference on the configured test sets and logs BLEU scores.
#
# Positional arguments:
#   $1  test config, forwarded to scripts/load_config.sh
#   $2  model directory (stored, then replaced by ${repo_dir}/model below)
#   $3  checkpoint selection: all | best | last

# repo_dir: absolute path of the directory that contains this script.
repo_dir="$(cd "$(dirname "$0")" && pwd)"
echo "==== Working directory: ====" >&2
echo "${repo_dir}" >&2
echo "============================" >&2

test_config=$1
# Quoted so that an empty or space-containing argument cannot shift
# ${repo_dir} into the config slot of load_config.sh.
# shellcheck source=/dev/null
source "${repo_dir}/scripts/load_config.sh" "${test_config}" "${repo_dir}"

# NOTE(review): the value taken from $2 is overwritten a few lines below, so
# the second command-line argument currently has no effect. Kept as-is to
# preserve behaviour; confirm whether $2 was meant to win.
model_dir=$2
choice=$3

model_dir="${repo_dir}/model"
data_dir="${repo_dir}/data"
res_path="${model_dir}/results"

mkdir -p "${model_dir}" "${data_dir}" "${res_path}"

#######################################
# Stage every configured test set under ${data_dir} and record the list.
#
# Test sets are described by numbered variables (N = 1, 2, ...):
#   data_testset_N_name, data_testset_N_path,
#   data_testset_N_ref,  data_testset_N_direction
# Enumeration stops at the first N whose path or direction is empty.
#
# Globals read:    data_dir, data_testset_N_* (via indirect expansion)
# Globals written: testsets     - "direction/name" entries joined by ':'
#                  testset_list - the same entries as an array
#######################################
collect_testsets() {
  local idx=1 prefix
  local name_var path_var ref_var direc_var
  local dataname direction

  testsets=""
  while :; do
    prefix="data_testset_${idx}"
    path_var="${prefix}_path"
    # Earlier revisions of this loop looked up testset_N_* (without the
    # "data_" prefix) for N >= 2, which made every test set after the first
    # invisible. The prefixed name is now used for all N; the unprefixed one
    # is still accepted for N >= 2 so configs that relied on it keep working.
    if (( idx > 1 )) && [[ -z "${!path_var}" ]]; then
      prefix="testset_${idx}"
      path_var="${prefix}_path"
    fi
    name_var="${prefix}_name"
    ref_var="${prefix}_ref"
    direc_var="${prefix}_direction"

    [[ -n "${!path_var}" && -n "${!direc_var}" ]] || break

    dataname=${!name_var}
    direction=${!direc_var}
    mkdir -p "${data_dir}/${direction}/${dataname}" \
      "${data_dir}/ref/${direction}/${dataname}"
    cp "${!path_var}"/* "${data_dir}/${direction}/${dataname}/"
    # Without this guard an unset ref variable would expand to "/*" and copy
    # files from the filesystem root.
    if [[ -n "${!ref_var}" ]]; then
      cp "${!ref_var}"/* "${data_dir}/ref/${direction}/${dataname}/"
    fi

    testsets+="${testsets:+:}${direction}/${dataname}"
    idx=$((idx + 1))
  done

  IFS=':' read -r -a testset_list <<< "${testsets}"
}

collect_testsets

#######################################
# Score one generation result file and print the third whitespace-separated
# field of sacrebleu's --short output.
#
# Steps: scripts/utils.py is run on the result file (the code below expects
# it to leave hypo.out.nobpe and ref.out next to that file), the hypothesis
# is post-processed into hypo.out.nobpe.final, and that file is fed to
# sacrebleu against ref.out.
#
# Globals read: repo_dir; tokenizer (optional default tokenizer, "none" when
#               unset or empty). No globals are written.
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - generation result file
#   $4 - reference file handed to scripts/utils.py
# Outputs: score on stdout, diagnostics on stderr.
# Returns: 1 when the result file is missing or a processing step fails.
#######################################
bleu () {
  local src=$1
  local tgt=$2
  local res_file=$3
  local ref_file=$4

  if [[ ! -f "${res_file}" ]]; then
    echo "${res_file} not exist!" >&2
    return 1
  fi

  local f_dirname
  f_dirname=$(dirname "${res_file}")
  python3 "${repo_dir}/scripts/utils.py" "${res_file}" "${ref_file}" || return 1

  local input_file="${f_dirname}/hypo.out.nobpe"
  local output_file="${f_dirname}/hypo.out.nobpe.final"

  local tgt_upper
  tgt_upper=$(printf '%s' "${tgt}" | tr '[a-z]' '[A-Z]')
  local lang_token="LANG_TOK_${tgt_upper}"

  # Kept in a local so a zh/ja call cannot change the tokenizer used by a
  # later call for another language.
  local tok=${tokenizer:-none}
  case "${tgt}" in
    zh) tok="zh" ;;
    ja) tok="ja-mecab" ;;
  esac

  # French output gets "..." rewritten as « ... »; every language then has
  # the language token removed.
  # NOTE(review): the token used to be matched only when followed by exactly
  # two spaces (a stray trailing space in the token plus one in the pattern);
  # it is now matched with one or more trailing spaces. Confirm against the
  # actual output of scripts/utils.py.
  if [[ "${tgt}" == "fr" ]]; then
    sed -Ee 's/"([^"]*)"/« \1 »/g' "${input_file}"
  else
    cat "${input_file}"
  fi | sed -e "s|${lang_token}  *||g" > "${output_file}" || {
    # stderr, so the message is never captured as a score by the caller.
    echo "post-processing ${input_file} FAILED !" >&2
    return 1
  }

  sacrebleu -l "${src}-${tgt}" -tok "${tok}" --short "${f_dirname}/ref.out" \
    < "${output_file}" | awk '{print $3}'
}

# Background BLEU watcher.
# inotifywait reports every file below ${res_path} that is closed after being
# written; each report is read as "<dir>/ <events> <file>". For files whose
# name ends in "txt", the last three components of <dir> are taken as
# <checkpoint>/<direction>/<testname>, and <direction> is split around "2"
# into source and target language. The score returned by bleu is printed and
# appended to ${model_dir}/summary.log as one tab-separated line:
#   <timestamp> <checkpoint> <direction>/<testname> <score>
# NOTE(review): nothing stops this job; it keeps running after the rest of the
# script has finished.
(inotifywait -r -m -e close_write "${res_path}" |
  while read -r path action file; do
    [[ "$file" =~ .*txt$ ]] || continue

    # ${path} ends in "/": drop it, then peel directory names right to left.
    tmp_str="${path%/*}"
    testname="${tmp_str##*/}"
    tmp_str="${tmp_str%/*}"
    direction="${tmp_str##*/}"
    tmp_str="${tmp_str%/*}"
    ckptname="${tmp_str##*/}"

    src_lang="${direction%2*}"
    tgt_lang="${direction##*2}"
    res_file="${path}${file}"
    ref_file="${data_dir}/ref/${direction}/${testname}/dev.${tgt_lang}"

    # Command substitution runs bleu in a subshell, so a failure inside it
    # cannot terminate this loop; the line is still logged with an empty score.
    bleuscore=$(bleu "${src_lang}" "${tgt_lang}" "${res_file}" "${ref_file}")

    printf -v bleu_str '%s\t%s\t%s\t%s' \
      "$(date "+%Y-%m-%d %H:%M:%S")" \
      "${ckptname}" \
      "${direction}/${testname}" \
      "${bleuscore}"
    printf '%s\n' "${bleu_str}"
    printf '%s\n' "${bleu_str}" >> "${model_dir}/summary.log"
  done) &

# Choose the checkpoints to evaluate according to ${choice}:
#   all   every *.pt file in ${model_dir}, most recently modified first
#   best  ${model_dir}/checkpoint_best.pt
#   last  ${model_dir}/checkpoint_last.pt
# Any other value aborts with exit status 2.
# filelist is a whitespace-separated string (it is word-split where it is
# consumed), so checkpoint paths must not contain whitespace.
case "${choice}" in
  all)
    # Glob instead of parsing "ls -la": symlinks are listed under their own
    # name and only names that really end in ".pt" are selected.
    ckpt_candidates=()
    for ckpt_candidate in "${model_dir}"/*.pt; do
      # Also skips the unexpanded pattern when nothing matches.
      [[ -f "${ckpt_candidate}" ]] && ckpt_candidates+=("${ckpt_candidate}")
    done
    filelist=""
    if (( ${#ckpt_candidates[@]} > 0 )); then
      # ls is used only to order the already-selected paths by mtime.
      filelist=$(ls -t -- "${ckpt_candidates[@]}" | tr '\n' ' ')
    fi
    ;;
  best)
    filelist="${model_dir}/checkpoint_best.pt"
    ;;
  last)
    filelist="${model_dir}/checkpoint_last.pt"
    ;;
  *)
    echo "invalid choice!" >&2
    exit 2
    ;;
esac

# N: how many inference jobs run at the same time (one per GPU index).
# It is used as a modulus later, so anything but a positive integer would
# abort the arithmetic; fall back to a single job in that case.
N=${NUM_GPU:-1}
if ! [[ "${N}" =~ ^[1-9][0-9]*$ ]]; then
  echo "NUM_GPU='${NUM_GPU}' is not a positive integer; using 1" >&2
  N=1
fi

#######################################
# Print (do not run) the shell command that translates one test set with one
# checkpoint on one GPU. The caller is expected to eval the printed line.
#
# Every argument-derived word is emitted through printf %q, so paths that
# contain spaces or shell metacharacters survive the caller's eval.
#
# Globals read: repo_dir; options (extra fairseq-generate flags, inserted
#               verbatim so they split into separate words);
#               max_source_positions / max_target_positions (default 1024).
# Arguments:
#   $1 - test set directory passed to fairseq-generate
#   $2 - checkpoint path(s) for --path
#   $3 - value for CUDA_VISIBLE_DEVICES
#   $4 - file receiving the filtered generation output
#   $5 - source language code
#   $6 - target language code
# Outputs: the command line on stdout.
#######################################
infer_test () {
  local test_path=$1
  local ckpts=$2
  local gpu=$3
  local final_res_file=$4
  local src=$5
  local tgt=$6

  local tgt_upper
  tgt_upper=$(printf '%s' "${tgt}" | tr '[a-z]' '[A-Z]')
  local lang_token="LANG_TOK_${tgt_upper}"

  local fmt='CUDA_VISIBLE_DEVICES=%q fairseq-generate %q'
  fmt+=' --user-dir %q'
  fmt+=' -s %q'
  fmt+=' -t %q'
  fmt+=' --skip-invalid-size-inputs-valid-test'
  fmt+=' --path %q'
  fmt+=' --max-tokens 1024'
  fmt+=' --task translation_w_langtok'
  fmt+=' %s'
  fmt+=' --lang-prefix-tok %q'
  fmt+=' --max-source-positions %q'
  fmt+=' --max-target-positions %q'
  # Keep only lines containing an S-/H-/P-/T-<number> tag. The class used to
  # be written [S|H|P|T], which also accepted a literal "|".
  fmt+=" --nbest 1 | grep -E '[SHPT]-[0-9]+' > %q"
  fmt+='\n'

  # shellcheck disable=SC2059  # fmt is assembled from constants just above
  printf "${fmt}" \
    "${gpu}" "${test_path}" \
    "${repo_dir}/mcolt" \
    "${src}" \
    "${tgt}" \
    "${ckpts}" \
    "${options}" \
    "${lang_token}" \
    "${max_source_positions:-1024}" \
    "${max_target_positions:-1024}" \
    "${final_res_file}"
}

export -f infer_test

#######################################
# Launch one background inference job per (checkpoint, test set) pair, at most
# ${N} at a time, and return only after all of them have finished.
#
# Job k is given GPU index k mod N. Whenever the index wraps to 0, the
# previous batch is awaited before more jobs are started.
#
# Must run inside a subshell: the bare "wait" calls wait for every background
# child of the current shell, and the main shell may own other long-running
# background jobs that never exit.
#
# Globals read: filelist (whitespace-separated checkpoint paths),
#               testset_list (array of "direction/name"), N, data_dir,
#               model_dir, res_path
# Calls:        infer_test (prints the command to run)
# Outputs:      progress banner on stderr, each command on stdout
#######################################
run_inference_jobs() {
  # Guard the modulus: an unset, zero or non-numeric N falls back to 1.
  local num_gpu=${N:-1}
  (( num_gpu > 0 )) 2>/dev/null || num_gpu=1

  local job=0 gpu
  local ckpt testset ckptbase ckptname
  local direction testname src_lang tgt_lang
  local test_path out_dir final_res_file infer_cmd

  # shellcheck disable=SC2086  # filelist is a whitespace-separated list
  for ckpt in ${filelist}; do
    for testset in "${testset_list[@]}"; do
      ckptbase=$(basename "${ckpt}")
      ckptname="${ckptbase%.*}"
      direction="${testset%/*}"
      testname="${testset##*/}"
      src_lang="${direction%2*}"
      tgt_lang="${direction##*2}"

      gpu=$(( job % num_gpu ))
      if (( gpu == 0 )); then
        wait
      fi
      job=$(( job + 1 ))

      test_path="${data_dir}/${testset}"
      echo "-----> ${ckptname} | ${direction}/${testname} <-----" >&2

      out_dir="${res_path}/${ckptname}/${direction}/${testname}"
      mkdir -p "${out_dir}"
      final_res_file="${out_dir}/orig.txt"

      infer_cmd=$(infer_test "${test_path}" "${model_dir}/${ckptname}.pt" \
        "${gpu}" "${final_res_file}" "${src_lang}" "${tgt_lang}")
      echo "${infer_cmd}"
      eval "${infer_cmd}" &
    done
  done

  # Without this the script used to return while the last batch was still
  # running in the background.
  wait
}

(run_inference_jobs)
