Spaces:

chinmaydan
/

S2SCascadeDemo

Runtime error

File size: 5,361 Bytes

95a3ca6

#!/usr/bin/env bash

# repo_dir: root directory of the project (the directory that holds this script)
repo_dir="$( cd "$( dirname "$0" )" && pwd )"
echo "==== Working directory: ====" >&2
echo "${repo_dir}" >&2
echo "============================" >&2


# Positional arguments:
#   $1  test config; handed to scripts/load_config.sh together with repo_dir
#   $2  model directory (see the review note below)
#   $3  which checkpoints to evaluate: all|best|last
test_config=$1
# Paths are quoted so a checkout location containing whitespace still works.
source "${repo_dir}/scripts/load_config.sh" "${test_config}" "${repo_dir}"
model_dir=$2
choice=$3  # all|best|last

# NOTE(review): model_dir from $2 is overwritten right here, so the second
# argument currently has no effect — confirm whether that is intentional.
model_dir="${repo_dir}/model"
data_dir="${repo_dir}/data"
res_path="${model_dir}/results"

mkdir -p -- "${model_dir}" "${data_dir}" "${res_path}"

# Stage every configured test set under ${data_dir} and remember it as
# "<direction>/<name>" in the colon-separated string ${testsets}.
#
# Test set N is described by four variables (presumably defined by the sourced
# config — verify against scripts/load_config.sh):
#   data_testset_N_name, data_testset_N_path,
#   data_testset_N_ref,  data_testset_N_direction
# Enumeration starts at N=1 and stops at the first N whose path or direction
# is empty. The same "data_" prefix is used for every N; previously only N=1
# carried the prefix, so test sets 2..N were never found.
i=1
testsets=""
while :; do
    testset_name="data_testset_${i}_name"
    testset_path="data_testset_${i}_path"
    testset_ref="data_testset_${i}_ref"
    testset_direc="data_testset_${i}_direction"
    # ${!var} is indirect expansion: the value of the variable named by $var.
    [[ -n "${!testset_path}" && -n "${!testset_direc}" ]] || break

    dataname="${!testset_name}"
    testset_key="${!testset_direc}/${dataname}"
    mkdir -p -- "${data_dir}/${testset_key}" "${data_dir}/ref/${testset_key}"
    cp -- "${!testset_path}"/* "${data_dir}/${testset_key}/"
    cp -- "${!testset_ref}"/* "${data_dir}/ref/${testset_key}/"
    # Append with a ':' separator, except for the very first entry.
    testsets="${testsets:+${testsets}:}${testset_key}"
    i=$((i+1))
done

# Split the colon-separated list into an array; the here-string is quoted so
# that the temporary IFS cannot affect how ${testsets} itself is expanded.
IFS=':' read -r -a testset_list <<< "${testsets}"


#######################################
# Post-process one result file and print its BLEU score.
# Globals:
#   repo_dir   (read) project root; ${repo_dir}/scripts/utils.py is executed
#   tokenizer  (read) optional sacrebleu tokenizer override; targets zh / ja
#              always use "zh" / "ja-mecab", an empty value becomes "none"
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - result file to score
#   $4 - reference file (passed on to utils.py)
# Outputs:
#   stdout: third whitespace-separated field of `sacrebleu --short`
#   stderr: diagnostics
# Exit status:
#   Calls `exit 1` (not `return`) when the result file is missing, utils.py
#   fails, or post-processing fails, so call it from a subshell or a command
#   substitution unless terminating the caller is intended.
#######################################
bleu () {
    local src=$1
    local tgt=$2
    local res_file=$3
    local ref_file=$4
    # Work on a local copy so a tokenizer picked for one target language does
    # not leak into later calls made from the same shell.
    local tokenizer="${tokenizer:-}"
    local f_dirname input_file output_file lang_token

    if [[ ! -f "${res_file}" ]]; then
        echo "${res_file} not exist!" >&2
        exit 1
    fi

    f_dirname=$(dirname "${res_file}")
    python3 "${repo_dir}/scripts/utils.py" "${res_file}" "${ref_file}" || exit 1
    # hypo.out.nobpe and ref.out are read from the result file's directory;
    # presumably utils.py writes them there — verify against utils.py.
    input_file="${f_dirname}/hypo.out.nobpe"
    output_file="${f_dirname}/hypo.out.nobpe.final"
    lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"

    case "${tgt}" in
        zh) tokenizer="zh" ;;
        ja) tokenizer="ja-mecab" ;;
    esac
    [[ -n "${tokenizer}" ]] || tokenizer="none"

    # French output gets "..." rewritten to « ... » first. Then the language
    # token and the blank(s) after it are dropped. One-or-more blanks are
    # accepted: the previous pattern required exactly two, so a token followed
    # by a single space was never removed.
    if [[ "${tgt}" == "fr" ]]; then
        sed -Ee 's/"([^"]*)"/« \1 »/g' "${input_file}"
    else
        cat "${input_file}"
    fi | sed -e "s|${lang_token}  *||g" > "${output_file}" \
        || { echo "post-processing ${input_file} FAILED !" >&2; exit 1; }

    sacrebleu -l "${src}-${tgt}" -tok "${tokenizer}" --short "${f_dirname}/ref.out" \
        < "${output_file}" | awk '{print $3}'
}

# monitor
# Background watcher: recursively follows ${res_path} and, each time a file
# whose name ends in "txt" is closed after writing, scores it with bleu.
# Result files are expected at
#   ${ckptname}/${direction}/${testname}/orig.txt
# so the three path components above the file give the checkpoint, the
# direction ("<src>2<tgt>") and the test set name.
(inotifywait -r -m -e close_write "${res_path}" |
# Each event line is "<watched dir>/ <events> <file name>"; -r keeps any
# backslashes in the path intact. `action` is only read to consume the field.
while read -r path action file; do
    if [[ "$file" =~ .*txt$ ]]; then
        tmp_str="${path%/*}"            # drop the trailing slash
        testname="${tmp_str##*/}"
        tmp_str="${tmp_str%/*}"
        direction="${tmp_str##*/}"
        tmp_str="${tmp_str%/*}"
        ckptname="${tmp_str##*/}"
        src_lang="${direction%2*}"      # text before the last "2"
        tgt_lang="${direction##*2}"     # text after the last "2"
        res_file="${path}${file}"
        ref_file="${data_dir}/ref/${direction}/${testname}/dev.${tgt_lang}"
        bleuscore=$(bleu "${src_lang}" "${tgt_lang}" "${res_file}" "${ref_file}")
        # The literal \t sequences are turned into tabs by `echo -e` below.
        bleu_str="$(date "+%Y-%m-%d %H:%M:%S")\t${ckptname}\t${direction}/${testname}\t$bleuscore"
        echo -e "${bleu_str}"  # to stdout
        echo -e "${bleu_str}" >> "${model_dir}/summary.log"
    fi
done) &


# Choose the checkpoints to evaluate; ${filelist} is a space-separated string.
case "${choice}" in
    all)
        # Every *.pt entry of ${model_dir}, newest first. `ls -t` orders by
        # modification time (the old `sort -k6,7` compared the month/day text
        # of `ls -l`), and the dot is escaped so only a real ".pt" suffix
        # matches.
        filelist=$(ls -tA -- "${model_dir}" | grep '\.pt$' | tr '\n' ' ')
        ;;
    best)
        filelist="${model_dir}/checkpoint_best.pt"
        ;;
    last)
        filelist="${model_dir}/checkpoint_last.pt"
        ;;
    *)
        echo "invalid choice!" >&2
        exit 2
        ;;
esac

# Number of GPUs to spread inference jobs over; falls back to 1 when NUM_GPU
# is unset or empty so later modulo arithmetic on N cannot divide by zero.
N=${NUM_GPU:-1}
#export CUDA_VISIBLE_DEVICES=$(seq -s ',' 0 $(($N - 1)) )


#######################################
# Print (without running) the shell command that decodes one test set with
# fairseq-generate and keeps only the S-/H-/P-/T- lines of its output.
# Globals:
#   repo_dir              (read) project root; ${repo_dir}/mcolt is --user-dir
#   options               (read) extra fairseq-generate options, inserted
#                         verbatim
#   max_source_positions  (read) defaults to 1024 when unset or empty
#   max_target_positions  (read) defaults to 1024 when unset or empty
# Arguments:
#   $1 - test data directory
#   $2 - checkpoint path(s) for --path
#   $3 - GPU id, used as CUDA_VISIBLE_DEVICES
#   $4 - file that receives the filtered output
#   $5 - source language code
#   $6 - target language code
# Outputs:
#   stdout: the command line, intended to be passed to `eval` by the caller
#######################################
infer_test () {
    local test_path=$1
    local ckpts=$2
    local gpu=$3
    local final_res_file=$4
    local src=$5
    local tgt=$6
    local gpu_cmd="CUDA_VISIBLE_DEVICES=${gpu} "
    local lang_token
    lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr 'a-z' 'A-Z')"
    # Defaults are applied locally so the caller's globals are not modified.
    local max_src="${max_source_positions:-1024}"
    local max_tgt="${max_target_positions:-1024}"
    # [SHPT] is a plain character class; the former [S|H|P|T] also matched a
    # literal '|' in front of the dash.
    local command="${gpu_cmd}fairseq-generate ${test_path} \
    --user-dir ${repo_dir}/mcolt \
    -s ${src} \
    -t ${tgt} \
    --skip-invalid-size-inputs-valid-test \
    --path ${ckpts} \
    --max-tokens 1024 \
    --task translation_w_langtok \
    ${options} \
    --lang-prefix-tok ${lang_token} \
    --max-source-positions ${max_src} \
    --max-target-positions ${max_tgt} \
    --nbest 1 | grep -E '[SHPT]-[0-9]+' > ${final_res_file}
    "
    printf '%s\n' "${command}"
}

export -f infer_test
# Run inference for every (checkpoint, test set) pair, at most N jobs at a
# time, one job per GPU id 0..N-1. Everything runs in a subshell so that
# `wait` only waits for the inference jobs started here.
i=0
(
# Guard the modulo below: an unset, empty or non-positive N counts as 1.
gpus=$(( N > 0 ? N : 1 ))
# ${filelist} is a space-separated string, hence deliberately unquoted.
for ckpt in ${filelist}; do
    ckptbase=$(basename "${ckpt}")
    ckptname="${ckptbase%.*}"
    for testset in "${testset_list[@]}"; do
        direction="${testset%/*}"
        testname="${testset##*/}"
        src_lang="${direction%2*}"
        tgt_lang="${direction##*2}"

        # i cycles through 1..gpus; whenever it wraps around, the previous
        # batch is awaited before the next job is started on GPU 0.
        ((i = i % gpus)); ((i++ == 0)) && wait
        test_path="${data_dir}/${testset}"

        echo "-----> ${ckptname} | ${direction}/${testname} <-----" >&2
        out_dir="${res_path}/${ckptname}/${direction}/${testname}"
        mkdir -p "${out_dir}"
        final_res_file="${out_dir}/orig.txt"
        command=$(infer_test "${test_path}" "${model_dir}/${ckptname}.pt" "$((i-1))" "${final_res_file}" "${src_lang}" "${tgt_lang}")
        echo "${command}"
        # infer_test only prints the command line; it is executed here.
        eval "${command}" &
    done
done
# Wait for the last batch too, so the script does not return while
# inference jobs are still running.
wait
)