File size: 5,361 Bytes
95a3ca6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env bash
#
# Decode the configured test sets with one or more checkpoints.
#
# Positional arguments:
#   $1 - config file handed to scripts/load_config.sh
#   $2 - read into model_dir, but see the NOTE(review) below
#   $3 - which checkpoints to evaluate: all|best|last

# repo_dir: root directory of the project
repo_dir="$( cd "$( dirname "$0" )" && pwd )"
echo "==== Working directory: ====" >&2
echo "${repo_dir}" >&2
echo "============================" >&2


test_config=$1
# Presumably defines the data_testset_<N>_* variables consumed below -- TODO confirm.
source "${repo_dir}/scripts/load_config.sh" "${test_config}" "${repo_dir}"
# NOTE(review): this value is replaced by the fixed path a few lines down,
# so $2 currently has no effect.
model_dir=$2
choice=$3  # all|best|last

model_dir=${repo_dir}/model
data_dir=${repo_dir}/data
res_path=${model_dir}/results

mkdir -p "${model_dir}" "${data_dir}" "${res_path}"

#######################################
# Copy every configured test set under ${data_dir} and record it.
# Test set N is described by the variables
#   data_testset_N_name / _path / _ref / _direction
# Scanning starts at N=1 and stops at the first N whose path or
# direction is empty.
# Globals:
#   data_dir, data_testset_*  (read)
#   testsets      (written) colon-separated "<direction>/<name>" entries
#   testset_list  (written) the same entries as an array
#######################################
collect_testsets () {
    local idx=1
    local name_var path_var ref_var direc_var
    local dataname direction

    testsets=""
    while true; do
        # Every index uses the same data_testset_ prefix; previously only
        # index 1 did, so later test sets were silently skipped.
        name_var="data_testset_${idx}_name"
        path_var="data_testset_${idx}_path"
        ref_var="data_testset_${idx}_ref"
        direc_var="data_testset_${idx}_direction"
        [[ -n ${!path_var} && -n ${!direc_var} ]] || break

        dataname=${!name_var}
        direction=${!direc_var}
        mkdir -p "${data_dir}/${direction}/${dataname}" "${data_dir}/ref/${direction}/${dataname}"
        cp "${!path_var}"/* "${data_dir}/${direction}/${dataname}/"
        # An empty ref would expand to "cp /* ..."; skip the copy instead.
        if [[ -n ${!ref_var} ]]; then
            cp "${!ref_var}"/* "${data_dir}/ref/${direction}/${dataname}/"
        fi
        testsets=${testsets:+${testsets}:}${direction}/${dataname}
        idx=$((idx + 1))
    done

    IFS=':' read -r -a testset_list <<< "${testsets}"
}

collect_testsets


#######################################
# Post-process one result file and print its sacrebleu score.
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - result file; must exist
#   $4 - reference file, forwarded to scripts/utils.py
# Globals:
#   repo_dir (read)
# Outputs:
#   stdout: third whitespace-separated field of `sacrebleu --short`
#   stderr: diagnostics
# Exits the current shell with status 1 on failure, so call it from a
# subshell or command substitution.
#######################################
bleu () {
    local src=$1
    local tgt=$2
    local res_file=$3
    local ref_file=$4
    local f_dirname input_file output_file lang_token
    # Local and reset on every call, so a previous zh/ja call cannot leak
    # its tokenizer into the next one.
    local tokenizer="none"

    if [[ ! -f ${res_file} ]]; then
        echo "${res_file} not exist!" >&2
        exit 1
    fi

    f_dirname=$(dirname "${res_file}")
    # hypo.out.nobpe and ref.out are read below but never created here;
    # presumably utils.py writes them next to the result file -- TODO confirm.
    python3 "${repo_dir}/scripts/utils.py" "${res_file}" "${ref_file}" || exit 1
    input_file="${f_dirname}/hypo.out.nobpe"
    output_file="${f_dirname}/hypo.out.nobpe.final"
    lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr '[:lower:]' '[:upper:]')"

    case ${tgt} in
        zh) tokenizer="zh" ;;
        ja) tokenizer="ja-mecab" ;;
    esac

    # French output gets guillemets in place of straight double quotes; every
    # language then has "<LANG_TOK> " (token plus one space) removed.
    {
        if [[ ${tgt} == "fr" ]]; then
            sed -Ee 's/"([^"]*)"/« \1 »/g' "${input_file}"
        else
            cat "${input_file}"
        fi
    } | sed -e "s|${lang_token} ||g" > "${output_file}" \
        || { echo "post-processing ${input_file} FAILED !" >&2; exit 1; }

    sacrebleu -l "${src}-${tgt}" -tok "${tokenizer}" --short "${f_dirname}/ref.out" < "${output_file}" \
        | awk '{print $3}'
}

# Background watcher: whenever a file whose name ends in "txt" is closed after
# writing anywhere under ${res_path}, score it with bleu and log one line.
# Expected layout: ${res_path}/${ckptname}/${direction}/${testname}/orig.txt
(
    inotifywait -r -m -e close_write ${res_path} |
    while read watch_dir events fname; do
        # Anything that is not a *txt file is ignored.
        [[ "$fname" =~ .*txt$ ]] || continue

        # Peel path components from the right: test name, direction, checkpoint.
        rest="${watch_dir%/*}"
        testname="${rest##*/}"
        rest="${rest%/*}"
        direction="${rest##*/}"
        rest="${rest%/*}"
        ckptname="${rest##*/}"

        # A direction looks like <src>2<tgt>; split it around the "2".
        src_lang="${direction%2*}"
        tgt_lang="${direction##*2}"

        hyp_file=$watch_dir$fname
        gold_file=${data_dir}/ref/${direction}/${testname}/dev.${tgt_lang}
        score=$(bleu ${src_lang} ${tgt_lang} ${hyp_file} ${gold_file})

        # The \t sequences stay literal here; echo -e turns them into tabs.
        log_line="$(date "+%Y-%m-%d %H:%M:%S")\t${ckptname}\t${direction}/${testname}\t$score"
        echo -e ${log_line}  # to stdout
        echo -e ${log_line} >> ${model_dir}/summary.log
    done
) &


# Resolve ${choice} (all|best|last) into filelist, a space-separated list of
# checkpoint files to evaluate.
case ${choice} in
    all)
        # Every entry of ${model_dir} whose name ends in ".pt" (dot escaped so
        # that e.g. "script" no longer matches). Order is the reverse sort of
        # `ls -la` columns 6-7, as before.
        filelist=$(ls -la "${model_dir}" | sort -k6,7 -r | awk '{print $NF}' | grep '\.pt$' | tr '\n' ' ')
        ;;
    best)
        filelist="${model_dir}/checkpoint_best.pt"
        ;;
    last)
        filelist="${model_dir}/checkpoint_last.pt"
        ;;
    *)
        echo "invalid choice!" >&2
        exit 2
        ;;
esac

# N is the modulus used when jobs are spread over GPUs further down. An
# empty or non-positive value would turn that modulo into an arithmetic error,
# so fall back to a single GPU.
N=${NUM_GPU:-1}
if ! [[ ${N} =~ ^[1-9][0-9]*$ ]]; then
    echo "NUM_GPU='${N}' is not a positive integer; using 1" >&2
    N=1
fi
#export CUDA_VISIBLE_DEVICES=$(seq -s ',' 0 $(($N - 1)) )


#######################################
# Print (without running it) the shell command that decodes one test set with
# one checkpoint and keeps only the S-/H-/P-/T- lines of the output.
# Arguments:
#   $1 - test data directory given to fairseq-generate
#   $2 - checkpoint path (--path)
#   $3 - value for CUDA_VISIBLE_DEVICES
#   $4 - file the filtered output is redirected to
#   $5 - source language code
#   $6 - target language code
# Globals (read only):
#   repo_dir, options,
#   max_source_positions, max_target_positions (1024 each when empty/unset)
# Outputs:
#   The command line on stdout; the caller is expected to eval it.
#######################################
infer_test () {
    # Everything is local so that calling this function directly cannot
    # overwrite the caller's src/tgt/command/final_res_file variables.
    local test_path=$1
    local ckpts=$2
    local gpu=$3
    local final_res_file=$4
    local src=$5
    local tgt=$6
    local max_src=${max_source_positions:-1024}
    local max_tgt=${max_target_positions:-1024}
    local lang_token cmd

    lang_token="LANG_TOK_$(printf '%s' "${tgt}" | tr '[:lower:]' '[:upper:]')"

    cmd="CUDA_VISIBLE_DEVICES=${gpu} fairseq-generate ${test_path}"
    cmd+=" --user-dir ${repo_dir}/mcolt"
    cmd+=" -s ${src}"
    cmd+=" -t ${tgt}"
    cmd+=" --skip-invalid-size-inputs-valid-test"
    cmd+=" --path ${ckpts}"
    cmd+=" --max-tokens 1024"
    cmd+=" --task translation_w_langtok"
    # Extra generation flags are optional.
    if [[ -n ${options} ]]; then
        cmd+=" ${options}"
    fi
    cmd+=" --lang-prefix-tok ${lang_token}"
    cmd+=" --max-source-positions ${max_src}"
    cmd+=" --max-target-positions ${max_tgt}"
    cmd+=" --nbest 1 | grep -E '[S|H|P|T]-[0-9]+' > ${final_res_file}"

    printf '%s\n' "${cmd}"
}

export -f infer_test
i=0
# Launch one decoding job per (checkpoint, test set) pair, in batches of N jobs.
# The loop runs in a subshell so that the bare `wait` calls only block on jobs
# started here, not on background jobs of the parent shell.
(
    # filelist is a space-separated list, so it is split on purpose.
    for ckpt in ${filelist}; do
        for testset in "${testset_list[@]}"; do
            ckptbase=$(basename "${ckpt}")
            ckptname="${ckptbase%.*}"
            # testset is "<direction>/<name>"; direction is "<src>2<tgt>".
            direction="${testset%/*}"
            testname="${testset##*/}"
            src_lang="${direction%2*}"
            tgt_lang="${direction##*2}"

            # i cycles through 1..N and the job runs on GPU i-1. Whenever the
            # counter wraps to 0, wait for the previous batch to finish first.
            ((i=i%N)); ((i++==0)) && wait
            test_path=${data_dir}/${testset}

            echo "-----> "${ckptname}" | "${direction}/$testname" <-----" >&2
            mkdir -p "${res_path}/${ckptname}/${direction}/${testname}"
            final_res_file="${res_path}/${ckptname}/${direction}/${testname}/orig.txt"
            command=$(infer_test ${test_path} ${model_dir}/${ckptname}.pt $((i-1)) ${final_res_file} ${src_lang} ${tgt_lang})
            echo "${command}"
            # Quoted so the grep pattern inside the command is not glob-expanded.
            eval "$command" &
        done
    done
    # The last batch was previously left running when the subshell exited;
    # wait for it like every earlier batch.
    wait
)