|
#!/bin/bash |
|
|
|
# Installation roots of the external toolkits driven by this script.
# Each one can be overridden from the environment; when unset (or empty) it
# falls back to the layout under /opt/moses that was previously hard-coded.
MOSES_HOME="${MOSES_HOME:-/opt/moses}"

GIZA_HOME="${GIZA_HOME:-${MOSES_HOME}/giza++-v1.0.7}"

IRSTLM="${IRSTLM:-${MOSES_HOME}/irstlm-5.70.04}"
|
|
|
#######################################
# Tokenise a corpus file with the Moses tokenizer.
# The output is written to <working dir>/<name>.tok.<ext>, where <name>.<ext>
# is the base name of the input; an input without an extension becomes
# <name>.tok so the output can never share the input's name.
# Globals:
#   MOSES_HOME          (read)    root of the Moses installation
#   TOKENISED_FILENAME  (written) path of the tokenised output file
# Arguments:
#   $1 - language code passed to tokenizer.perl via -l
#   $2 - input file
#   $3 - output directory (created when missing)
# Returns:
#   exit status of tokenizer.perl; non-zero if the directory cannot be created
#######################################
function tokenise() {
  # Deliberately not called LANG: LANG is the locale variable, and assigning
  # to it here would change the locale seen by this function and, when LANG
  # is exported, by the tokenizer process as well.
  local lang="$1"
  local filename="$2"
  local working_dir="$3"
  local base
  local tokenised_basename

  base="$(basename -- "${filename}")"

  # mkdir -p is a no-op for an existing directory, so no existence test needed.
  mkdir -p -- "${working_dir}" || return

  if [[ "${base}" == *.* ]]; then
    # Insert the stage tag in front of the last extension: a.b -> a.tok.b
    tokenised_basename="${base%.*}.tok.${base##*.}"
  else
    tokenised_basename="${base}.tok"
  fi

  TOKENISED_FILENAME="${working_dir}/${tokenised_basename}"

  "${MOSES_HOME}/scripts/tokenizer/tokenizer.perl" -q -l "${lang}" \
    < "${filename}" > "${TOKENISED_FILENAME}"
}
|
|
|
# Print path $1 with ".<$2>" inserted before the last extension of its file
# name (dir/a.b -> dir/a.<tag>.b). Only the final path component is examined,
# so dots in directory names are left alone; a file name without an extension
# gets ".<tag>" appended so the result always differs from the input.
function _cleanup_tagged_name() {
  local path="$1"
  local tag="$2"
  local dir=""
  local base="${path}"

  if [[ "${path}" == */* ]]; then
    dir="${path%/*}/"
    base="${path##*/}"
  fi

  if [[ "${base}" == *.* ]]; then
    printf '%s%s.%s.%s' "${dir}" "${base%.*}" "${tag}" "${base##*.}"
  else
    printf '%s%s.%s' "${dir}" "${base}" "${tag}"
  fi
}

#######################################
# Filter a parallel corpus, keeping only the sentence pairs in which BOTH
# sides have strictly fewer than the given number of whitespace-separated
# words. Kept lines are written unchanged to "<name>.clean.<ext>" files next
# to the inputs; the outputs are recreated from scratch on every call.
# Globals:
#   SRC_CLEANUP_FILENAME (written) path of the filtered source-side file
#   TGT_CLEANUP_FILENAME (written) path of the filtered target-side file
# Arguments:
#   $1 - source-side file
#   $2 - target-side file (line-aligned with $1)
#   $3 - word limit; defaults to 20 when omitted
# Returns:
#   1 if an input file is not readable, otherwise the pipeline status
#######################################
function cleanup() {
  local src_filename="$1"
  local tgt_filename="$2"
  local -i segment_length="${3:-20}"
  local src_line
  local tgt_line
  local -a src_words
  local -a tgt_words

  SRC_CLEANUP_FILENAME="$(_cleanup_tagged_name "${src_filename}" clean)"
  TGT_CLEANUP_FILENAME="$(_cleanup_tagged_name "${tgt_filename}" clean)"

  if [[ ! -r "${src_filename}" || ! -r "${tgt_filename}" ]]; then
    echo "cleanup: cannot read ${src_filename} and/or ${tgt_filename}" >&2
    return 1
  fi

  # paste interleaves the two files line by line, so each loop iteration
  # consumes one source line followed by its target line. The outputs are
  # opened once (and truncated) on descriptors 3 and 4 for the whole loop.
  paste -d'\n' -- "${src_filename}" "${tgt_filename}" \
    | while IFS= read -r src_line && IFS= read -r tgt_line; do
      # Word counts via shell word splitting; avoids forking wc per line.
      read -r -a src_words <<< "${src_line}"
      read -r -a tgt_words <<< "${tgt_line}"
      if (( ${#src_words[@]} < segment_length \
            && ${#tgt_words[@]} < segment_length )); then
        printf '%s\n' "${src_line}" >&3
        printf '%s\n' "${tgt_line}" >&4
      fi
    done 3> "${SRC_CLEANUP_FILENAME}" 4> "${TGT_CLEANUP_FILENAME}"
}
|
|
|
# Print path $1 with ".<$2>" inserted before the last extension of its file
# name (dir/a.b -> dir/a.<tag>.b). Only the final path component is examined,
# so dots in directory names are left alone; a file name without an extension
# gets ".<tag>" appended so the result always differs from the input.
function _data_split_tagged_name() {
  local path="$1"
  local tag="$2"
  local dir=""
  local base="${path}"

  if [[ "${path}" == */* ]]; then
    dir="${path%/*}/"
    base="${path##*/}"
  fi

  if [[ "${base}" == *.* ]]; then
    printf '%s%s.%s.%s' "${dir}" "${base%.*}" "${tag}" "${base##*.}"
  else
    printf '%s%s.%s' "${dir}" "${base}" "${tag}"
  fi
}

#######################################
# Split a parallel corpus into development, evaluation and training parts.
# The first $3 sentence pairs go to the "devel" files, the next $4 pairs to
# the "eval" files and every remaining pair to the "train" files. All six
# output files are recreated from scratch on every call.
# Globals (all written):
#   SRC_TRAIN_FILENAME  TGT_TRAIN_FILENAME
#   SRC_DEVEL_FILENAME  TGT_DEVEL_FILENAME
#   SRC_EVAL_FILENAME   TGT_EVAL_FILENAME
# Arguments:
#   $1 - source-side file
#   $2 - target-side file (line-aligned with $1)
#   $3 - number of pairs for the development set
#   $4 - number of pairs for the evaluation set
# Returns:
#   1 if an input file is not readable, otherwise the pipeline status
#######################################
function data_split() {
  local src_filename="$1"
  local tgt_filename="$2"
  local -i dev_size="$3"
  local -i eval_size="$4"
  local -i dev_eval_size=$(( dev_size + eval_size ))
  local -i line_cnt=1
  local src_line
  local tgt_line

  SRC_TRAIN_FILENAME="$(_data_split_tagged_name "${src_filename}" train)"
  TGT_TRAIN_FILENAME="$(_data_split_tagged_name "${tgt_filename}" train)"
  SRC_DEVEL_FILENAME="$(_data_split_tagged_name "${src_filename}" devel)"
  TGT_DEVEL_FILENAME="$(_data_split_tagged_name "${tgt_filename}" devel)"
  SRC_EVAL_FILENAME="$(_data_split_tagged_name "${src_filename}" eval)"
  TGT_EVAL_FILENAME="$(_data_split_tagged_name "${tgt_filename}" eval)"

  if [[ ! -r "${src_filename}" || ! -r "${tgt_filename}" ]]; then
    echo "data_split: cannot read ${src_filename} and/or ${tgt_filename}" >&2
    return 1
  fi

  # paste interleaves the two files, so each iteration reads one source line
  # and then its target line. Every output is opened (and truncated) exactly
  # once for the whole loop:
  #   3/4 = train src/tgt, 5/6 = devel src/tgt, 7/8 = eval src/tgt
  paste -d'\n' -- "${src_filename}" "${tgt_filename}" \
    | while IFS= read -r src_line && IFS= read -r tgt_line; do
      if (( line_cnt <= dev_size )); then
        printf '%s\n' "${src_line}" >&5
        printf '%s\n' "${tgt_line}" >&6
      elif (( line_cnt <= dev_eval_size )); then
        printf '%s\n' "${src_line}" >&7
        printf '%s\n' "${tgt_line}" >&8
      else
        printf '%s\n' "${src_line}" >&3
        printf '%s\n' "${tgt_line}" >&4
      fi
      line_cnt=$(( line_cnt + 1 ))
    done \
      3> "${SRC_TRAIN_FILENAME}" 4> "${TGT_TRAIN_FILENAME}" \
      5> "${SRC_DEVEL_FILENAME}" 6> "${TGT_DEVEL_FILENAME}" \
      7> "${SRC_EVAL_FILENAME}" 8> "${TGT_EVAL_FILENAME}"
}
|
|
|
#######################################
# Train a translation model by running Moses' train-model.perl.
# Both corpus files must share the same path up to their last extension
# (<stem>.<ext>); that common stem is what is handed to -corpus. A throw-away
# one-line file is supplied as the language model for the -lm option.
# Any existing working directory is deleted and recreated first.
# Globals:
#   MOSES_HOME      (read)    root of the Moses installation
#   GIZA_HOME       (read)    passed as -external-bin-dir
#   MOSES_INI_FILE  (written) <working dir>/model/moses.ini
# Arguments:
#   $1 - source language (lower-cased, passed as -f)
#   $2 - target language (lower-cased, passed as -e)
#   $3 - source-side corpus file
#   $4 - target-side corpus file
#   $5 - alignment method   (passed as -alignment)
#   $6 - reordering method  (passed as -reordering)
#   $7 - working directory  (passed as -root-dir)
# Returns:
#   exit status of train-model.perl. Exits the script with status 1 when the
#   corpus files are unusable or the working directory cannot be prepared.
#######################################
function translation_model_train() {
  local -l src_lang="$1"
  local -l tgt_lang="$2"
  local src_filename
  local tgt_filename
  local alignment_method="$5"
  local reordering_method="$6"
  local working_dir="$7"

  src_filename="$(realpath -- "$3")" || exit 1
  tgt_filename="$(realpath -- "$4")" || exit 1

  # Without an extension there is no stem to derive; reject it instead of
  # silently handing an empty -corpus value to the trainer.
  if [[ "${src_filename##*/}" != *.* || "${tgt_filename##*/}" != *.* ]]; then
    echo "translation_model_train: corpus files need an extension:" \
      "${src_filename} ${tgt_filename}" >&2
    exit 1
  fi

  local -r src_corpora_name="${src_filename%.*}"
  local -r tgt_corpora_name="${tgt_filename%.*}"

  if [[ "${src_corpora_name}" != "${tgt_corpora_name}" ]]; then
    echo "translation_model_train: corpus files do not share a common stem:" \
      "${src_corpora_name} vs ${tgt_corpora_name}" >&2
    exit 1
  fi

  # Start from an empty working directory. ':?' aborts on an empty value so
  # the recursive delete can never run against an unintended path.
  rm -Rf -- "${working_dir:?}"
  mkdir -p -- "${working_dir}" || exit 1
  working_dir="$(realpath -- "${working_dir}")" || exit 1

  local -r dummy_file="${working_dir}/dummy.lm"
  echo "dummy lm file" > "${dummy_file}"

  local -r log_file="${working_dir}/log"

  # Published before the run so the caller sees the expected location even
  # when it has to inspect a failed training.
  MOSES_INI_FILE="${working_dir}/model/moses.ini"

  "${MOSES_HOME}/scripts/training/train-model.perl" \
    -root-dir "${working_dir}" \
    -corpus "${src_corpora_name}" \
    -f "${src_lang}" \
    -e "${tgt_lang}" \
    -alignment "${alignment_method}" \
    -reordering "${reordering_method}" \
    -lm "0:5:${dummy_file}:0" \
    -external-bin-dir "${GIZA_HOME}" \
    2> "${log_file}"
}
|
|
|
#######################################
# Build a language model with the IRSTLM tools in three steps:
#   1. add-start-end.sh  writes the input with sentence-boundary marks
#   2. build-lm.sh       estimates the model (output read back as <name>.gz)
#   3. compile-lm        converts it with "--text yes"
# Intermediate and final files live in the working directory and are named
# after the input's base name with its stage tag (the second-to-last
# dot-separated component, e.g. "tok" in corpus.de-en.tok.en) replaced by
# "sb", "lm" and "arpa" respectively. A base name with fewer than three
# components has no stage tag; the new tag is then inserted before the
# extension (a.en -> a.sb.en) or appended (a -> a.sb).
# Globals:
#   IRSTLM                (read, exported) root of the IRSTLM installation
#   COMPILED_LM_FILENAME  (written)        path of the compiled model
# Arguments:
#   $1 - training text
#   $2 - smoothing method (passed to build-lm.sh via -s)
#   $3 - working directory (created when missing)
# Returns:
#   exit status of compile-lm; non-zero earlier if the working directory,
#   the temporary directory or the sentence-boundary file cannot be produced
#######################################
function language_model_train() {
  local filename="$1"
  local smoothing_method="$2"
  local working_dir="$3"
  local base
  local head
  local prefix
  local suffix
  local tmp_dir

  mkdir -p -- "${working_dir}" || return

  base="$(basename -- "${filename}")"

  # Split the base name into "<prefix><tag><suffix>" around the stage tag.
  if [[ "${base}" == *.* ]]; then
    suffix=".${base##*.}"
    head="${base%.*}"
    if [[ "${head}" == *.* ]]; then
      prefix="${head%.*}."   # drop the old stage tag
    else
      prefix="${head}."      # nothing to replace, insert instead
    fi
  else
    suffix=""
    prefix="${base}."
  fi

  local -r start_end_output_filename="${working_dir}/${prefix}sb${suffix}"
  local -r lm_filename="${working_dir}/${prefix}lm${suffix}"
  COMPILED_LM_FILENAME="${working_dir}/${prefix}arpa${suffix}"

  # The IRSTLM scripts are given the installation root via the environment.
  export IRSTLM

  "${IRSTLM}/bin/add-start-end.sh" < "${filename}" \
    > "${start_end_output_filename}" || return

  tmp_dir="$(mktemp -d -p /tmp)" || return

  "${IRSTLM}/bin/build-lm.sh" \
    -i "${start_end_output_filename}" \
    -t "${tmp_dir}" \
    -p \
    -s "${smoothing_method}" \
    -o "${lm_filename}"

  # Always discard the scratch directory created above.
  rm -Rf -- "${tmp_dir:?}" >& /dev/null

  "${IRSTLM}/bin/compile-lm" --text yes "${lm_filename}.gz" "${COMPILED_LM_FILENAME}"
}
|
|
|
#######################################
# Tune a Moses configuration by running mert-moses.pl.
# A copy of the given moses.ini is written to <working dir>/trained-moses.ini
# in which everything from the "[lmodel-file]" line through the next blank
# line is replaced by an entry built from the model type, model order and
# compiled language model passed in. That copy is the configuration handed to
# mert-moses.pl. Any existing working directory is deleted and recreated.
# Globals:
#   MOSES_HOME         (read)    root of the Moses installation
#   MERT_INI_FILENAME  (written) path of the rewritten configuration
# Arguments:
#   $1 - moses.ini to start from
#   $2 - compiled language model file
#   $3 - tuning-set file; its path minus the last extension is the stem, and
#        <stem>.<src lang> / <stem>.<tgt lang> are passed to mert-moses.pl
#   $4 - source language (lower-cased)
#   $5 - target language (lower-cased)
#   $6 - language model order (integer)
#   $7 - language model type  (integer)
#   $8 - working directory
#   $9 - maximum number of MERT iterations (integer)
# Returns:
#   exit status of mert-moses.pl. Exits the script with status 1 when the
#   moses.ini does not exist or the working directory cannot be prepared.
#######################################
function mert() {
  local moses_ini_filename
  local compiled_lm_filename
  local eval_filename="$3"
  local -lr src_lang="$4"
  local -lr tgt_lang="$5"
  local -ri model_order="$6"
  local -ri model_type="$7"
  local working_dir="$8"
  local -ri max_no_iters="$9"
  local infilename

  moses_ini_filename="$(realpath -- "$1")"
  compiled_lm_filename="$(realpath -- "$2")"

  # Stem of the tuning set: absolute path without its last extension.
  infilename="$(realpath -- "${eval_filename}")"
  infilename="${infilename%.*}"

  # Quoted on purpose: an empty value must fail this test, not bypass it.
  if [[ ! -f "${moses_ini_filename}" ]]; then
    echo "${moses_ini_filename} does not exist." >&2
    exit 1
  fi

  # Start from an empty working directory. ':?' aborts on an empty value so
  # the recursive delete can never run against an unintended path.
  rm -Rf -- "${working_dir:?}"
  mkdir -p -- "${working_dir}" || exit 1
  working_dir="$(realpath -- "${working_dir}")" || exit 1

  MERT_INI_FILENAME="${working_dir}/trained-moses.ini"

  # sed program: change (c) the address range "[lmodel-file]" .. blank line
  # into a fresh section that points at the compiled language model.
  local sed_prog="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${model_type} 0 ${model_order} ${compiled_lm_filename}\n"

  sed "${sed_prog}" < "${moses_ini_filename}" > "${MERT_INI_FILENAME}"

  "${MOSES_HOME}/scripts/training/mert-moses.pl" \
    --maximum-iterations "${max_no_iters}" \
    --mertdir "${MOSES_HOME}/bin" \
    --working-dir "${working_dir}" \
    "${infilename}.${src_lang}" \
    "${infilename}.${tgt_lang}" \
    "${MOSES_HOME}/bin/moses" \
    "${MERT_INI_FILENAME}" \
    2> "${working_dir}/log"
}
|
|
|
|
|
# ---------------------------------------------------------------------------
# Driver: tokenise -> clean -> split -> translation model -> language model
# -> MERT. Each stage reports its results through global variables, which are
# echoed so the produced file names can be followed on stdout. The script
# stops at the first stage that reports failure, because every later stage
# consumes the files of the previous one.
# ---------------------------------------------------------------------------

if [[ $# -lt 4 ]]; then
  # Usage is an error path here (exit status 1), so it goes to stderr.
  {
    echo "$(basename -- "$0") usage:"
    echo "    $(basename -- "$0") src_file tgt_file src_lang tgt_lang"
    echo
  } >&2
  exit 1
fi

declare -r SRC_LANG="$3"
declare -r TGT_LANG="$4"

# --- Tokenisation: both sides into training/tokeniser ----------------------
tokenise "${SRC_LANG}" "$1" "training/tokeniser" || exit 1
declare -r SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

tokenise "${TGT_LANG}" "$2" "training/tokeniser" || exit 1
declare -r TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

echo "${SRC_TOKENISED_FILENAME}"
echo "${TGT_TOKENISED_FILENAME}"

# --- Length filtering with a limit of 20 words -----------------------------
cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20 || exit 1

echo "${SRC_CLEANUP_FILENAME}"
echo "${TGT_CLEANUP_FILENAME}"

# --- Split: 1000 development pairs, 500 evaluation pairs, rest training ----
data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500 || exit 1

echo "${SRC_TRAIN_FILENAME}"
echo "${TGT_TRAIN_FILENAME}"
echo "${SRC_DEVEL_FILENAME}"
echo "${TGT_DEVEL_FILENAME}"
echo "${SRC_EVAL_FILENAME}"
echo "${TGT_EVAL_FILENAME}"

# --- Translation model ------------------------------------------------------
# NOTE(review): the model is trained on the *devel* split and, further down,
# MERT is run on the *eval* split; the train split is produced but not used
# in this script. Possibly a quick-run setup - confirm this is intended.
translation_model_train "${SRC_LANG}" "${TGT_LANG}" \
  "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" \
  "grow-diag-final-and" "msd-bidirectional-fe" "training/model" || exit 1

declare -r MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
echo "${MOSES_TT_INI_FILENAME}"

# --- Language model on the full tokenised target side ----------------------
language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm" || exit 1

echo "${COMPILED_LM_FILENAME}"

# --- MERT: model order 3, model type 9, at most 1 iteration ----------------
mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" \
  "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1 || exit 1

echo "${MERT_INI_FILENAME}"
|
|