#!/usr/bin/env bash
# Author : Thamme Gowda
# Created : Nov 06, 2017
#
# Experiment pipeline, driven by the variables in EXPERIMENT SETUP:
#   1a. optionally apply BPE to the source and/or target side of the corpus
#       (otherwise symlink the raw files into the run directory)
#   --  a span of the script is missing here, see NOTE(review) further down
#   3.  translate the test and dev sets with the chosen model
#   4.  score both outputs with multi-bleu-detok.perl (cased and lowercased)
#
# Exit codes: 1 = missing/ambiguous input or no model, 2 = line-count mismatch.

ONMT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"

#======= EXPERIMENT SETUP ======
# Activate python environment if needed
# shellcheck disable=SC1090
source ~/.bashrc
# source activate py3

# update these variables
NAME="run1"
OUT="onmt-runs/$NAME"

DATA="$ONMT/onmt-runs/data"
# Glob patterns; each must match exactly one file (resolved to a path below).
TRAIN_SRC="$DATA/*train.src"
TRAIN_TGT="$DATA/*train.tgt"
VALID_SRC="$DATA/*dev.src"
VALID_TGT="$DATA/*dev.tgt"
TEST_SRC="$DATA/*test.src"
TEST_TGT="$DATA/*test.tgt"

BPE="" # default
BPE="src" # src, tgt, src+tgt

# applicable only when BPE="src" or "src+tgt"
BPE_SRC_OPS=10000

# applicable only when BPE="tgt" or "src+tgt"
BPE_TGT_OPS=10000

GPUARG="" # default
GPUARG="0"

#====== EXPERIMENT BEGIN ======

#######################################
# Expand a glob pattern and print the single regular file it matches.
# Arguments: $1 - glob pattern (may contain spaces)
# Outputs:   the matched path on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if there is no match or more than one match
#######################################
resolve_input() {
  # Empty IFS: the unquoted expansion below globs but never word-splits.
  local pattern=$1 IFS=
  local -a matches
  # shellcheck disable=SC2206
  matches=( $pattern )
  if (( ${#matches[@]} > 1 )); then
    echo "Input pattern $pattern matches ${#matches[@]} files; expected exactly one" >&2
    return 1
  fi
  # With no match the literal pattern is left in place, so -f fails here.
  if [[ ! -f "${matches[0]}" ]]; then
    echo "Input File ${matches[0]} doesnt exist. Please fix the paths" >&2
    return 1
  fi
  printf '%s\n' "${matches[0]}"
}

# Check if input exists, replacing every pattern with the concrete path so the
# rest of the script can quote it safely.
for var in TRAIN_SRC TRAIN_TGT VALID_SRC VALID_TGT TEST_SRC TEST_TGT; do
  resolved=$(resolve_input "${!var}") || exit 1
  printf -v "$var" '%s' "$resolved"
done

#######################################
# Abort the script unless two files have the same number of lines.
# Arguments: $1, $2 - files to compare
# Returns:   nothing on success; exits with status 2 on mismatch or read error
#######################################
lines_check() {
  local l1 l2
  # Read via stdin so wc prints only the count; with a file argument the name
  # is part of the output and two different files could never compare equal.
  l1=$(wc -l < "$1") || exit 2
  l2=$(wc -l < "$2") || exit 2
  if (( l1 != l2 )); then
    echo "ERROR: Record counts doesnt match between: $1 and $2" >&2
    exit 2
  fi
}

lines_check "$TRAIN_SRC" "$TRAIN_TGT"
lines_check "$VALID_SRC" "$VALID_TGT"
lines_check "$TEST_SRC" "$TEST_TGT"

echo "Output dir = $OUT"
mkdir -p "$OUT/data" "$OUT/models" "$OUT/test"

echo "Step 1a: Preprocess inputs"
if [[ "$BPE" == *"src"* ]]; then
  echo "BPE on source"
  # Here we could use more monolingual data
  "$ONMT/tools/learn_bpe.py" -s "$BPE_SRC_OPS" < "$TRAIN_SRC" > "$OUT/data/bpe-codes.src"

  "$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.src" < "$TRAIN_SRC" > "$OUT/data/train.src"
  "$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.src" < "$VALID_SRC" > "$OUT/data/valid.src"
  "$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.src" < "$TEST_SRC" > "$OUT/data/test.src"
else
  ln -sf "$TRAIN_SRC" "$OUT/data/train.src"
  ln -sf "$VALID_SRC" "$OUT/data/valid.src"
  ln -sf "$TEST_SRC" "$OUT/data/test.src"
fi

if [[ "$BPE" == *"tgt"* ]]; then
  echo "BPE on target"
  # Here we could use more monolingual data
  # The target side has its own merge-operation count.
  "$ONMT/tools/learn_bpe.py" -s "$BPE_TGT_OPS" < "$TRAIN_TGT" > "$OUT/data/bpe-codes.tgt"

  "$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.tgt" < "$TRAIN_TGT" > "$OUT/data/train.tgt"
  "$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.tgt" < "$VALID_TGT" > "$OUT/data/valid.tgt"
  #"$ONMT/tools/apply_bpe.py" -c "$OUT/data/bpe-codes.tgt" < "$TEST_TGT" > "$OUT/data/test.tgt"
  # We dont touch the test References, No BPE on them!
  ln -sf "$TEST_TGT" "$OUT/data/test.tgt"
else
  ln -sf "$TRAIN_TGT" "$OUT/data/train.tgt"
  ln -sf "$VALID_TGT" "$OUT/data/valid.tgt"
  ln -sf "$TEST_TGT" "$OUT/data/test.tgt"
fi

# NOTE(review): the text of this script is truncated at this point. Everything
# between the "#: <" marker and the tail of the command that assigns $model was
# lost. Judging by the step numbering (1a ... 3a) the missing span presumably
# held the remaining preprocessing and the training step - restore it from
# version control. Surviving residue, verbatim:
#   #: < maxv) {maxv=score; max=$0}} END{ print max}'`
#
# Stopgap until the selection logic is restored: take the checkpoint path from
# the optional MODEL environment variable.
model="${MODEL:-}"
echo "Chosen Model = $model"
if [[ -z "$model" ]]; then
  echo "Model not found. Looked in $OUT/models/" >&2
  exit 1
fi

# Optional GPU flag, kept as an array so it expands to zero or two words.
GPU_OPTS=()
if [[ -n "$GPUARG" ]]; then
  GPU_OPTS=(-gpu "$GPUARG")
fi

echo "Step 3a: Translate Test"
python "$ONMT/translate.py" -model "$model" \
  -src "$OUT/data/test.src" \
  -output "$OUT/test/test.out" \
  -replace_unk -verbose "${GPU_OPTS[@]}" > "$OUT/test/test.log"

echo "Step 3b: Translate Dev"
python "$ONMT/translate.py" -model "$model" \
  -src "$OUT/data/valid.src" \
  -output "$OUT/test/valid.out" \
  -replace_unk -verbose "${GPU_OPTS[@]}" > "$OUT/test/valid.log"

if [[ "$BPE" == *"tgt"* ]]; then
  echo "BPE decoding/detokenising target to match with references"
  mv -- "$OUT/test/test.out" "$OUT/test/test.out.bpe"
  mv -- "$OUT/test/valid.out" "$OUT/test/valid.out.bpe"
  # Drop the "@@ " continuation markers to re-join the subword units.
  sed -E 's/(@@ )|(@@ ?$)//g' < "$OUT/test/valid.out.bpe" > "$OUT/test/valid.out"
  sed -E 's/(@@ )|(@@ ?$)//g' < "$OUT/test/test.out.bpe" > "$OUT/test/test.out"
fi

echo "Step 4a: Evaluate Test"
"$ONMT/tools/multi-bleu-detok.perl" "$OUT/data/test.tgt" < "$OUT/test/test.out" > "$OUT/test/test.tc.bleu"
"$ONMT/tools/multi-bleu-detok.perl" -lc "$OUT/data/test.tgt" < "$OUT/test/test.out" > "$OUT/test/test.lc.bleu"

echo "Step 4b: Evaluate Dev"
# Score against the raw dev reference: $OUT/data/valid.tgt is BPE-encoded when
# BPE covers the target side, whereas valid.out has been BPE-decoded above.
# Without target BPE, $OUT/data/valid.tgt is just a symlink to this same file.
"$ONMT/tools/multi-bleu-detok.perl" "$VALID_TGT" < "$OUT/test/valid.out" > "$OUT/test/valid.tc.bleu"
"$ONMT/tools/multi-bleu-detok.perl" -lc "$VALID_TGT" < "$OUT/test/valid.out" > "$OUT/test/valid.lc.bleu"

#===== EXPERIMENT END ======