|
#!/usr/bin/env bash |
|
|
|
|
|
|
|
ONMT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" |
|
|
|
|
|
|
|
source ~/.bashrc |
|
|
|
|
|
|
|
NAME="run1" |
|
OUT="onmt-runs/$NAME" |
|
|
|
DATA="$ONMT/onmt-runs/data" |
|
TRAIN_SRC=$DATA/*train.src |
|
TRAIN_TGT=$DATA/*train.tgt |
|
VALID_SRC=$DATA/*dev.src |
|
VALID_TGT=$DATA/*dev.tgt |
|
TEST_SRC=$DATA/*test.src |
|
TEST_TGT=$DATA/*test.tgt |
|
|
|
BPE="" |
|
BPE="src" |
|
|
|
|
|
BPE_SRC_OPS=10000 |
|
|
|
|
|
BPE_TGT_OPS=10000 |
|
|
|
GPUARG="" |
|
GPUARG="0" |
|
|
|
|
|
|
|
|
|
|
|
for f in $TRAIN_SRC $TRAIN_TGT $VALID_SRC $VALID_TGT $TEST_SRC $TEST_TGT; do |
|
if [[ ! -f "$f" ]]; then |
|
echo "Input File $f doesnt exist. Please fix the paths" |
|
exit 1 |
|
fi |
|
done |
|
|
|
function lines_check { |
|
l1=`wc -l $1` |
|
l2=`wc -l $2` |
|
if [[ $l1 != $l2 ]]; then |
|
echo "ERROR: Record counts doesnt match between: $1 and $2" |
|
exit 2 |
|
fi |
|
} |
|
lines_check $TRAIN_SRC $TRAIN_TGT |
|
lines_check $VALID_SRC $VALID_TGT |
|
lines_check $TEST_SRC $TEST_TGT |
|
|
|
|
|
echo "Output dir = $OUT" |
|
[ -d $OUT ] || mkdir -p $OUT |
|
[ -d $OUT/data ] || mkdir -p $OUT/data |
|
[ -d $OUT/models ] || mkdir $OUT/models |
|
[ -d $OUT/test ] || mkdir -p $OUT/test |
|
|
|
|
|
echo "Step 1a: Preprocess inputs" |
|
if [[ "$BPE" == *"src"* ]]; then |
|
echo "BPE on source" |
|
|
|
$ONMT/tools/learn_bpe.py -s $BPE_SRC_OPS < $TRAIN_SRC > $OUT/data/bpe-codes.src |
|
|
|
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $TRAIN_SRC > $OUT/data/train.src |
|
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $VALID_SRC > $OUT/data/valid.src |
|
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $TEST_SRC > $OUT/data/test.src |
|
else |
|
ln -sf $TRAIN_SRC $OUT/data/train.src |
|
ln -sf $VALID_SRC $OUT/data/valid.src |
|
ln -sf $TEST_SRC $OUT/data/test.src |
|
fi |
|
|
|
|
|
if [[ "$BPE" == *"tgt"* ]]; then |
|
echo "BPE on target" |
|
|
|
$ONMT/tools/learn_bpe.py -s $BPE_SRC_OPS < $TRAIN_TGT > $OUT/data/bpe-codes.tgt |
|
|
|
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.tgt < $TRAIN_TGT > $OUT/data/train.tgt |
|
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.tgt < $VALID_TGT > $OUT/data/valid.tgt |
|
|
|
|
|
ln -sf $TEST_TGT $OUT/data/test.tgt |
|
else |
|
ln -sf $TRAIN_TGT $OUT/data/train.tgt |
|
ln -sf $VALID_TGT $OUT/data/valid.tgt |
|
ln -sf $TEST_TGT $OUT/data/test.tgt |
|
fi |
|
|
|
|
|
|
|
echo "Step 1b: Preprocess" |
|
python $ONMT/preprocess.py \ |
|
-train_src $OUT/data/train.src \ |
|
-train_tgt $OUT/data/train.tgt \ |
|
-valid_src $OUT/data/valid.src \ |
|
-valid_tgt $OUT/data/valid.tgt \ |
|
-save_data $OUT/data/processed |
|
|
|
|
|
echo "Step 2: Train" |
|
GPU_OPTS="" |
|
if [[ ! -z $GPUARG ]]; then |
|
GPU_OPTS="-gpu_ranks $GPUARG" |
|
fi |
|
CMD="python $ONMT/train.py -data $OUT/data/processed -save_model $OUT/models/$NAME $GPU_OPTS" |
|
echo "Training command :: $CMD" |
|
eval "$CMD" |
|
|
|
|
|
|
|
|
|
|
|
model=`ls $OUT/models/*.pt| awk -F '_' 'BEGIN{maxv=-1000000} {score=$(NF-3)-$(NF-1); if (score > maxv) {maxv=score; max=$0}} END{ print max}'` |
|
echo "Chosen Model = $model" |
|
if [[ -z "$model" ]]; then |
|
echo "Model not found. Looked in $OUT/models/" |
|
exit 1 |
|
fi |
|
|
|
GPU_OPTS="" |
|
if [ ! -z $GPUARG ]; then |
|
GPU_OPTS="-gpu $GPUARG" |
|
fi |
|
|
|
echo "Step 3a: Translate Test" |
|
python $ONMT/translate.py -model $model \ |
|
-src $OUT/data/test.src \ |
|
-output $OUT/test/test.out \ |
|
-replace_unk -verbose $GPU_OPTS > $OUT/test/test.log |
|
|
|
echo "Step 3b: Translate Dev" |
|
python $ONMT/translate.py -model $model \ |
|
-src $OUT/data/valid.src \ |
|
-output $OUT/test/valid.out \ |
|
-replace_unk -verbose $GPU_OPTS > $OUT/test/valid.log |
|
|
|
if [[ "$BPE" == *"tgt"* ]]; then |
|
echo "BPE decoding/detokenising target to match with references" |
|
mv $OUT/test/test.out{,.bpe} |
|
mv $OUT/test/valid.out{,.bpe} |
|
cat $OUT/test/valid.out.bpe | sed -E 's/(@@ )|(@@ ?$)//g' > $OUT/test/valid.out |
|
cat $OUT/test/test.out.bpe | sed -E 's/(@@ )|(@@ ?$)//g' > $OUT/test/test.out |
|
fi |
|
|
|
echo "Step 4a: Evaluate Test" |
|
$ONMT/tools/multi-bleu-detok.perl $OUT/data/test.tgt < $OUT/test/test.out > $OUT/test/test.tc.bleu |
|
$ONMT/tools/multi-bleu-detok.perl -lc $OUT/data/test.tgt < $OUT/test/test.out > $OUT/test/test.lc.bleu |
|
|
|
echo "Step 4b: Evaluate Dev" |
|
$ONMT/tools/multi-bleu-detok.perl $OUT/data/valid.tgt < $OUT/test/valid.out > $OUT/test/valid.tc.bleu |
|
$ONMT/tools/multi-bleu-detok.perl -lc $OUT/data/valid.tgt < $OUT/test/valid.out > $OUT/test/valid.lc.bleu |
|
|
|
|
|
|