#!/bin/bash
#
# Apply SentencePiece tokenization to the source and target side of one
# corpus split.
#
# Usage:
#   <script> exp_dir data_dir bpe_dir src_lang tgt_lang split [parallel_installed]
#
# Arguments:
#   exp_dir             experiment directory; the models are read from
#                       <exp_dir>/vocab/model.SRC and <exp_dir>/vocab/model.TGT
#   data_dir            input directory; reads <data_dir>/<split>.<src_lang>
#                       and <data_dir>/<split>.<tgt_lang>
#   bpe_dir             output directory; writes <bpe_dir>/<split>.<src_lang>
#                       and <bpe_dir>/<split>.<tgt_lang>. The directory is not
#                       created by this script.
#   src_lang, tgt_lang  file-name suffixes of the two sides of the corpus
#   split               file-name stem of the split to process
#   parallel_installed  "true" to run spm_encode through GNU parallel;
#                       any other value (default: false) runs it directly
#
# Exit status: non-zero as soon as any step fails; 2 on a usage error.

set -euo pipefail

usage() {
  printf 'Usage: %s exp_dir data_dir bpe_dir src_lang tgt_lang split [parallel_installed]\n' \
    "${0##*/}" >&2
  exit 2
}

# Six positional arguments are mandatory; without them the derived paths
# would silently collapse to things like "/." and fail in confusing ways.
(( $# >= 6 )) || usage

date

exp_dir=$1
data_dir=$2
bpe_dir=$3
src_lang=$4
tgt_lang=$5
split=$6
parallel_installed=${7:-false}

in_split_dir=$data_dir/$split
out_split_dir=$bpe_dir/$split

#######################################
# Encode one text file with a SentencePiece model.
# Globals:   parallel_installed (read)
# Arguments: $1 - path to the SentencePiece model
#            $2 - input file (fed to the encoder on stdin)
#            $3 - output file (receives the encoder's stdout)
# Returns:   exit status of the encoder (or of parallel)
#######################################
encode_corpus() {
  local model=$1
  local input=$2
  local output=$3
  # Built as an array so the same command line is used on both paths.
  local -a encoder=(spm_encode "--model=${model}" --output_format=piece)

  # Compare as a string rather than executing the value as a command.
  if [[ "$parallel_installed" == true ]]; then
    # --pipe splits stdin into chunks for several encoder processes;
    # --keep-order emits the chunks in input order so line alignment
    # between the two sides of the corpus is preserved.
    parallel --pipe --keep-order "${encoder[@]}" <"$input" >"$output"
  else
    "${encoder[@]}" <"$input" >"$output"
  fi
}

echo "Apply Sentence Piece tokenization to SRC corpus"
encode_corpus "$exp_dir/vocab/model.SRC" \
  "$in_split_dir.$src_lang" "$out_split_dir.$src_lang"

echo "Apply Sentence Piece tokenization to TGT corpus"
encode_corpus "$exp_dir/vocab/model.TGT" \
  "$in_split_dir.$tgt_lang" "$out_split_dir.$tgt_lang"