|
#!/bin/bash |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Abort early when WORKDIR_ROOT is missing: every path used by this script is
# derived from it.
if [[ -z "${WORKDIR_ROOT:-}" ]]; then
  # Diagnostics belong on stderr, and the exit status must be non-zero so that
  # a calling script or CI job can tell that nothing was prepared.
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
  exit 1
fi
|
|
|
|
|
|
|
# Working directory for this script's intermediate files.
TMP_DIR="${WORKDIR_ROOT}/temp/af_xhv2"
# Directory that receives the final train/valid files.
DEST="${WORKDIR_ROOT}/ML50/raw"

ROOT="${WORKDIR_ROOT}"
# The helper tools are looked up relative to the directory the script is
# started from, not relative to WORKDIR_ROOT.
UTILS="${PWD}/utils"
TMX2CORPUS="${UTILS}/tmx2corpus"
# Kept as a single string; it is expanded unquoted at the call site so that it
# splits into the interpreter name and the script path.
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"

# Quoted so that a WORKDIR_ROOT or PWD containing whitespace or glob
# characters still yields exactly three directories.
mkdir -p -- "${TMP_DIR}" "${DEST}" "${UTILS}"
|
|
|
#######################################
# Download one OPUS TMX archive and turn it into a pair of plain-text files.
# Globals:
#   TMX_TOOL (read) - command line that converts a .tmx file; it must leave
#                     bitext.<src> and bitext.<tgt> in the current directory
#   url (read)      - fallback download URL, used only when $4 is empty
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - subset (corpus) name
#   $4 - URL of the .tmx.gz archive
# Outputs:
#   <subset>.<src>-<tgt>.<src> and <subset>.<src>-<tgt>.<tgt> in the directory
#   the function is called from; pushd/popd print the directory stack.
# Returns:
#   0 on success, non-zero if no URL is available or any step fails.
#######################################
function download_opus(){
  local src=$1
  local tgt=$2
  local subset=$3
  # The URL argument used to be stored in a misspelled variable and then
  # ignored in favour of the global $url. Honour $4, and fall back to the
  # global only so that callers which pass nothing keep working.
  local link=${4:-${url:-}}
  local pair="${subset}.${src}-${tgt}"
  local workdir="extract_${pair}"
  local status=0

  if [[ -z "${link}" ]]; then
    echo "download_opus: no URL given for ${pair}" >&2
    return 1
  fi

  # -p: an extraction directory left over from a previous run is not an error.
  mkdir -p -- "${workdir}" || return 1
  # Never continue in the wrong directory if the pushd fails.
  pushd "${workdir}" || return 1

  # NOTE(review): gzip -d removes the .gz on success, so this guard only skips
  # work when an archive was left behind undecompressed.
  if [[ ! -f "${pair}.tmx.gz" ]]; then
    # Chain the steps so that a failed download is not decompressed/converted.
    # gzip -f overwrites a .tmx left behind by an earlier run instead of
    # refusing. TMX_TOOL is intentionally unquoted: it holds "python <script>".
    # shellcheck disable=SC2086
    wget "${link}" -O "${pair}.tmx.gz" \
      && gzip -d -f "${pair}.tmx.gz" \
      && ${TMX_TOOL} "${pair}.tmx" \
      && mv "bitext.${src}" "../${pair}.${src}" \
      && mv "bitext.${tgt}" "../${pair}.${tgt}" \
      || status=$?
  fi

  popd || return 1
  return "${status}"
}
|
|
|
#######################################
# Concatenate the per-subset files of a language pair into one raw training
# file per language, in the order the subsets are given.
# Arguments:
#   $1   - source language code
#   $2   - target language code
#   $3.. - subset names, either as separate arguments or as one
#          whitespace-separated string
# Outputs:
#   raw_train.<src>-<tgt>.<src> and raw_train.<src>-<tgt>.<tgt> in the current
#   directory (truncated first).
# Returns:
#   1 when fewer than two arguments are given; otherwise the status of the
#   last command run.
#######################################
function concat_subsets(){
  if (( $# < 2 )); then
    echo "concat_subsets: usage: concat_subsets SRC TGT SUBSET..." >&2
    return 1
  fi
  local src=$1
  local tgt=$2
  shift 2

  # Reading only $3 dropped every subset but the first when the caller passed
  # one word per subset. Re-split everything after the language codes so that
  # both "a b c" and a b c are accepted.
  local -a subsets=()
  # read returns non-zero at end of input with -d ''; the array is still filled.
  read -r -d '' -a subsets <<< "$*" || true

  local src_train="raw_train.${src}-${tgt}.${src}"
  local tgt_train="raw_train.${src}-${tgt}.${tgt}"
  local subset

  # Start from empty files so that a re-run does not append to old output.
  : > "${src_train}"
  : > "${tgt_train}"
  for subset in "${subsets[@]}"; do
    cat "${subset}.${src}-${tgt}.${src}" >> "${src_train}"
    cat "${subset}.${src}-${tgt}.${tgt}" >> "${tgt_train}"
  done
}
|
|
|
|
|
|
|
# Write an unbounded, reproducible byte stream to stdout.
# $1 is the seed: it is used as the passphrase with which /dev/zero is
# encrypted (AES-256-CTR, no salt), so the same seed yields the same bytes.
# Anything openssl prints on stderr is discarded.
get_seeded_random() {
  seed=$1
  openssl enc -aes-256-ctr -pass "pass:${seed}" -nosalt 2>/dev/null </dev/zero
}
|
|
|
#######################################
# Shuffle the raw training files of a language pair and split them into a
# validation part (the first lines) and a training part (the rest).
# Both languages are shuffled with the same seeded random source, which keeps
# the two files line-aligned as long as they have the same number of lines.
# Arguments:
#   $1 - source language code
#   $2 - target language code
#   $3 - number of validation lines (optional, default 1500)
# Inputs:
#   raw_train.<src>-<tgt>.<src|tgt> in the current directory
# Outputs:
#   shuffled.*, valid.* and train.* files for both languages, same directory
# Returns:
#   1 when $3 is not a non-negative integer; otherwise the status of the last
#   command run.
#######################################
function split_train_valid(){
  local src=$1
  local tgt=$2
  local valid_size=${3:-1500}
  local seed=43
  local pair="${src}-${tgt}"
  local side

  if [[ ! "${valid_size}" =~ ^[0-9]+$ ]]; then
    echo "split_train_valid: validation size must be a non-negative integer" >&2
    return 1
  fi
  # 10# forces base 10 so that a value such as 0800 is not read as octal.
  local first_train_line=$(( 10#${valid_size} + 1 ))

  for side in "${src}" "${tgt}"; do
    shuf --random-source=<(get_seeded_random "${seed}") \
      "raw_train.${pair}.${side}" > "shuffled.${pair}.${side}"
    head -n "$(( 10#${valid_size} ))" "shuffled.${pair}.${side}" \
      > "valid.${pair}.${side}"
    # "tail -n +K" is the portable spelling; the bare "tail +K" form is
    # obsolete and rejected by POSIX-conforming implementations.
    tail -n "+${first_train_line}" "shuffled.${pair}.${side}" \
      > "train.${pair}.${side}"
  done
}
|
|
|
#######################################
# Copy the train/valid files of a language pair into DEST, renaming them from
# two-letter language codes to the long codes given by the caller.
# Globals:
#   DEST (read) - destination directory
# Arguments:
#   $1 - long source code; its first two characters are the short code used
#        in the local file names (e.g. xh_ZA -> xh)
#   $2 - long target code, treated the same way (e.g. en_XX -> en)
# Returns:
#   0 when all four copies succeed, otherwise the status of the last failing cp.
#######################################
function copy2dst(){
  local lsrc=$1
  local ltgt=$2
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local split
  local status=0

  # All four copies are attempted even if one fails; the failure is remembered
  # instead of being masked by a later successful cp.
  for split in valid train; do
    cp -- "${split}.${src}-${tgt}.${src}" \
      "${DEST}/${split}.${lsrc}-${ltgt}.${lsrc}" || status=$?
    cp -- "${split}.${src}-${tgt}.${tgt}" \
      "${DEST}/${split}.${lsrc}-${ltgt}.${ltgt}" || status=$?
  done
  return "${status}"
}
|
|
|
|
|
|
|
|
|
|
|
# Xhosa-English: OPUS TMX archives to fetch, keyed by subset name.
declare -A xh_en_urls
xh_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
  [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
  [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
  [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)

# -p: a directory left over from an earlier run is not an error.
mkdir -p -- "${TMP_DIR}/xh-en"
# Stop instead of downloading into whatever directory we happen to be in.
pushd "${TMP_DIR}/xh-en" || exit 1
for name in "${!xh_en_urls[@]}"; do
  url=${xh_en_urls[${name}]}
  echo "$name: $url"
  # Pass the URL that was just looked up (previously a misspelled, unset
  # variable was passed here).
  download_opus xh en "${name}" "${url}"
done
# [*] joins all subset names into ONE space-separated argument, which is the
# form concat_subsets reads; with [@] only the first name reached it.
concat_subsets xh en "${!xh_en_urls[*]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd || exit 1
|
|
|
|
|
|
|
|
|
# Afrikaans-English: OPUS TMX archives to fetch, keyed by subset name.
declare -A af_en_urls
af_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
  [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
  [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
  [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)

# -p: a directory left over from an earlier run is not an error.
mkdir -p -- "${TMP_DIR}/af-en"
# Stop instead of downloading into whatever directory we happen to be in.
pushd "${TMP_DIR}/af-en" || exit 1
for name in "${!af_en_urls[@]}"; do
  url=${af_en_urls[${name}]}
  echo "$name: $url"
  # Pass the URL that was just looked up (previously a misspelled, unset
  # variable was passed here).
  download_opus af en "${name}" "${url}"
done
# [*] joins all subset names into ONE space-separated argument, which is the
# form concat_subsets reads; with [@] only the first name reached it.
concat_subsets af en "${!af_en_urls[*]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd || exit 1
|
|
|
|
|
|