#!/bin/bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Refuse to run unless WORKDIR_ROOT is set and non-empty: every path used
# below is derived from it. The variable is quoted so a value containing
# whitespace does not break the test, the diagnostic goes to stderr, and the
# script exits non-zero so callers can detect the misconfiguration.
if [[ -z "${WORKDIR_ROOT:-}" ]]; then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." >&2
  exit 1
fi
|
|
|
|
|
|
|
# Directory layout, all derived from WORKDIR_ROOT.
#   data_root     - scratch area for this script
#   DESTDIR / raw - destination of the final train/valid/test text files
#   download_path - where the .tgz archives are saved
#   orig          - where the archives are unpacked
#   tmp           - plain-text dev/test files converted from XML
data_root="${WORKDIR_ROOT}/iwsltv2"
DESTDIR="${WORKDIR_ROOT}/ML50/raw"

# Language codes processed by the loops below; each is paired with en_XX.
langs="ar_AR it_IT nl_XX ko_KR vi_VN"
echo "data_root: $data_root"

download_path="${data_root}/downloads"
raw="${DESTDIR}"
tmp="${data_root}/tmp"
orig="${data_root}/orig"

# Quoted so that a WORKDIR_ROOT containing whitespace yields four directories
# rather than a handful of word-split fragments.
mkdir -p -- "${download_path}" "${orig}" "${raw}" "${tmp}"
|
|
|
#######################################
# Download one IWSLT archive from wit3.fbk.eu into ${download_path},
# unless a file with the target name is already there.
# Globals:
#   download_path (read) - directory the archive is saved in
# Arguments:
#   $1 - archive key used in the URL (e.g. "2017-01-trnted")
#   $2 - source component of the URL / file name
#   $3 - target component of the URL / file name
#   $4 - optional prefix prepended to the saved file name (e.g. "test.")
# Outputs:
#   pushd/popd directory-stack listings and wget's own progress output
# Returns:
#   0 if the archive is already present or was downloaded, wget's exit
#   status if the download failed, 1 if ${download_path} cannot be entered.
#######################################
download_iwslt() {
  local iwslt_key=$1
  local src=$2
  local tgt=$3
  local save_prefix=${4:-}
  local archive="${save_prefix}${src}-${tgt}.tgz"
  local status=0

  pushd "${download_path}" || return 1
  if [[ ! -f "${archive}" ]]; then
    wget "https://wit3.fbk.eu/archive/${iwslt_key}/texts/${src}/${tgt}/${src}-${tgt}.tgz" -O "${archive}"
    status=$?
    if (( status != 0 )); then
      # wget -O creates the output file before the transfer starts; drop the
      # empty/partial file so the "already downloaded" check above does not
      # skip the retry on the next run.
      rm -f -- "${archive}"
    fi
  fi
  # Always undo the pushd. The previous early return on success left the
  # shell inside ${download_path} and grew the directory stack on every call.
  popd
  return "${status}"
}
|
|
|
#######################################
# Unpack a previously downloaded archive into ${orig}.
# Globals:
#   orig          (read) - directory the archive is unpacked in
#   download_path (read) - directory holding the .tgz files
# Arguments:
#   $1 - source component of the archive name
#   $2 - target component of the archive name
#   $3 - optional file-name prefix (e.g. "test.")
# Outputs:
#   pushd/popd directory-stack listings and tar's verbose file list
# Returns:
#   tar's exit status, or 1 if ${orig} cannot be entered.
#######################################
extract_iwslt() {
  local src=$1
  local tgt=$2
  local prefix=${3:-}
  local status

  # Without this check a failed pushd would let tar unpack into the caller's
  # directory and the popd below would pop an unrelated stack entry.
  pushd "${orig}" || return 1
  tar zxvf "${download_path}/${prefix}${src}-${tgt}.tgz"
  status=$?
  popd
  return "${status}"
}
|
|
|
#######################################
# Build the plain-text training files for one language pair from the
# train.tags.* files found one directory level below ${orig}.
# Lines carrying <url>, <talkid>, <keywords>, <speaker>, <reviewer,
# <translator, <doc or </doc> are dropped; <title>/<description> tags are
# removed (their text is kept); leading and trailing whitespace is trimmed.
# Globals:
#   orig (read) - directory containing the unpacked archives
#   raw  (read) - directory the train.* files are written to
# Arguments:
#   $1 - source language code (e.g. "ar_AR"); its first two characters are
#        used to locate the input files
#   $2 - target language code (e.g. "en_XX")
# Outputs:
#   "extracted ..." on stdout per file written; a warning on stderr for each
#   side that has no input file.
# Returns:
#   0 always (a missing direction is skipped, not treated as fatal).
#######################################
generate_train() {
  local lsrc=$1
  local ltgt=$2
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local ll l f f_raw
  local -a inputs

  for ll in "${lsrc}" "${ltgt}"; do
    l=${ll:0:2}
    f="${orig}/*/train.tags.${src}-${tgt}.${l}"
    f_raw="${raw}/train.${lsrc}-${ltgt}.${ll}"

    # Expand the glob with ${orig} quoted, so only the "*" is a wildcard and
    # whitespace in the path survives.
    inputs=("${orig}"/*/"train.tags.${src}-${tgt}.${l}")
    if [[ ! -f "${inputs[0]-}" ]]; then
      # The exit status of the filter pipeline is that of its last stage, so
      # it cannot reveal missing input. Check up front instead of writing an
      # empty corpus and reporting success.
      echo "no training file matches ${f}; skipping ${f_raw}" >&2
      continue
    fi

    if cat -- "${inputs[@]}" \
      | grep -v \
          -e '<url>' \
          -e '<talkid>' \
          -e '<keywords>' \
          -e '<speaker>' \
          -e '<reviewer' \
          -e '<translator' \
          -e '<doc' \
          -e '</doc>' \
      | sed \
          -e 's/<title>//g' \
          -e 's/<\/title>//g' \
          -e 's/<description>//g' \
          -e 's/<\/description>//g' \
          -e 's/^[[:space:]]*//' \
          -e 's/[[:space:]]*$//' \
      > "${f_raw}"; then
      echo "extracted $f to $f_raw"
    fi
  done
  return 0
}
|
|
|
#######################################
# Convert the IWSLT dev/test XML files of one language pair to plain text.
# For every file matching ${orig}/*/IWSLT*.TED*.<src>-<tgt>.<lang>.xml the
# <seg id="N"> ... </seg> lines are kept, the seg tags and the whitespace
# around them are removed, and the typographic apostrophe is replaced with
# an ASCII one. The result is written to ${tmp} under the same file name
# minus the ".xml" suffix.
# Globals:
#   orig (read) - directory containing the unpacked archives
#   tmp  (read) - directory the converted files are written to
# Arguments:
#   $1 - two-letter source language
#   $2 - two-letter target language
# Outputs:
#   Progress lines ("lang: ..", "<in> => <out>") on stdout.
#######################################
convert_valid_test() {
  local src=$1
  local tgt=$2
  local l o fname f

  for l in "${src}" "${tgt}"; do
    echo "lang: ${l}"
    # Glob directly instead of parsing ls output, so paths containing
    # whitespace stay intact.
    for o in "${orig}"/*/IWSLT*.TED*."${src}-${tgt}.${l}".xml; do
      # An unmatched glob is left as the literal pattern; skip it.
      [[ -e "${o}" ]] || continue
      fname=${o##*/}
      f="${tmp}/${fname%.*}"
      echo "$o => $f"
      grep -e '<seg id' -- "${o}" \
        | sed \
            -e 's/<seg id="[0-9]*">[[:space:]]*//g' \
            -e 's/[[:space:]]*<\/seg>[[:space:]]*//g' \
            -e "s/’/'/g" \
        > "${f}"
      echo ""
    done
  done
}
|
|
|
#######################################
# Copy one converted dev/test set from ${tmp} into ${raw} under the
# <subset>.<lsrc>-<ltgt>.<lang> naming scheme. A side whose converted file
# does not exist is silently skipped.
# Globals:
#   tmp (read) - directory holding the converted plain-text files
#   raw (read) - destination directory
# Arguments:
#   $1 - source language code (e.g. "en_XX"); first two characters select
#        the input file
#   $2 - target language code (e.g. "ar_AR")
#   $3 - subset name used in the output file name (e.g. "valid", "test")
#   $4 - input file-name prefix (e.g. "IWSLT17.TED.tst2016")
#######################################
generate_subset() {
  # Kept local so the caller's src/tgt/prefix/l/f are not overwritten.
  local lsrc=$1
  local ltgt=$2
  local src=${lsrc:0:2}
  local tgt=${ltgt:0:2}
  local subset=$3
  local prefix=$4
  local ll l f

  for ll in "${lsrc}" "${ltgt}"; do
    l=${ll:0:2}
    f="${tmp}/${prefix}.${src}-${tgt}.${l}"
    if [[ -f "${f}" ]]; then
      cp -- "${f}" "${raw}/${subset}.${lsrc}-${ltgt}.${ll}"
    fi
  done
}
|
|
|
|
|
# Fetch every archive the later steps unpack. Each table entry is
# "<archive key>:<language>"; both the <language>-en and en-<language>
# archives are requested for it, in that order.
echo "downloading iwslt training and dev data"

download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
for dl_entry in \
  "2017-01-trnted:ar" \
  "2017-01-trnted:ko" \
  "2015-01:vi"; do
  dl_release=${dl_entry%%:*}
  dl_lang=${dl_entry##*:}
  download_iwslt "${dl_release}" "${dl_lang}" en
  download_iwslt "${dl_release}" en "${dl_lang}"
done

echo "donwloading iwslt test data"
# Test archives are saved with a "test." prefix so they do not collide with
# the training archives of the same language pair.
for dl_entry in \
  "2017-01-mted-test:it" \
  "2017-01-mted-test:nl" \
  "2017-01-ted-test:ar" \
  "2017-01-ted-test:ko" \
  "2015-01-test:vi"; do
  dl_release=${dl_entry%%:*}
  dl_lang=${dl_entry##*:}
  download_iwslt "${dl_release}" "${dl_lang}" en "test."
  download_iwslt "${dl_release}" en "${dl_lang}" "test."
done
|
|
|
# Unpack the training archives: the multilingual bundle first, then both
# directions of each remaining pair.
echo "extract training data tar balls"
extract_iwslt DeEnItNlRo DeEnItNlRo
for ex_lang in ar ko vi; do
  extract_iwslt "${ex_lang}" en
  extract_iwslt en "${ex_lang}"
done

# Unpack the "test."-prefixed archives for every configured language, again
# in both directions. Only the first two characters of each code are used.
echo "extracting iwslt test data"
for ex_code in ${langs}; do
  ex_lang=${ex_code:0:2}
  extract_iwslt "${ex_lang}" en "test."
  extract_iwslt en "${ex_lang}" "test."
done
|
|
|
# Convert the dev/test XML of every configured language to plain text,
# X->en first and en->X second. convert_valid_test takes the two-letter form.
echo "convert dev and test data"
for cv_code in ${langs}; do
  cv_short=${cv_code:0:2}
  convert_valid_test "${cv_short}" en
  convert_valid_test en "${cv_short}"
done

# Write the training text for both directions of every configured language.
# generate_train takes the full code.
echo "creating training data into $raw"
for tr_code in ${langs}; do
  generate_train "${tr_code}" en_XX
  generate_train en_XX "${tr_code}"
done
|
|
|
# Publish the dev and test sets. Each table entry is
# "<language code>:<converted file prefix>"; for every entry the en->X pair
# is generated first, then X->en.
echo "creating iwslt dev data into raw"
for ss_entry in \
  "vi_VN:IWSLT15.TED.tst2013" \
  "ar_AR:IWSLT17.TED.tst2016" \
  "ko_KR:IWSLT17.TED.tst2016" \
  "it_IT:IWSLT17.TED.tst2010" \
  "nl_XX:IWSLT17.TED.tst2010"; do
  ss_code=${ss_entry%%:*}
  ss_set=${ss_entry#*:}
  generate_subset en_XX "${ss_code}" valid "${ss_set}"
  generate_subset "${ss_code}" en_XX valid "${ss_set}"
done

echo "creating iswslt test data into raw"
for ss_entry in \
  "vi_VN:IWSLT15.TED.tst2015" \
  "ar_AR:IWSLT17.TED.tst2017" \
  "ko_KR:IWSLT17.TED.tst2017" \
  "it_IT:IWSLT17.TED.tst2017.mltlng" \
  "nl_XX:IWSLT17.TED.tst2017.mltlng"; do
  ss_code=${ss_entry%%:*}
  ss_set=${ss_entry#*:}
  generate_subset en_XX "${ss_code}" test "${ss_set}"
  generate_subset "${ss_code}" en_XX test "${ss_set}"
done
|
|
|
|
|
#######################################
# For every configured language and for the test and valid splits, create
# the X->en files in ${raw} by copying the en->X files whenever the en->X
# file exists and the X->en file does not. Existing files are never
# overwritten.
# Globals:
#   raw   (read) - directory containing the split files
#   langs (read) - whitespace-separated language codes
# Outputs:
#   pushd/popd listings and one "cp <from> <to>" line per copy on stdout.
# Returns:
#   1 if ${raw} cannot be entered, otherwise the status of popd.
#######################################
backfill_x_en_splits() {
  local lang split
  local x_en_f1 x_en_f2 en_x_f1 en_x_f2

  pushd "${raw}" || return 1
  # langs is a whitespace-separated list; word splitting is intended here.
  for lang in ${langs}; do
    for split in test valid; do
      x_en_f1="${split}.${lang}-en_XX.en_XX"
      x_en_f2="${split}.${lang}-en_XX.${lang}"

      en_x_f1="${split}.en_XX-${lang}.en_XX"
      en_x_f2="${split}.en_XX-${lang}.${lang}"

      if [[ -f "${en_x_f1}" && ! -f "${x_en_f1}" ]]; then
        echo "cp $en_x_f1 $x_en_f1"
        cp -- "${en_x_f1}" "${x_en_f1}"
      fi
      # The source of the copy is en_x_f2. The previous condition tested
      # x_en_f2 against itself, which is never true, so the language-side
      # file was never backfilled.
      if [[ -f "${en_x_f2}" && ! -f "${x_en_f2}" ]]; then
        echo "cp $en_x_f2 $x_en_f2"
        cp -- "${en_x_f2}" "${x_en_f2}"
      fi
    done
  done
  popd
}

backfill_x_en_splits