#!/bin/bash
#
# Builds a sub-sampled, tokenized, BPE-encoded, deduplicated and sharded
# monolingual corpus from gzip-compressed news-crawl archives.
#
# Every path below is relative, so the script must be run from the directory
# that contains mosesdecoder/, subword-nmt/, wmt18_en_de/ and
# deduplicate_lines.py.

# Perl preprocessing scripts inside the mosesdecoder checkout.
SCRIPTS=mosesdecoder/scripts
TOKENIZER="$SCRIPTS/tokenizer/tokenizer.perl"
NORM_PUNC="$SCRIPTS/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="$SCRIPTS/tokenizer/remove-non-printing-char.perl"

# Directory that holds apply_bpe.py.
BPEROOT=subword-nmt/subword_nmt

# Codes file handed to apply_bpe.py via -c.
BPE_CODE=wmt18_en_de/code

# Number of lines kept by the random sub-sample (shuf -n).
SUBSAMPLE_SIZE=25000000

# Language code passed to the Moses scripts and embedded in output file names.
# NOTE(review): LANG is also the POSIX locale environment variable; when the
# caller has it exported, this assignment changes the locale inherited by the
# perl/python/shuf child processes. The name is kept because the rest of the
# script reads $LANG.
LANG=de

# Output layout: final shards go to $OUTDIR, intermediate files to $tmp, and
# the downloaded archives are expected in $orig.
OUTDIR="wmt18_${LANG}_mono"
orig=orig
tmp="$OUTDIR/tmp"

# Nothing later can succeed without these directories, so stop on failure.
mkdir -p -- "$OUTDIR" "$tmp" || exit 1

# Remote locations of the gzip-compressed monolingual archives to fetch,
# one per news-crawl year.
URLS=(
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2007.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2008.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2009.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2011.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"
  "http://www.statmt.org/wmt15/training-monolingual-news-crawl-v2/news.2014.de.shuffled.v2.gz"
  "http://data.statmt.org/wmt16/translation-task/news.2015.de.shuffled.gz"
  "http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz"
  "http://data.statmt.org/wmt18/translation-task/news.2017.de.shuffled.deduped.gz"
)

# Local file name of each archive, index-aligned with URLS. Each name is the
# last path component of the matching URL, so it is derived instead of being
# maintained as a second hand-written list that could drift out of sync.
FILES=("${URLS[@]##*/}")

# Download every archive that is not already present in $orig.
#
# The working directory is never changed: an unchecked "cd $orig" followed by
# "cd .." would, when $orig is missing, download into the current directory
# and then leave the script running from the parent directory.
mkdir -p -- "$orig" || exit 1
for ((i = 0; i < ${#URLS[@]}; ++i)); do
  file=${FILES[i]}
  url=${URLS[i]}
  if [[ -f "$orig/$file" ]]; then
    echo "$file already exists, skipping download"
  # Fetch under a temporary name and rename on success, so an interrupted
  # transfer is never mistaken for a complete archive on the next run.
  elif wget -O "$orig/$file.part" "$url"; then
    mv -- "$orig/$file.part" "$orig/$file" || exit 1
  else
    rm -f -- "$orig/$file.part"
    echo "failed to download $url" >&2
    exit 1
  fi
done

# Decompress all archives, draw a random sample of $SUBSAMPLE_SIZE lines, then
# normalize punctuation, strip non-printing characters and tokenize.
# Skipped when the sample file already exists.
if [[ -f "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}" ]]; then
  echo "found monolingual sample, skipping shuffle/sample/tokenize"
else
  # Full paths of the downloaded archives, kept as an array so that no path
  # is subject to word splitting or globbing.
  archives=()
  for archive in "${FILES[@]}"; do
    archives+=("$orig/$archive")
  done

  # The pipeline runs in a subshell so pipefail does not leak into the rest
  # of the script. Output goes to a temporary name and is renamed only on
  # success; otherwise a truncated sample would satisfy the existence check
  # above on the next run. gzip -c leaves its input files in place, so no
  # extra "keep" flag is needed.
  if (
    set -o pipefail
    gzip -c -d -- "${archives[@]}" \
      | shuf -n "$SUBSAMPLE_SIZE" \
      | perl "$NORM_PUNC" "$LANG" \
      | perl "$REM_NON_PRINT_CHAR" \
      | perl "$TOKENIZER" -threads 8 -a -l "$LANG" \
      > "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial"
  ); then
    mv -- "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial" \
      "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}" || exit 1
  else
    rm -f -- "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial"
    echo "shuffle/sample/tokenize step failed" >&2
    exit 1
  fi
fi

# Apply the BPE codes in $BPE_CODE to the tokenized sample.
# Skipped when the BPE-encoded file already exists.
if [[ -f "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}" ]]; then
  echo "found BPE monolingual sample, skipping BPE step"
# Write to a temporary name and rename on success, so a failed run does not
# leave a truncated file that the check above would accept next time.
elif python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" \
  < "$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}" \
  > "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial"; then
  mv -- "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial" \
    "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}" || exit 1
else
  rm -f -- "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}.partial"
  echo "BPE step failed" >&2
  exit 1
fi

# Run deduplicate_lines.py over the BPE-encoded sample and store its output.
# Skipped when the deduplicated file already exists.
if [[ -f "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}" ]]; then
  echo "found deduplicated monolingual sample, skipping deduplication step"
# Write to a temporary name and rename on success, so a failed run does not
# leave a truncated file that the check above would accept next time.
elif python deduplicate_lines.py "$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}" \
  > "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}.partial"; then
  mv -- "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}.partial" \
    "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}" || exit 1
else
  rm -f -- "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}.partial"
  echo "deduplication step failed" >&2
  exit 1
fi

# Split the deduplicated sample into shards of 1,000,000 lines each, named
# $OUTDIR/bpe.monolingual.dedup.NN.$LANG (NN = 00, 01, ...).
# Skipped when the first shard already exists. The check uses ${LANG} so it
# names the same file that split creates via --additional-suffix.
if [[ -f "$OUTDIR/bpe.monolingual.dedup.00.${LANG}" ]]; then
  echo "found sharded data, skipping sharding step"
elif ! split --lines 1000000 --numeric-suffixes \
  --additional-suffix ".${LANG}" \
  "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}" \
  "$OUTDIR/bpe.monolingual.dedup."; then
  # Drop whatever shards were written; a leftover shard 00 would otherwise
  # make the next run skip this step with incomplete data.
  rm -f -- "$OUTDIR/bpe.monolingual.dedup."[0-9][0-9]*".${LANG}"
  echo "sharding step failed" >&2
  exit 1
fi