Spaces:

mshukor
/

UnIVAL

Sleeping

UnIVAL / fairseq /examples /backtranslation /prepare-de-monolingual.sh

mshukor

init

26fd00c over 1 year ago

3.24 kB

	#!/bin/bash
	#
	# Prepares German monolingual news-crawl data: download, subsample,
	# normalize/tokenize, apply BPE, deduplicate and shard.
	# All paths below are relative to the directory the script is run from.

	# Moses preprocessing scripts and the subword-nmt BPE tools.
	SCRIPTS=mosesdecoder/scripts
	TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
	NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
	REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
	BPEROOT=subword-nmt/subword_nmt

	# BPE codes file passed to apply_bpe.py, number of lines to sample, and the
	# language code handed to the Moses scripts and used in output file names.
	BPE_CODE=wmt18_en_de/code
	SUBSAMPLE_SIZE=25000000
	# NOTE: LANG is also the standard locale environment variable. If it is
	# exported by the caller, this assignment changes the locale that child
	# processes see. The name is kept because the rest of the script reads $LANG.
	LANG=de

	# Output directory, download directory and scratch directory.
	OUTDIR=wmt18_${LANG}_mono
	orig=orig
	tmp=$OUTDIR/tmp
	# $orig is created here as well: the download step changes into it, and a
	# missing directory would make that step run in the wrong place.
	mkdir -p "$OUTDIR" "$tmp" "$orig"


	# Remote archives to fetch: gzip-compressed German news-crawl dumps,
	# one per year from 2007 through 2017.
	URLS=(
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2007.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2008.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2009.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2011.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"
	  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"
	  "http://www.statmt.org/wmt15/training-monolingual-news-crawl-v2/news.2014.de.shuffled.v2.gz"
	  "http://data.statmt.org/wmt16/translation-task/news.2015.de.shuffled.gz"
	  "http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz"
	  "http://data.statmt.org/wmt18/translation-task/news.2017.de.shuffled.deduped.gz"
	)

	# Local archive names, index-aligned with URLS: each entry is the last path
	# component of the corresponding URL.
	FILES=("${URLS[@]##*/}")


	# Download every archive that is not already present in $orig.
	# Globals read: URLS and FILES (index-aligned arrays), orig (download dir).
	# Paths are prefixed with $orig instead of changing directory, so the working
	# directory used by the later steps is never altered.
	# Each transfer goes to a ".part" file and is renamed only on success, so an
	# interrupted download is not mistaken for a complete archive on the next run.
	for ((i = 0; i < ${#URLS[@]}; ++i)); do
	  file=${FILES[i]}
	  if [ -f "$orig/$file" ]; then
	    echo "$file already exists, skipping download"
	  else
	    url=${URLS[i]}
	    mkdir -p "$orig" || exit 1
	    if wget -O "$orig/$file.part" "$url"; then
	      mv -- "$orig/$file.part" "$orig/$file" || exit 1
	    else
	      echo "failed to download $url" >&2
	      rm -f -- "$orig/$file.part"
	      exit 1
	    fi
	  fi
	done


	# Build the tokenized monolingual sample: decompress all archives, keep
	# SUBSAMPLE_SIZE random lines, normalize punctuation, strip non-printing
	# characters and tokenize.
	# Globals read: tmp, orig, FILES, SUBSAMPLE_SIZE, LANG, NORM_PUNC,
	#               REM_NON_PRINT_CHAR, TOKENIZER.
	# The result is written to a ".partial" file and renamed only when the whole
	# pipeline succeeds, because the existence check below would otherwise accept
	# a truncated file on the next run.
	sample_out=$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}
	if [ -f "$sample_out" ]; then
	  echo "found monolingual sample, skipping shuffle/sample/tokenize"
	else
	  archives=()
	  for archive in "${FILES[@]}"; do
	    archives+=("$orig/$archive")
	  done
	  if [ "${#archives[@]}" -eq 0 ]; then
	    # Without file arguments gzip would read from stdin instead.
	    echo "no input archives listed in FILES" >&2
	    exit 1
	  fi
	  # pipefail is enabled inside a subshell so it does not leak into the rest
	  # of the script. `gzip -c` writes to stdout and leaves the archives in place.
	  if (
	    set -o pipefail
	    gzip -c -d "${archives[@]}" \
	      | shuf -n "$SUBSAMPLE_SIZE" \
	      | perl "$NORM_PUNC" "$LANG" \
	      | perl "$REM_NON_PRINT_CHAR" \
	      | perl "$TOKENIZER" -threads 8 -a -l "$LANG" \
	      > "$sample_out.partial"
	  ); then
	    mv -- "$sample_out.partial" "$sample_out" || exit 1
	  else
	    echo "shuffle/sample/tokenize pipeline failed" >&2
	    rm -f -- "$sample_out.partial"
	    exit 1
	  fi
	fi


	# Apply the BPE codes in $BPE_CODE to the tokenized sample.
	# Globals read: tmp, SUBSAMPLE_SIZE, LANG, BPEROOT, BPE_CODE.
	# Output goes to a ".partial" file and is renamed only on success, because
	# the existence check below would otherwise accept a truncated file on the
	# next run.
	bpe_in=$tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG}
	bpe_out=$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}
	if [ -f "$bpe_out" ]; then
	  echo "found BPE monolingual sample, skipping BPE step"
	elif python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" \
	  < "$bpe_in" \
	  > "$bpe_out.partial"; then
	  mv -- "$bpe_out.partial" "$bpe_out" || exit 1
	else
	  echo "BPE step failed" >&2
	  rm -f -- "$bpe_out.partial"
	  exit 1
	fi


	# Run deduplicate_lines.py on the BPE-encoded sample and store its output.
	# Globals read: tmp, SUBSAMPLE_SIZE, LANG.
	# deduplicate_lines.py is looked up in the current working directory.
	# Output goes to a ".partial" file and is renamed only on success, because
	# the existence check below would otherwise accept a truncated file on the
	# next run.
	dedup_in=$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG}
	dedup_out=$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}
	if [ -f "$dedup_out" ]; then
	  echo "found deduplicated monolingual sample, skipping deduplication step"
	elif python deduplicate_lines.py "$dedup_in" > "$dedup_out.partial"; then
	  mv -- "$dedup_out.partial" "$dedup_out" || exit 1
	else
	  echo "deduplication step failed" >&2
	  rm -f -- "$dedup_out.partial"
	  exit 1
	fi


	# Split the deduplicated sample into shards of 1,000,000 lines each, named
	# $OUTDIR/bpe.monolingual.dedup.NN.$LANG with a numeric suffix NN.
	# Globals read: OUTDIR, tmp, SUBSAMPLE_SIZE, LANG.
	# The skip check uses ${LANG} so that it looks for the same file name that
	# split produces for the first shard.
	shard_prefix=$OUTDIR/bpe.monolingual.dedup.
	if [ -f "${shard_prefix}00.${LANG}" ]; then
	  echo "found sharded data, skipping sharding step"
	else
	  if ! split --lines 1000000 --numeric-suffixes \
	    --additional-suffix ".${LANG}" \
	    "$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG}" \
	    "$shard_prefix"; then
	    # Shards written before the failure are left in place; remove them
	    # before rerunning, since the check above only looks at the first one.
	    echo "sharding step failed" >&2
	    exit 1
	  fi
	fi