#!/bin/bash

set -e
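
# Downloads Wikipedia cirrus dumps and OSCAR subsets, extracts document
# openings, trains a SentencePiece tokenizer for every language / vocabulary
# size / model type combination, and (optionally, see SKIP_KENLM) trains
# KenLM 5-gram models on both the tokenized and the untokenized openings.

# Languages to process. Hyphen-separated entries (e.g. "no-nn") are combined
# corpora, built by concatenating and shuffling the openings of each
# component language.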
LANGUAGES_WIKIPEDIA=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
LANGUAGES_OSCAR=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
MODEL_TYPES=("bpe" "unigram")
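
# Sampling and vocabulary settings. EXTRA_IDS <extra_id_*> tokens are added as
# user-defined symbols (T5-style sentinel tokens). SMALL_VOCAB_SIZE is otherwise
# unused here and is presumably intended for the small-corpus fallback (see the
# sketch after spm_train below).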
NDOC_FOR_LM=1000000
VOCAB_SIZES=(16000 32000 64000)
SMALL_VOCAB_SIZE=16000
EXTRA_IDS=100
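
# SKIP_KENLM toggles the KenLM stage; the remaining flags are forwarded to
# cc_net/get_wiki_cirrus.py when extracting Wikipedia openings.
# NDOC_FOR_LM_OSCAR caps how many OSCAR documents are downloaded per language.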
SKIP_KENLM=True
REMOVE_ACCENTS=False
LOWER_CASE=False
NORMALIZE_NUMBERS=True
NORMALIZE_PUNCT=1

NDOC_FOR_LM_OSCAR=1000000
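
# Train the tokenizer (and optionally KenLM models) for one language on one
# dataset. Arguments: language code, dataset ("wikipedia" or "oscar"),
# vocabulary size, number of sentences sampled by spm_train, model type.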
train_language_and_dataset () {
    local lang=$1
    local dataset=$2
    local vocab_size=$3
    local vocab_ndoc=$4
    local model_type=$5

    local model_extra_ids=""
    local extra_ids=$(python -c "print('--user_defined_symbols='+','.join([f'<extra_id_{i}>' for i in range($EXTRA_IDS)]))")
    if [ "$EXTRA_IDS" = 0 ]; then
        model_extra_ids=""
    else
        model_extra_ids=".${EXTRA_IDS}extra"
    fi
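
    # Combined corpora ("xx-yy-..."): first recurse into each component
    # language, then concatenate and shuffle their opening files into a single
    # training file for the combined code.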
    if [[ "$lang" == *"-"* ]]; then
        echo "Set of languages: ${lang}"
        for sublang in $(echo $lang | tr "-" "\n")
        do
            train_language_and_dataset "$sublang" "$dataset" "$vocab_size" "$vocab_ndoc" "$model_type"
        done
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "${dataset} openings were already extracted for ${lang}"
        else
            touch "data/${dataset}/cirrus/gz/${lang}.json.gz"
            touch "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            echo "Combining and shuffling languages: ${lang}"
            for sublang in $(echo $lang | tr "-" "\n")
            do
                cat "data/${dataset}/cirrus/gz/${sublang}.opening.txt" >> "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            done
            shuf "data/${dataset}/cirrus/gz/${lang}.opening.tmp" -o "data/${dataset}/cirrus/gz/${lang}.opening.txt"
            rm "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
        fi
    fi
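
    # Download the raw corpus and extract openings for a single language,
    # unless the files are already present: Wikipedia goes through the cirrus
    # dump, OSCAR is fetched directly as an opening file via Hugging Face.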
    if [ "$dataset" = "wikipedia" ]; then
        if [ -f "data/${dataset}/cirrus/gz/${lang}.json.gz" ]; then
            echo "${lang} Wikipedia cirrus was already downloaded."
        else
            echo "Downloading ${lang}"
            mkdir -p "data/${dataset}/cirrus/gz/"
            python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418
            echo "Downloaded Wikipedia cirrus for ${lang}"
        fi

        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "Wikipedia openings were already extracted for ${lang}"
        else
            echo "Extracting ${lang}"
            python cc_net/get_wiki_cirrus.py opening \
                --n_docs ${NDOC_FOR_LM} \
                --file "data/${dataset}/cirrus/gz/${lang}.json.gz" \
                --output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --accent ${REMOVE_ACCENTS} \
                --case ${LOWER_CASE} \
                --numbers ${NORMALIZE_NUMBERS} \
                --punct ${NORMALIZE_PUNCT}
        fi
    else
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "OSCAR openings were already extracted for ${lang}"
        else
            echo "Downloading OSCAR ${lang}"
            mkdir -p "data/${dataset}/cirrus/gz/"
            python cc_net/get_hf_dataset.py dl \
                --dataset "${dataset}" \
                --output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --name "unshuffled_deduplicated_${lang}" \
                --split "train" \
                --max_docs $NDOC_FOR_LM_OSCAR
        fi
    fi

    local model_name="${lang}_${vocab_size}_${model_type}${model_extra_ids}"

    if [ -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then
        echo "SentencePiece tokenizer was already trained for ${model_name}"
    else
        echo "Training SentencePiece tokenizer for ${lang}_${vocab_size}_${model_type}"
        mkdir -p "data/${dataset}/lm_sp"
        ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
            --vocab_size=${vocab_size} --hard_vocab_limit \
            --character_coverage=1.0 \
            --model_type=${model_type} \
            --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
            --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
            --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \
            || echo "WARNING: Corpus is too small, will train smaller model"
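
        # The warning above promises a smaller model, but no fallback is run
        # here. A minimal sketch of that fallback (an assumption, not part of
        # the original pipeline): if the first run produced no model, retrain
        # with SMALL_VOCAB_SIZE while keeping the same model prefix.
        if [ ! -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then
            ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
                --vocab_size=${SMALL_VOCAB_SIZE} --hard_vocab_limit \
                --character_coverage=1.0 \
                --model_type=${model_type} \
                --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
                --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
                --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \
                || echo "WARNING: fallback SentencePiece training also failed for ${model_name}"
        fi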

        echo "Trained SentencePiece model with $(wc -l < "data/${dataset}/lm_sp/${model_name}.sp.vocab") pieces"
    fi
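
    # Optional KenLM stage: tokenize the openings with the freshly trained
    # SentencePiece model, then train 5-gram KenLM models on both the
    # tokenized and the raw (untokenized) openings and binarize them.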
    if [ "$SKIP_KENLM" = "False" ]; then

        if [ -f "data/${dataset}/cirrus/sp/${lang}.opening.txt" ]; then
            echo "Openings dataset already tokenized for ${model_name}"
        else
            mkdir -p "data/${dataset}/cirrus/sp"
            echo "Tokenizing openings dataset for ${model_name}"
            ./bin/spm_encode \
                --model="data/${dataset}/lm_sp/${model_name}.sp.model" \
                --output_format=piece \
                "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/cirrus/sp/${lang}.opening.txt"
            echo "Tokenized openings dataset for ${model_name}"
        fi

        if [ -f "data/${dataset}/lm_sp/${model_name}.arpa" ] || [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then
            echo "KenLM model already trained for ${model_name}"
        else
            echo "Training KenLM model for ${model_name}"
            mkdir -p tmp
            ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback \
                < "data/${dataset}/cirrus/sp/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}.arpa"
            echo "Trained KenLM model for ${model_name}"
        fi

        if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" ]; then
            echo "KenLM model already trained for ${model_name}_untokenized"
        else
            echo "Training KenLM model for ${model_name}_untokenized"
            mkdir -p tmp
            ./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback --skip_symbols \
                < "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}_untokenized.arpa"
            echo "Trained KenLM model for ${model_name}_untokenized"
        fi
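
        # Binarize the ARPA models with KenLM's build_binary. The tokenized
        # ARPA file is deleted after conversion; the untokenized one is kept.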
        if [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then
            echo "KenLM model already converted to binary for ${model_name}"
        else
            echo "Converting KenLM model to binary for ${model_name}"
            ./bin/build_binary "data/${dataset}/lm_sp/${model_name}.arpa" "data/${dataset}/lm_sp/${model_name}.arpa.bin"
            echo "Converted KenLM model to binary for ${model_name}"
            rm "data/${dataset}/lm_sp/${model_name}.arpa"
        fi

        if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa.bin" ]; then
            echo "KenLM model already converted to binary for ${model_name}_untokenized"
        else
            echo "Converting KenLM model to binary for ${model_name}_untokenized"
            ./bin/build_binary "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" "data/${dataset}/lm_sp/${model_name}_untokenized.arpa.bin"
            echo "Converted KenLM model to binary for ${model_name}_untokenized"
        fi
    fi
}
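
# Sweep: every model type x vocabulary size x language, for both Wikipedia
# and OSCAR.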
for model_type in "${MODEL_TYPES[@]}"
do
    for vocab_size in "${VOCAB_SIZES[@]}"
    do
        echo -e "\n--------------------\nVocab: ${vocab_size}. Model: ${model_type}\n--------------------\n"
        for lang in "${LANGUAGES_WIKIPEDIA[@]}"
        do
            train_language_and_dataset "$lang" wikipedia "$vocab_size" "$NDOC_FOR_LM" "$model_type"
        done

        for lang in "${LANGUAGES_OSCAR[@]}"
        do
            train_language_and_dataset "$lang" oscar "$vocab_size" "$NDOC_FOR_LM_OSCAR" "$model_type"
        done
    done
done