#!/bin/bash
set -e
# Languages to train on
#LANGUAGES_WIKIPEDIA=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "ta" "te" "yo" )
#LANGUAGES_OSCAR=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "te" )
LANGUAGES_WIKIPEDIA=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
LANGUAGES_OSCAR=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
MODEL_TYPES=("bpe" "unigram")
NDOC_FOR_LM=1000000
VOCAB_SIZES=(16000 32000 64000) # 65536
SMALL_VOCAB_SIZE=16000
EXTRA_IDS=100
# Normalization parameters
SKIP_KENLM=True
REMOVE_ACCENTS=False
LOWER_CASE=False
NORMALIZE_NUMBERS=True
NORMALIZE_PUNCT=1
# OSCAR
NDOC_FOR_LM_OSCAR=1000000
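# train_language_and_dataset LANG DATASET VOCAB_SIZE N_DOCS MODEL_TYPE
# Downloads/extracts the "opening" text for LANG from DATASET (wikipedia or
# oscar), trains a SentencePiece tokenizer on it and, unless SKIP_KENLM=True,
# also trains tokenized and untokenized 5-gram KenLM models plus their binary
# conversions. A hyphen-separated LANG (e.g. "no-nn") is handled as the
# concatenated, shuffled corpus of its individual languages.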
train_language_and_dataset () {
local lang=$1
local dataset=$2
local vocab_size=$3
local vocab_ndoc=$4
local model_type=$5
local extra_ids=""
local model_extra_ids=""
if [ "$EXTRA_IDS" -gt 0 ]; then
extra_ids=$(python -c "print('--user_defined_symbols='+','.join([f'<extra_id_{i}>' for i in range($EXTRA_IDS)]))")
model_extra_ids=".${EXTRA_IDS}extra"
fi
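# With EXTRA_IDS=100 the flag expands to
# --user_defined_symbols=<extra_id_0>,<extra_id_1>,...,<extra_id_99>
# and ".100extra" is appended to the model name.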
if [[ "$lang" == *"-"* ]]; then
echo "Set of languages: ${lang}"
for sublang in $(echo $lang | tr "-" "\n")
do
train_language_and_dataset "$sublang" "$dataset" "$vocab_size" "$vocab_ndoc" "$model_type"
done
if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
echo "${dataset} openings were alerady extracted for ${lang}"
else
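# Touch an empty placeholder .json.gz so the per-dataset download check below
# also passes for the combined corpus, then concatenate the per-language
# openings and shuffle them into a single combined openings file.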
touch "data/${dataset}/cirrus/gz/${lang}.json.gz"
touch "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
echo "Combining and shuffling languages: ${lang}"
for sublang in $(echo $lang | tr "-" "\n")
do
cat "data/${dataset}/cirrus/gz/${sublang}.opening.txt" >> "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
done
shuf "data/${dataset}/cirrus/gz/${lang}.opening.tmp" -o "data/${dataset}/cirrus/gz/${lang}.opening.txt"
rm "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
fi
fi
if [ "$dataset" = "wikipedia" ]; then
# 1 Download Wikipedia cirrus
if [ -f "data/${dataset}/cirrus/gz/${lang}.json.gz" ]; then
echo "${lang} Wikipedia cirrus was already downloaded."
else
echo "Downloading ${lang}"
mkdir -p "data/${dataset}/cirrus/gz/"
python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418
echo "Downloaded Wikipedia cirrus for ${lang}"
fi
# 2 Extract opening text of each article
if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
echo "Wikipedia openings were already extracted for ${lang}"
else
echo "Extracting ${lang}"
python cc_net/get_wiki_cirrus.py opening \
--n_docs ${NDOC_FOR_LM} \
--file "data/${dataset}/cirrus/gz/${lang}.json.gz" \
--output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
--accent ${REMOVE_ACCENTS} \
--case ${LOWER_CASE} \
--numbers ${NORMALIZE_NUMBERS} \
--punct ${NORMALIZE_PUNCT}
fi
else
# 1 & 2 Download and preprocess dataset from HF hub
if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
echo "OSCAR openings were already extracted for ${lang}"
else
echo "Downloading OSCAR ${lang}"
mkdir -p "data/${dataset}/cirrus/gz/"
python cc_net/get_hf_dataset.py dl \
--dataset "${dataset}" \
--output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
--name "unshuffled_deduplicated_${lang}" \
--split "train" \
--max_docs $NDOC_FOR_LM_OSCAR
fi
fi
local model_name="${lang}_${vocab_size}_${model_type}${model_extra_ids}"
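# e.g. lang=no, vocab_size=32000, model_type=bpe, EXTRA_IDS=100
# -> model_name="no_32000_bpe.100extra"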
# 3 Train sentence piece tokenizer
if [ -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then
echo "Sentence piece tokenizer was already trained for ${model_name}"
else
echo "Training sentence piece tokenizer for ${lang}_${vocab_size}_${model_type}"
mkdir -p "data/${dataset}/lm_sp"
./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
--vocab_size=${vocab_size} --hard_vocab_limit \
--character_coverage=1.0 \
--model_type=${model_type} \
--bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
--input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
--model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \
|| echo "WARNING: Corpus is too small, will train smaller model" #&& \
#./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
# --vocab_size=${SMALL_VOCAB_SIZE} \
# --character_coverage=1.0 \
# --model_type=${model_type} \
# --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
# --model_prefix="data/${dataset}/lm_sp/${lang}_${vocab_size}.sp"
echo "Trained SentencePiece model with $(wc -l data/"${dataset}"/lm_sp/"${lang}"_"${vocab_size}"_"${model_type}${model_extra_ids}".sp.vocab) pieces"
fi
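# Illustrative check of the trained tokenizer (hypothetical sentence, not part
# of the pipeline):
# ./bin/spm_encode --model="data/${dataset}/lm_sp/${model_name}.sp.model" <<< "Dette er en test."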
if [ "$SKIP_KENLM" = "False" ]; then
# 4 Tokenize openings dataset
if [ -f "data/${dataset}/cirrus/sp/${lang}.opening.txt" ]; then
echo "Openings dataset already tokenized for ${model_name}"
else
mkdir -p "data/${dataset}/cirrus/sp"
echo "Tokenizing openings dataset for ${model_name}"
./bin/spm_encode \
--model="data/${dataset}/lm_sp/${model_name}.sp.model" \
--output_format=piece \
"data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/cirrus/sp/${lang}.opening.txt"
echo "Tokenized openings dataset for ${model_name}"
fi
# 5 Train KenLM model on tokenized dataset
if [ -f "data/${dataset}/lm_sp/${model_name}.arpa" ] || [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then
echo "KenLM model already trained for ${model_name}"
else
echo "Training KenLM model for ${model_name}"
mkdir -p tmp
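# lmplz: -o 5 trains a 5-gram model, -S 8G caps memory, -T tmp keeps temporary
# files local; --vocab_estimate only sizes internal data structures, and
# --discount_fallback lets training proceed on corpora where the Kneser-Ney
# discounts cannot be estimated (e.g. very small or heavily deduplicated text).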
./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback \
< "data/${dataset}/cirrus/sp/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}.arpa"
echo "Trained KenLM model for ${model_name}"
fi
if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" ] ; then
echo "KenLM model already trained for ${model_name}_untokenized"
else
echo "Training KenLM model for ${model_name}_untokenized"
mkdir -p tmp
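# Same 5-gram training, but on the raw (untokenized) openings; --skip_symbols
# treats <s>, </s> and <unk> occurring in the text as whitespace instead of
# aborting with an error.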
./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback --skip_symbols \
< "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}_untokenized.arpa"
echo "Trained KenLM model for ${model_name}_untokenized"
fi
# 6 Convert KenLM model to binary
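# The binary format is smaller than the text ARPA file and loads much faster
# (KenLM can mmap it), so the tokenized ARPA is deleted after conversion.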
if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" ]; then
echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
else
echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin"
echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa"
fi
if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" ]; then
echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
else
echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin"
echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
# rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}_untokenized.arpa"
fi
fi
}
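# Illustrative single run (uncomment to train one combination instead of the
# full sweep below); values are taken from the configuration at the top:
# train_language_and_dataset "no" wikipedia 32000 "$NDOC_FOR_LM" bpe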
for model_type in "${MODEL_TYPES[@]}"
do
for vocab_size in "${VOCAB_SIZES[@]}"
do
echo -e "\n--------------------\nVocab: ${vocab_size}. Model: ${model_type}\n--------------------\n"
for lang in "${LANGUAGES_WIKIPEDIA[@]}"
do
train_language_and_dataset "$lang" wikipedia "$vocab_size" "$NDOC_FOR_LM" "$model_type"
done
for lang in "${LANGUAGES_OSCAR[@]}"
do
train_language_and_dataset "$lang" oscar "$vocab_size" "$NDOC_FOR_LM_OSCAR" "$model_type"
done
done
done
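# After a full run, artifacts are laid out roughly as follows (per dataset):
#   data/<dataset>/cirrus/gz/<lang>.opening.txt                extracted openings
#   data/<dataset>/lm_sp/<model_name>.sp.model / .sp.vocab     SentencePiece tokenizers
#   data/<dataset>/lm_sp/<model_name>{,_untokenized}.arpa.bin  KenLM models (only when SKIP_KENLM=False)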