#!/bin/bash
set -e
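
# Trains SentencePiece tokenizers (BPE and unigram, several vocab sizes) on article
# openings from Wikipedia cirrus dumps and from OSCAR, for the languages listed below.
# With SKIP_KENLM=False it also trains 5-gram KenLM models on the same data.
# Expects the cc_net helpers (cc_net/get_wiki_cirrus.py, cc_net/get_hf_dataset.py)
# and the spm_train, spm_encode, lmplz and build_binary binaries under ./bin/.
# Usage: ./train_all.sh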
# Languages to train on
#LANGUAGES_WIKIPEDIA=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "ta" "te" "yo" )
#LANGUAGES_OSCAR=( "es" "af" "ar" "arz" "as" "bn" "fr" "sw" "eu" "ca" "zh" "en" "hi" "ur" "id" "pt" "vi" "gu" "kn" "ml" "mr" "te" )
LANGUAGES_WIKIPEDIA=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
LANGUAGES_OSCAR=( "es" "no" "nn" "is" "da" "en" "fr" "de" "sv" "no-da-en-sv-nn-is" "no-nn" )
MODEL_TYPES=("bpe" "unigram")
NDOC_FOR_LM=1000000
VOCAB_SIZES=(16000 32000 64000) # 65536
SMALL_VOCAB_SIZE=16000
EXTRA_IDS=100
# Normalization parameters
SKIP_KENLM=True
REMOVE_ACCENTS=False
LOWER_CASE=False
NORMALIZE_NUMBERS=True
NORMALIZE_PUNCT=1
# OSCAR
NDOC_FOR_LM_OSCAR=1000000
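
# Train one tokenizer (and, when SKIP_KENLM=False, the matching KenLM models) for a
# single language/dataset combination.
# Arguments: lang, dataset ("wikipedia" or "oscar"), vocab_size,
#            number of sentences to sample for spm_train, model_type ("bpe" or "unigram").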
train_language_and_dataset () {
    local lang=$1
    local dataset=$2
    local vocab_size=$3
    local vocab_ndoc=$4
    local model_type=$5
    local model_extra_ids=""
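    # Build --user_defined_symbols=<extra_id_0>,...,<extra_id_N-1> so the tokenizer
    # reserves EXTRA_IDS T5-style sentinel tokens in its vocabulary.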
    local extra_ids=$(python -c "print('--user_defined_symbols='+','.join([f'<extra_id_{i}>' for i in range($EXTRA_IDS)]))")
    if [ "$EXTRA_IDS" = 0 ]; then
        model_extra_ids=""
    else
        model_extra_ids=".${EXTRA_IDS}extra"
    fi
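    # Hyphenated codes (e.g. "no-nn") are combined corpora: process each sub-language first,
    # then concatenate and shuffle their openings into a single file.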
    if [[ "$lang" == *"-"* ]]; then
        echo "Set of languages: ${lang}"
        for sublang in $(echo $lang | tr "-" "\n")
        do
            train_language_and_dataset "$sublang" "$dataset" "$vocab_size" "$vocab_ndoc" "$model_type"
        done
        if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
            echo "${dataset} openings were already extracted for ${lang}"
        else
            touch "data/${dataset}/cirrus/gz/${lang}.json.gz"
            touch "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            echo "Combining and shuffling languages: ${lang}"
            for sublang in $(echo $lang | tr "-" "\n")
            do
                cat "data/${dataset}/cirrus/gz/${sublang}.opening.txt" >> "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
            done
            shuf "data/${dataset}/cirrus/gz/${lang}.opening.tmp" -o "data/${dataset}/cirrus/gz/${lang}.opening.txt"
            rm "data/${dataset}/cirrus/gz/${lang}.opening.tmp"
        fi
    fi
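    # Fetch the raw openings: Wikipedia via cc_net's cirrus-dump helper, OSCAR from the Hugging Face Hub.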
if [ "$dataset" = "wikipedia" ]; then
# 1 Download Wikipedia cirrus
if [ -f "data/${dataset}/cirrus/gz/${lang}.json.gz" ]; then
echo "${lang} Wikipedia cirrus was already downloaded."
else
echo "Downloading ${lang}"
mkdir -p "data/${dataset}/cirrus/gz/"
python cc_net/get_wiki_cirrus.py dl --lang "${lang}" --output_dir "data/${dataset}/cirrus/gz" --date 20220418
echo "Downloaded Wikipedia cirrus for ${lang}"
fi
# 2 Extract opening text of each article
if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
echo "Wikipedia openings were already extracted for ${lang}"
else
echo "Extracting ${lang}"
python cc_net/get_wiki_cirrus.py opening \
--n_docs ${NDOC_FOR_LM} \
--file "data/${dataset}/cirrus/gz/${lang}.json.gz" \
--output "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
--accent ${REMOVE_ACCENTS} \
--case ${LOWER_CASE} \
--numbers ${NORMALIZE_NUMBERS} \
--punct ${NORMALIZE_PUNCT}
fi
else
# 1 & 2 Download and preprocess dataset from HF hub
if [ -f "data/${dataset}/cirrus/gz/${lang}.opening.txt" ]; then
echo "OSCAR openings were already extracted for ${lang}"
else
echo "Downloading OSCAR ${lang}"
mkdir -p "data/${dataset}/cirrus/gz/"
python cc_net/get_hf_dataset.py dl \
--dataset "${dataset}" \
--output_file "data/${dataset}/cirrus/gz/${lang}.opening.txt" \
--name "unshuffled_deduplicated_${lang}" \
--split "train" \
--max_docs $NDOC_FOR_LM_OSCAR
fi
fi
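    # Models are named <lang>_<vocab_size>_<model_type>[.<EXTRA_IDS>extra], e.g. no_32000_unigram.100extra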
    local model_name="${lang}_${vocab_size}_${model_type}${model_extra_ids}"
    # 3 Train sentence piece tokenizer
    if [ -f "data/${dataset}/lm_sp/${model_name}.sp.model" ]; then
        echo "Sentence piece tokenizer was already trained for ${model_name}"
    else
        echo "Training sentence piece tokenizer for ${model_name}"
        mkdir -p "data/${dataset}/lm_sp"
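        # Special token ids below follow the T5 convention: pad=0, eos=1, unk=2 and no bos (-1).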
        ./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
            --vocab_size=${vocab_size} --hard_vocab_limit \
            --character_coverage=1.0 \
            --model_type=${model_type} \
            --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
            --input_sentence_size=${vocab_ndoc} --shuffle_input_sentence=true \
            --model_prefix="data/${dataset}/lm_sp/${model_name}.sp" ${extra_ids} \
            || echo "WARNING: Corpus is too small for vocab size ${vocab_size}, skipping ${model_name}" #&& \
            #./bin/spm_train --input="data/${dataset}/cirrus/gz/${lang}.opening.txt" \
            #    --vocab_size=${SMALL_VOCAB_SIZE} \
            #    --character_coverage=1.0 \
            #    --model_type=${model_type} \
            #    --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 \
            #    --model_prefix="data/${dataset}/lm_sp/${lang}_${vocab_size}.sp"
        echo "Trained SentencePiece model with $(wc -l < "data/${dataset}/lm_sp/${model_name}.sp.vocab") pieces"
    fi
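    # Steps 4-6 build KenLM models from the same openings (as used for perplexity
    # filtering in cc_net); they only run when SKIP_KENLM is set to "False".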
if [ "$SKIP_KENLM" = "False" ]; then
# 4 Tokenize openings dataset
if [ -f "data/${dataset}/cirrus/sp/${lang}.opening.txt" ]; then
echo "Openings dataset already tokenized for ${model_name}"
else
mkdir -p "data/${dataset}/cirrus/sp"
echo "Tokenizing openings dataset for ${model_name}"
./bin/spm_encode \
--model="data/${dataset}/lm_sp/${model_name}.sp.model" \
--output_format=piece \
"data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/cirrus/sp/${lang}.opening.txt"
echo "Tokenized openings dataset for ${model_name}"
fi
# 5 Train KenLM model on tokenized dataset
if [ -f "data/${dataset}/lm_sp/${model_name}.arpa" ] || [ -f "data/${dataset}/lm_sp/${model_name}.arpa.bin" ]; then
echo "KenLM model already trained for ${model_name}"
else
echo "Training KenLM model for ${model_name}"
mkdir -p tmp
./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback \
< "data/${dataset}/cirrus/sp/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}.arpa"
echo "Trained KenLM model for ${model_name}"
fi
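        # 5b Train a second KenLM model on the raw (untokenized) openings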
if [ -f "data/${dataset}/lm_sp/${model_name}_untokenized.arpa" ] ; then
echo "KenLM model already trained for ${model_name}_untokenized"
else
echo "Training KenLM model for ${model_name}_untokenized"
mkdir -p tmp
./bin/lmplz -o 5 -S 8G -T tmp --vocab_estimate ${vocab_size} --discount_fallback --skip_symbols \
< "data/${dataset}/cirrus/gz/${lang}.opening.txt" > "data/${dataset}/lm_sp/${model_name}_untokenized.arpa"
echo "Trained KenLM model for ${model_name}_untokenized"
fi
# 6 Convert KenLM model to binary
if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin" ]; then
echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
else
echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa.bin"
echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}"
rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}.arpa"
fi
if [ -f "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin" ]; then
echo "KenLM model already converted to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
else
echo "Converting KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
./bin/build_binary "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa" "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized.arpa.bin"
echo "Converted KenLM model to binary for ${lang}_${vocab_size}_${model_type}${model_extra_ids}_untokenized"
# rm "data/${dataset}/lm_sp/${lang}_${vocab_size}_${model_type}_untokenized.arpa"
fi
fi
}
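
# Sweep: every model type x vocab size x language, for both Wikipedia and OSCAR.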
for model_type in "${MODEL_TYPES[@]}"
do
    for vocab_size in "${VOCAB_SIZES[@]}"
    do
        echo -e "\n--------------------\nVocab: ${vocab_size}. Model: ${model_type}\n--------------------\n"
        for lang in "${LANGUAGES_WIKIPEDIA[@]}"
        do
            train_language_and_dataset "$lang" wikipedia "$vocab_size" "$NDOC_FOR_LM" "$model_type"
        done
        for lang in "${LANGUAGES_OSCAR[@]}"
        do
            train_language_and_dataset "$lang" oscar "$vocab_size" "$NDOC_FOR_LM_OSCAR" "$model_type"
        done
    done
done