#!/usr/bin/env bash
#
# Fine-tune an MMS (wav2vec2) CTC adapter for Kannada on Google FLEURS
# and push the result to the Hugging Face Hub.
#
# NOTE: the original file had a blank line before the shebang, so the
# interpreter line was never honored by the kernel; it must be line 1.
# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Target-language settings for Google FLEURS.
# NOTE(review): LANG doubles as the POSIX locale environment variable; this
# assignment shadows the user's locale for locale-sensitive child processes.
# It is consumed below as the FLEURS dataset config name ("kn_in") — consider
# a dedicated variable name if locale issues appear.
LANG=kn_in

# ISO 639-3 code for Kannada; passed as --target_language to the trainer.
LANG_ISO_3=kan

# Human-readable language name (informational; not consumed below —
# note LANGUAGE is also a gettext environment variable).
LANGUAGE=Kannada
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face dataset to fine-tune on.
DATASET="google/fleurs"

# Column of the dataset that holds the reference text (FLEURS uses
# "transcription"); passed as --text_column_name to the trainer.
TEXT_COLUMN="transcription"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Absolute path of this script (archived next to the model later) and the
# directory containing it (used to locate the training entry point).
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
# Quote the inner $(dirname …) — the original left it unquoted (SC2046),
# which breaks if the path contains spaces.
SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
|
|
|
|
|
# Torch-distributed / DeepSpeed rendezvous settings for a single-node,
# single-process run. MASTER_PORT may be overridden from the environment;
# default to 29500 (the torch.distributed default).
export MASTER_PORT="${MASTER_PORT:-29500}"
# Fixed typo: message said "deepspeech" but these are deepspeed/torch
# distributed rendezvous variables.
echo "Using master_port for deepspeed: ${MASTER_PORT}"

export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"
|
|
|
|
|
# Model family tag used in the published model name.
MODEL="w2v2"

# Pre-trained checkpoint to adapt (Meta MMS, 1B params, FLEURS-102 languages).
BASE_MODEL="facebook/mms-1b-fl102"

# Strip the region suffix from the FLEURS config (e.g. "kn_in" -> "kn").
JUST_LANG="${LANG%%_*}"

# Name under which the fine-tuned model is saved and pushed to the Hub.
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}-GF"

# Local directory that receives checkpoints, logs, and the final model.
OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"
|
|
|
|
|
|
|
# Training hyper-parameters.
MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
# NOTE(review): EVAL_BATCH_SIZE is defined here but the original command never
# passes --per_device_eval_batch_size, so the trainer falls back to its own
# default — verify which eval batch size is intended.
EVAL_BATCH_SIZE=4
LEARNING_RATE="1e-3"

# Evaluate and checkpoint every N optimizer steps.
EVAL_STEPS="1000"
SAVE_STEPS="1000"

# Quoted (was unquoted) so a path with spaces cannot word-split.
mkdir -p "${OUTDIR}"
|
|
|
|
|
|
|
|
|
|
|
|
|
echo "================ TRAINING: START ================"

# Launch CTC-adapter fine-tuning, mirroring all output into a log file.
# Fix: --per_device_eval_batch_size was missing, so EVAL_BATCH_SIZE was
# defined but never used and the trainer silently used its own default.
# NOTE(review): without `set -o pipefail` the pipeline's exit status is
# tee's, and the unconditional `exit 0` below masks training failures —
# confirm no caller relies on this script's exit code.
python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" \
    --dataset_name="${DATASET}" \
    --model_name_or_path="${BASE_MODEL}" \
    --dataset_config_name="${LANG}" \
    --target_language="${LANG_ISO_3}" \
    --output_dir="${OUTDIR}" \
    --num_train_epochs="${MAX_EPOCHS}" \
    --per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
    --per_device_eval_batch_size="${EVAL_BATCH_SIZE}" \
    --gradient_accumulation_steps="16" \
    --learning_rate="${LEARNING_RATE}" \
    --warmup_steps="100" \
    --evaluation_strategy="steps" \
    --text_column_name="${TEXT_COLUMN}" \
    --length_column_name="input_length" \
    --save_steps="${SAVE_STEPS}" \
    --eval_steps="${EVAL_STEPS}" \
    --save_total_limit="3" \
    --optim="adamw_bnb_8bit" \
    --hub_model_id "simpragma/${MY_MODEL}" \
    --gradient_checkpointing \
    --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \
    --fp16 \
    --group_by_length \
    --do_train \
    --do_eval \
    --push_to_hub \
    --overwrite_output_dir \
    | tee "${OUTDIR}/${MY_MODEL}.log"

# Archive this launcher next to the trained model for reproducibility.
cp "${SCRIPT_PATH}" "${OUTDIR}"

echo "================ TRAINING: DONE ================"

exit 0
|
|