#!/bin/bash
# Select the GPU to train on (single-GPU run).
export CUDA_VISIBLE_DEVICES=0

# Positional arguments (with defaults): data dir, base model, output dir,
# and comma-separated source/target language lists (FLORES-200 codes).
data_dir=${1:-"en-indic-exp"}
model_name=${2:-"ai4bharat/indictrans2-en-indic-dist-200M"}
output_dir=${3:-"output"}
src_lang_list=${4:-"eng_Latn"}
tgt_lang_list=${5:-"asm_Beng,ben_Beng,guj_Gujr,hin_Deva,kan_Knda,mal_Mlym,mar_Deva,npi_Deva,ory_Orya,pan_Guru,tam_Taml,tel_Telu,urd_Arab"}
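
# Example invocation overriding the defaults (paths here are hypothetical;
# assumes this script is saved as train_lora.sh):
#   bash train_lora.sh ./my-en-indic-data ai4bharat/indictrans2-en-indic-dist-200M ./lora-output "eng_Latn" "hin_Deva,tam_Taml"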

# Fine-tune the base checkpoint with LoRA. Early stopping tracks eval_BLEU
# (higher is better) with a patience of 10 evaluations; checkpoints are
# written to $output_dir every 1000 steps.
python3 train_lora.py \
    --data_dir "$data_dir" \
    --model_name "$model_name" \
    --output_dir "$output_dir" \
    --src_lang_list "$src_lang_list" \
    --tgt_lang_list "$tgt_lang_list" \
    --save_steps 1000 \
    --max_steps 1000000 \
    --batch_size 32 \
    --grad_accum_steps 4 \
    --warmup_steps 4000 \
    --max_grad_norm 1.0 \
    --learning_rate 2e-4 \
    --adam_beta1 0.9 \
    --adam_beta2 0.98 \
    --optimizer adamw_torch \
    --lr_scheduler inverse_sqrt \
    --num_workers 16 \
    --metric_for_best_model eval_BLEU \
    --greater_is_better \
    --patience 10 \
    --weight_decay 0.01 \
    --lora_target_modules "q_proj,k_proj" \
    --lora_dropout 0.1 \
    --lora_r 16 \
    --lora_alpha 32 \
    --print_samples
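
# Notes derived from the flags above:
#   - effective batch size per optimizer update = batch_size x grad_accum_steps
#     = 32 x 4 = 128
#   - LoRA scaling = lora_alpha / lora_r = 32 / 16 = 2, applied to the low-rank
#     updates injected into the q_proj and k_proj attention projections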