# Maximum sequence length; forwarded to SFTTrainer through its
# `max_seq_length` argument further down in this script.
max_seq_length = 500

def fmt(examples):
    """Debug pass-through formatter: print the input's length and return it.

    Prints ``len(examples)`` to stdout and hands back the very same object
    without modifying it.  In the code visible here it is only referenced by
    a commented-out ``formatting_func = fmt`` trainer argument.

    Args:
        examples: Any object that supports ``len()``.

    Returns:
        The ``examples`` object that was passed in, unchanged.

    Raises:
        TypeError: If ``examples`` does not support ``len()``.
    """
    print(len(examples))
    return examples

# 'lora_r' is the dimension of the LoRA attention.
lora_r = 32

# 'lora_alpha' is the alpha parameter for LoRA scaling.
lora_alpha = 16

# 'lora_dropout' is the dropout probability for LoRA layers.
lora_dropout = 0.05

# 'target_modules' is a list of the modules that should be targeted by LoRA.
target_modules = [
    "k_proj",
    "q_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj",
]

# Bundle the LoRA hyperparameters defined above into a LoraConfig for a
# causal-LM task.
# NOTE(review): the `peft_config=peft_config` argument of the SFTTrainer call
# below is commented out, so in the code visible here this object is built
# but not handed to the trainer -- confirm whether that is intentional.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)

# Build the supervised fine-tuning trainer: model, tokenizer, the train/test
# splits of `qa_dataset`, the custom `collator`, and the TrainingArguments
# below.  Commented-out keyword arguments are kept as ready-to-enable toggles.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=qa_dataset["train"],
    eval_dataset=qa_dataset["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    data_collator=collator,
    # formatting_func=fmt,
    # peft_config=peft_config,
    args=TrainingArguments(
        # 8 samples per device x 16 accumulation steps = 128 samples per
        # device for every optimizer step.
        per_device_train_batch_size=8,
        gradient_checkpointing=True,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=40,
        # Evaluate on the eval split every 50 steps.
        do_eval=True,
        eval_strategy="steps",
        eval_steps=50,
        # save_strategy="steps",
        save_steps=1000,

        # Use num_train_epochs and warmup_ratio for longer runs!
        # max_steps=70,
        # warmup_steps=10,
        # warmup_ratio=0.1,
        num_train_epochs=2,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate=5e-5,
        # embedding_learning_rate=1e-6,

        # fp16=not is_bfloat16_supported(),
        bf16=True,
        logging_steps=1,
        optim="adamw_torch",
        weight_decay=0.0,
        lr_scheduler_type="linear",
        # seed=3407,

        output_dir="llama_1b_step2_batch_grad_v5",
    ),
)