# Scraped page header (not code): danielgombas's picture
# Repository: danielgombas/llama_1b_step2_batch_grad_v5
# Revision: 0504c67 (verified)
# Sequence-length cap handed to SFTTrainer through its `max_seq_length`
# argument (presumably measured in tokens -- confirm for the TRL version used).
max_seq_length = 500
def fmt(examples):
    """Debug pass-through: print the size of ``examples`` and return it as-is.

    In the visible code this helper is only referenced by a commented-out
    ``formatting_func`` argument of the ``SFTTrainer`` call.

    Args:
        examples: Any sized object (it must support ``len()``).

    Returns:
        The very same ``examples`` object, unmodified.
    """
    # Printing the length is the sole purpose of this helper, so the output
    # is intentionally kept as a plain print.
    print(len(examples))
    return examples
# --- LoRA hyperparameters ----------------------------------------------------
# LoRA attention dimension (the rank `r`).
lora_r = 32
# Alpha parameter used for LoRA scaling.
lora_alpha = 16
# Dropout probability applied in the LoRA layers.
lora_dropout = 0.05
# Names of the modules that LoRA should target.
target_modules = [
    "k_proj",
    "q_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "down_proj",
    "up_proj",
]

# Bundle the settings above into a LoRA configuration for a causal LM task.
# NOTE(review): the SFTTrainer call in this file has `peft_config=peft_config`
# commented out, so this object appears to be unused there -- confirm intent.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)
# Build the supervised fine-tuning trainer: it trains on qa_dataset["train"],
# evaluates on qa_dataset["test"], and reads samples from the "text" field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=qa_dataset["train"],
    eval_dataset=qa_dataset["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    data_collator=collator,
    # Currently disabled options, kept for reference:
    #   formatting_func=fmt,
    #   peft_config=peft_config,
    args=TrainingArguments(
        # -- Batching and memory --
        per_device_train_batch_size=8,
        gradient_checkpointing=True,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=40,
        # -- Evaluation: step-based, every 50 steps --
        do_eval=True,
        eval_strategy="steps",
        eval_steps=50,
        # -- Checkpointing (save_strategy="steps" is left disabled) --
        save_steps=1000,
        # -- Run length --
        # Use num_train_epochs and warmup_ratio for longer runs!
        # Disabled alternatives: max_steps=70, warmup_steps=10,
        # warmup_ratio=0.1.
        num_train_epochs=2,
        # -- Optimisation --
        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        # (embedding_learning_rate=1e-6 is left disabled.)
        learning_rate=5e-5,
        # Disabled alternative: fp16=not is_bfloat16_supported().
        bf16=True,
        logging_steps=1,
        optim="adamw_torch",
        weight_decay=0.0,
        lr_scheduler_type="linear",
        # No explicit seed is passed (seed=3407 is left disabled).
        output_dir="llama_1b_step2_batch_grad_v5",
    ),
)