max_seq_length = 500 def fmt(examples): print(len(examples)) return examples # 'lora_r' is the dimension of the LoRA attention. lora_r = 32 # 'lora_alpha' is the alpha parameter for LoRA scaling. lora_alpha = 16 # 'lora_dropout' is the dropout probability for LoRA layers. lora_dropout = 0.05 # 'target_modules' is a list of the modules that should be targeted by LoRA. target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"] # 'se peft_config = LoraConfig( r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, task_type=TaskType.CAUSAL_LM, target_modules=target_modules, ) trainer = SFTTrainer( model = model, tokenizer = tokenizer, train_dataset = qa_dataset['train'], eval_dataset = qa_dataset['test'], dataset_text_field = "text", max_seq_length = max_seq_length, dataset_num_proc = 4, data_collator = collator, # formatting_func = fmt, # peft_config=peft_config, args = TrainingArguments( per_device_train_batch_size = 6, gradient_checkpointing = True, gradient_accumulation_steps = 4, per_device_eval_batch_size = 40, do_eval = True, eval_strategy = 'steps', eval_steps = 50, # save_strategy = 'steps', save_steps = 1000, # Use num_train_epochs and warmup_ratio for longer runs! # max_steps = 70, # warmup_steps = 10, # warmup_ratio = 0.1, num_train_epochs = 2, # Select a 2 to 10x smaller learning rate for the embedding matrices! learning_rate = 3e-5, # embedding_learning_rate = 1e-6, # fp16 = not is_bfloat16_supported(), bf16 = True, logging_steps = 1, optim = "adamw_torch", weight_decay = 0.00, lr_scheduler_type = "linear", # seed = 3407, output_dir = "llama_3b_step2_batch_v4", ), )