import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DPOTrainer

# 4-bit NF4 quantization (QLoRA-style) so the 8B model fits on a single GPU.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model and tokenizer onto the GPU.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    device_map="auto",
    local_files_only=False,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(model)

# Format the jondurbin/truthy-dpo-v0.1 dataset into the prompt/chosen/rejected
# columns that DPOTrainer expects.
def return_prompt_and_responses(samples):
    return {
        "prompt": [
            "Question: " + question + "\n\nAnswer: "
            for question in samples["prompt"]
        ],
        "chosen": samples["chosen"],      # preferred response
        "rejected": samples["rejected"],  # dispreferred response
    }

dataset = load_dataset(
    "jondurbin/truthy-dpo-v0.1",
    split="train",
    # data_dir="data/rl"
)
original_columns = dataset.column_names
dataset = dataset.map(  # map() returns a new dataset, so reassign it
    return_prompt_and_responses,
    batched=True,
    remove_columns=original_columns,
)

# Prepare the quantized model for training and define the LoRA adapter.
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=128,
    lora_alpha=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "up_proj", "gate_proj", "down_proj",
        "lm_head",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

output_dir = "./odp"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    optim="adafactor",
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    bf16=True,
    save_steps=100,
    save_total_limit=3,
    logging_steps=10,
)

# With peft_config set, DPOTrainer builds the reference model implicitly
# (the base model with adapters disabled), so no explicit model_ref is needed.
# Passing beta / max_prompt_length / max_length directly here matches older TRL
# releases; recent TRL versions expect these fields on a DPOConfig instead.
dpo_trainer = DPOTrainer(
    model,
    # model_ref,
    args=training_args,
    peft_config=peft_config,
    beta=0.1,
    train_dataset=dataset,
    tokenizer=tokenizer,
    max_prompt_length=1024,
    max_length=2048,
)

dpo_trainer.train()
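
# --- Not part of the original script: a minimal sketch of persisting the result. ---
# Assuming the run above completes, Trainer.save_model() writes the LoRA adapter
# weights (not the full 8B base model) to output_dir, and saving the tokenizer
# alongside it lets the adapter be reloaded later, e.g. via peft.PeftModel.
dpo_trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)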