import warnings

import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline
from trl import SFTTrainer

warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")

# Train on CPU; point this at "cuda" or "mps" instead if a GPU is available.
device = torch.device("cpu")

data_path = "./data_finetunned/coffee_finetuning_20240914_witi_total.jsonl"
dataset = Dataset.from_json(data_path)

print("Dataset loaded")

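# NOTE: generate_prompt() below assumes every JSONL record provides
# "instruction" and "response" string fields.
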
BASE_MODEL = "beomi/gemma-ko-2b"

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def generate_prompt(example):
    """Format a batch of examples into Gemma chat-style training prompts."""
    prompt_list = []
    for i in range(len(example['instruction'])):
        prompt_list.append(f"""<bos><start_of_turn>user
{example['instruction'][i]}<end_of_turn>
<start_of_turn>model
{example['response'][i]}<end_of_turn><eos>""")
    return prompt_list

train_data = dataset

# Sanity check: print the first formatted prompt.
print(generate_prompt(train_data[:1])[0])

lora_config = LoraConfig(
    r=6,           # LoRA rank
    lora_alpha=8,  # scaling factor applied to the adapter output
    lora_dropout=0.05,
    # Adapt every attention and MLP projection in the Gemma blocks.
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # effective batch size of 4
        optim="adamw_torch",
        warmup_ratio=0.03,  # warm up the LR over the first 3% of steps
        learning_rate=2e-4,
        fp16=False,  # mixed precision is unavailable on CPU
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
        use_mps_device=False,
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)

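# Quick check (assuming SFTTrainer wrapped the model with the LoRA adapter):
# report how many parameters are trainable versus frozen.
trainer.model.print_trainable_parameters()
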
trainer.train()

ADAPTER_MODEL = "lora_adapter"
trainer.model.save_pretrained(ADAPTER_MODEL)

# Reload the base model in fp16, attach the adapter, and merge the LoRA
# weights into the base weights so the checkpoint can be served standalone.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('./gemma_outputs/gemma-ko-2b-beans-20240915-01')
print("Model saved")

tokenizer.save_pretrained('./gemma_outputs/gemma-ko-2b-beans-20240915-01')
print("Tokenizer saved")