# 1. Development Environment Setup
# 1.1 Installing the Required Libraries
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U accelerate==0.27.2
# 1.2 Import modules
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# 1.3 Hugging Face Login
from huggingface_hub import notebook_login
notebook_login()
# 2. Dataset Creation and Preparation
# 2.1 Loading the Dataset
from datasets import load_dataset
dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")
# 2.2 Exploring the Dataset
dataset
# 2.3 Dataset Example
dataset['train'][0]
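Each example carries several fields, but only document (the article body) and summary (the reference summary) are used in the rest of this notebook, so a quick peek at just those two is enough. The field names below are taken from the code that follows, not re-verified against the dataset card:

# Inspect only the two fields the fine-tuning actually consumes
print(dataset['train'][0]['document'][:300])
print(dataset['train'][0]['summary'])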
# 3. Testing Korean Summarization with the Gemma Model
# 3.1 Loading the Model
BASE_MODEL = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
# 3.2 Gemma-it Prompt Format
doc = dataset['train']['document'][0]
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
messages = [
    {
        "role": "user",
        "content": "다음 글을 요약해주세요:\n\n{}".format(doc)  # "Please summarize the following text:"
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt
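For reference, Gemma-it's chat template wraps each turn in <start_of_turn>/<end_of_turn> markers, and add_generation_prompt=True appends the opening of the model turn. The rendered string should therefore look roughly like the sketch below (exact whitespace depends on the installed tokenizer version; {article text} is a placeholder for the contents of doc):

<bos><start_of_turn>user
다음 글을 요약해주세요:

{article text}<end_of_turn>
<start_of_turn>model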
# 3.3 Gemma-it Inference
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])
# 4. Fine-tuning Gemma
Note: Because of Colab's GPU memory limit, the memory used for inference in the previous section must be freed before fine-tuning can proceed.
Restart the notebook runtime session, re-run section 1 and section 2 up to item 2.1 to reload, and then continue with the steps below.
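If the runtime is not restarted, a minimal sketch of freeing the section-3 objects first (assuming model, tokenizer, pipe, and outputs are still in scope) would be:

import gc

# Drop references to the inference objects from section 3, then release cached GPU memory
del model, tokenizer, pipe, outputs
gc.collect()
torch.cuda.empty_cache()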
!nvidia-smi
# 4.1 Adjusting the Training Prompt
def generate_prompt(example):
    # Build one Gemma-chat formatted training example per (document, summary) pair
    prompt_list = []
    for i in range(len(example['document'])):
        prompt_list.append(r"""<bos><start_of_turn>user
다음 글을 요약해주세요:
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn><eos>""".format(example['document'][i], example['summary'][i]))
    return prompt_list
train_data = dataset['train']
print(generate_prompt(train_data[:1])[0])
# 4.2 QLoRA Configuration
# LoRA adapter configuration: low-rank updates on the attention and MLP projection layers
lora_config = LoraConfig(
    r=6,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# 4-bit NF4 quantization with fp16 compute for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
BASE_MODEL = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer.padding_side = 'right'
# 4.3 Running the Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        # num_train_epochs = 1,
        max_steps=3000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_ratio=0.03,  # warmup_steps expects an integer; a ratio expresses the intended 3% warmup
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)
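Optionally (not part of the original flow), since SFTTrainer has already wrapped the quantized model with the LoRA config, the size of the trainable adapter can be checked before training starts:

# Reports trainable LoRA parameters vs. frozen base parameters (PeftModel method)
trainer.model.print_trainable_parameters()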
trainer.train()
# 4.4 Saving the Fine-tuned Model
ADAPTER_MODEL = "lora_adapter"
trainer.model.save_pretrained(ADAPTER_MODEL)
!ls -alh lora_adapter
# Reload the base model in fp16, attach the trained LoRA adapter, and merge it into the base weights
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)
model = model.merge_and_unload()
model.save_pretrained('gemma-2b-it-sum-ko')
!ls -alh ./gemma-2b-it-sum-ko
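If the merged model should also be uploaded to the Hugging Face Hub, a minimal sketch would be the following (the repo id is a placeholder, and the token from notebook_login() must have write access):

# Placeholder repo id; replace with your own namespace
model.push_to_hub("your-username/gemma-2b-it-sum-ko")
tokenizer.push_to_hub("your-username/gemma-2b-it-sum-ko")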
# 5. Inference with the Gemma Korean Summarization Model
Note: Likewise, because of Colab's GPU memory limit, the memory used during training must be freed before inference can proceed.
Restart the notebook runtime session, re-run section 1 and section 2 up to item 2.1 to reload, and then continue with the steps below.
!nvidia-smi
# 5.1 Loading the Fine-tuned Model
BASE_MODEL = "google/gemma-2b-it"
FINETUNE_MODEL = "./gemma-2b-it-sum-ko"
finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
# 5.2 Fine-tuned Model Inference
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)
doc = dataset['test']['document'][10]
messages = [
    {
        "role": "user",
        "content": "다음 글을 요약해주세요:\n\n{}".format(doc)
    }
]
prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])
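For a quick qualitative check, the generated summary can be compared against the reference summary for the same test example (assuming the test split carries the same summary field as the train split):

# Reference summary for test example 10, for side-by-side comparison
print(dataset['test']['summary'][10])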