Safetensors
Korean
gemma
sususupa committed
Commit 11133c0 • 1 Parent(s): d4c15a1

Update summary_ko

Files changed (1): summary_ko (+31 -31)
summary_ko CHANGED
@@ -1,6 +1,6 @@
# 1. Setting up the development environment
# 1.1 Installing the required libraries
- In [ ]:
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
@@ -9,7 +9,7 @@ In [ ]:
!pip3 install -q -U accelerate==0.27.2

# 1.2 Import modules
- In [ ]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
@@ -17,35 +17,35 @@ from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# 1.3 Hugging Face login
- In [ ]:
from huggingface_hub import notebook_login
notebook_login()

# 2. Creating and preparing the dataset
# 2.1 Loading the dataset
- In [ ]:
from datasets import load_dataset
dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")
# 2.2 Exploring the dataset
- In [ ]:
dataset
# 2.3 Dataset example
- In [ ]:
dataset['train'][0]
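A minimal sketch for sanity-checking the splits and the columns used later in this notebook ('document' and 'summary'):
print(dataset)                        # shows the available splits and their row counts
print(dataset['train'].column_names)  # should include 'document' and 'summary'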

# 3. Testing the Gemma model on Korean summarization
# 3.1 Loading the model
- In [ ]:
BASE_MODEL = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
# 3.2 Gemma-it prompt format
- In [ ]:
doc = dataset['train']['document'][0]
- In [ ]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
- In [ ]:
messages = [
    {
        "role": "user",
@@ -53,10 +53,10 @@ messages = [
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- In [ ]:
prompt
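With add_generation_prompt=True, the rendered prompt should look roughly like the shape below (a sketch of the expected format only; the user message is whatever was placed in messages above):
<bos><start_of_turn>user
{user message}<end_of_turn>
<start_of_turn>model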
# 3.3 Gemma-it inference
- In [ ]:
outputs = pipe(
    prompt,
    do_sample=True,
@@ -65,16 +65,16 @@ outputs = pipe(
    top_p=0.95,
    add_special_tokens=True
)
- In [ ]:
print(outputs[0]["generated_text"][len(prompt):])

# 4. Fine-tuning Gemma
Note: Because of Colab's GPU memory limit, the memory used for inference in the previous chapter must be freed before fine-tuning can proceed.
Restart the notebook runtime session, re-run sections 1 through 2.1 to reload everything, and then continue with the steps below.
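If you prefer not to restart the runtime, the chapter 3 objects can usually be released in place (a minimal sketch; how much memory is actually reclaimed depends on the session):
import gc
del model, tokenizer, pipe, outputs  # objects created during the chapter 3 inference test
gc.collect()
torch.cuda.empty_cache()  # return cached GPU memory to the driver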
- In [ ]:
!nvidia-smi
# 4.1 Adjusting the training prompt
- In [ ]:
def generate_prompt(example):
    prompt_list = []
    for i in range(len(example['document'])):
@@ -85,11 +85,11 @@ def generate_prompt(example):
<start_of_turn>model
{}<end_of_turn><eos>""".format(example['document'][i], example['summary'][i]))
    return prompt_list
- In [ ]:
train_data = dataset['train']
print(generate_prompt(train_data[:1])[0])
# 4.2 QLoRA configuration
- In [ ]:
lora_config = LoraConfig(
    r=6,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
@@ -101,13 +101,13 @@ bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
- In [ ]:
BASE_MODEL = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer.padding_side = 'right'
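As a quick check that the model really loaded in 4-bit, its memory footprint can be printed (an optional sketch; the exact value will vary):
print(f"{model.get_memory_footprint() / 1e9:.2f} GB")  # should be far smaller than the fp16 footprint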
# 4.3 Running the Trainer
- In [ ]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
@@ -129,42 +129,42 @@ trainer = SFTTrainer(
    peft_config=lora_config,
    formatting_func=generate_prompt,
)
- In [ ]:
trainer.train()
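After training, the logged losses can be read back from the trainer state (an optional sketch):
for entry in trainer.state.log_history:
    if 'loss' in entry:
        print(entry['step'], entry['loss'])  # step number and training loss at that step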
# 4.4 Saving the fine-tuned model
- In [ ]:
ADAPTER_MODEL = "lora_adapter"

trainer.model.save_pretrained(ADAPTER_MODEL)
- In [ ]:
!ls -alh lora_adapter
- In [ ]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('gemma-2b-it-sum-ko')
- In [ ]:
!ls -alh ./gemma-2b-it-sum-ko
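To share the merged model on the Hugging Face Hub as well, push_to_hub can be used (a sketch; the repository id below is only a placeholder):
model.push_to_hub("your-username/gemma-2b-it-sum-ko")      # placeholder repo id
tokenizer.push_to_hub("your-username/gemma-2b-it-sum-ko")  # placeholder repo id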

# 5. Inference with the Gemma Korean summarization model
Note: Likewise, because of Colab's GPU memory limit, the memory used during training must be freed before inference can proceed.
Restart the notebook runtime session, re-run sections 1 through 2.1 to reload everything, and then continue with the steps below.
- In [ ]:
!nvidia-smi
# 5.1 Loading the fine-tuned model
- In [ ]:
BASE_MODEL = "google/gemma-2b-it"
FINETUNE_MODEL = "./gemma-2b-it-sum-ko"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
# 5.2 Fine-tuned model inference
- In [ ]:
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)
- In [ ]:
doc = dataset['test']['document'][10]
- In [ ]:
messages = [
    {
        "role": "user",
@@ -172,7 +172,7 @@ messages = [
    }
]
prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- In [ ]:
outputs = pipe_finetuned(
    prompt,
    do_sample=True,
@@ -182,5 +182,5 @@ outputs = pipe_finetuned(
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])
- In [ ]:
 