min24ss committed on
Commit
e1f0e75
·
verified ·
1 Parent(s): b92d5fd

Update r-story-test.py

Browse files
Files changed (1) hide show
  1. r-story-test.py +28 -120
r-story-test.py CHANGED
@@ -2,82 +2,28 @@
2
  # coding: utf-8
3
 
4
  # ## 1. tsv full data load
5
-
6
- # In[1]:
7
-
8
-
9
  import pandas as pd
10
 
11
-
12
  df = pd.read_csv("sl_webtoon_full_data_sequential.tsv", sep="\t")
13
 
14
-
15
  print(df.head())
16
  print("์ „์ฒด ๋ฌธ์žฅ ์ˆ˜:", len(df))
17
  print("์ปฌ๋Ÿผ ๋ชฉ๋ก:", df.columns.tolist())
18
 
19
- # 549
20
- #์ปฌ๋Ÿผ ๋ชฉ๋ก: ['์—ํ”ผ์†Œ๋“œ', 'scene_text', 'type']
21
-
22
-
23
- # In[2]:
24
-
25
-
26
- import pandas as pd
27
-
28
- df = pd.read_csv("sl_webtoon_full_data_sequential.tsv", sep="\t")
29
- print(df.head(3))
30
- print("์ปฌ๋Ÿผ:", df.columns.tolist(), "์ „์ฒด ํ–‰:", len(df))
31
-
32
-
33
- # In[3]:
34
-
35
-
36
- df['row_id'] = df.index #์ธ๋ฑ์Šค ์ปฌ๋Ÿผ ์ถ”๊ฐ€ <- ์›๋ณธ ์ถ”์ ์šฉ
37
-
38
  df['text'] = df.apply(
39
- lambda x: f"[{x['์—ํ”ผ์†Œ๋“œ']}] #{x['row_id']} {x['type']} {x['scene_text']}", #rag ๋ฌธ์žฅ ์ปฌ๋Ÿผ ์ƒ์„ฑ
40
  axis=1
41
  )
42
-
43
- print(df['text'].head(3).tolist())
44
-
45
-
46
- # In[4]:
47
-
48
-
49
  texts = df['text'].tolist()
50
  print("์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜:", len(texts))
51
- # 549
52
-
53
-
54
- # ## 2. Rag ๋ฌธ์žฅ ์ƒ์„ฑ
55
-
56
- # In[5]:
57
-
58
-
59
- # 2๋‹จ๊ณ„: ์ตœ์ข… RAG ๋ฌธ์žฅ ์ƒ์„ฑ
60
- df['row_id'] = df.index # ์›๋ณธ ์ถ”์ ์šฉ ์ธ๋ฑ์Šค
61
- df['text'] = df.apply(
62
- lambda x: f"[{x['์—ํ”ผ์†Œ๋“œ']}] #{x['row_id']} {x['type']} {x['scene_text']}",
63
- axis=1
64
- )
65
 
 
66
  print("์˜ˆ์‹œ 5๊ฐœ:")
67
  for t in df['text'].head(5).tolist():
68
  print("-", t)
69
 
70
- texts = df['text'].tolist()
71
- print("\n์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜:", len(texts))
72
- #549
73
-
74
-
75
- # ## 3. ํ•œ๊ตญ์–ด ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ, ๋ฒกํ„ฐ db - solo_leveling_faiss_ko
76
- #
77
- #
78
-
79
- # In[6]:
80
-
81
  from langchain.vectorstores import FAISS
82
  from langchain.embeddings import HuggingFaceEmbeddings
83
 
@@ -85,61 +31,39 @@ embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'
85
 
86
  db = FAISS.from_texts(texts, embedding_model)
87
  print(" ๋ฒกํ„ฐDB ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๋ฌธ์žฅ ์ˆ˜:", len(texts))
88
-
89
  db.save_local("solo_leveling_faiss_ko")
90
- print(" 'solo_leveling_faiss_ko' ํด๋”์— ์ €์žฅ")
91
-
92
-
93
- # In[7]:
94
-
95
 
96
  db = FAISS.load_local("solo_leveling_faiss_ko", embedding_model, allow_dangerous_deserialization=True)
97
 
98
-
99
  query = "๋งˆ๋‚˜์„์ด ๋ญ์ง€?"
100
  docs = db.similarity_search(query, k=5)
101
-
102
  for i, doc in enumerate(docs, 1):
103
  print(f"[{i}] {doc.page_content}")
104
 
 
 
 
 
 
 
105
 
106
- # In[8]:
107
-
108
-
109
- ## rag ํ™•์ธ
110
-
111
-
112
- # In[9]:
113
-
114
-
115
- from transformers import pipeline
116
-
117
  generator = pipeline(
118
  "text-generation",
119
  model="kakaocorp/kanana-nano-2.1b-instruct",
120
- device= -1
121
  )
122
 
123
-
124
-
125
- # In[10]:
126
-
127
-
128
- from langchain.chains import RetrievalQA
129
- from langchain.vectorstores import FAISS
130
- from langchain.prompts import PromptTemplate
131
- from langchain_community.llms import HuggingFacePipeline
132
- from langchain.embeddings import HuggingFaceEmbeddings
133
- import torch
134
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
135
-
136
  embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')
137
  vectorstore = FAISS.load_local("solo_leveling_faiss_ko", embedding_model, allow_dangerous_deserialization=True)
138
 
139
  model_name = "kakaocorp/kanana-nano-2.1b-instruct"
140
  tokenizer = AutoTokenizer.from_pretrained(model_name)
141
  model = AutoModelForCausalLM.from_pretrained(
142
- model_name, torch_dtype=torch.float32).to("cpu")
 
 
143
 
144
  llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
145
  llm = HuggingFacePipeline(pipeline=llm_pipeline)
@@ -154,25 +78,18 @@ qa_chain = RetrievalQA.from_chain_type(
154
  retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
155
  chain_type="stuff",
156
  return_source_documents=True,
157
- chain_type_kwargs={
158
- "prompt": custom_prompt }
159
  )
160
 
161
- #์งˆ๋ฌธ
162
  query = "์„ฑ์ง„์šฐ๋Š” ๋ช‡ ๊ธ‰ ํ—Œํ„ฐ์ง€?"
163
  result = qa_chain({"query": query})
164
-
165
  print("๋‹ต๋ณ€:", result["result"])
166
  print("\n์ฐธ์กฐ ๋ฌธ์„œ:")
167
  for doc in result["source_documents"]:
168
  print(doc.page_content)
169
 
170
-
171
- # ## 4. ํ™ฉ๋™์„ ์—ํ”ผ์†Œ๋“œ
172
-
173
- # In[13]:
174
-
175
-
176
  choices = [
177
  "1: ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.",
178
  "2: ์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.",
@@ -189,8 +106,8 @@ user_choice = choices[user_idx]
189
  print(f"\n[์‚ฌ์šฉ์ž ์„ ํƒ]: {user_choice}")
190
 
191
  result = qa_chain({"query": user_choice})
192
-
193
  retrieved_context = "\n".join([doc.page_content for doc in result["source_documents"]])
 
194
  print("\n[๊ฒ€์ƒ‰๋œ ๊ทผ๊ฑฐ ๋ฌธ์„œ ์˜ˆ์‹œ]")
195
  print(retrieved_context[:600], "...")
196
 
@@ -198,28 +115,19 @@ prompt = f"""
198
  ๋‹น์‹ ์€ ์›นํˆฐ '๋‚˜ ํ˜ผ์ž๋งŒ ๋ ˆ๋ฒจ์—…'์˜ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค.
199
  ํ˜„๏ฟฝ๏ฟฝ๏ฟฝ ์ƒํ™ฉ:
200
  {retrieved_context}
201
-
202
  ์‚ฌ์šฉ์ž ์„ ํƒ: {user_choice}
203
-
204
  ์„ฑ์ง„์šฐ์˜ ๋งํˆฌ๋กœ ๊ฐ„๊ฒฐํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ๋Œ€์‚ฌ๋ฅผ 1~2๋ฌธ์žฅ ์ƒ์„ฑํ•˜์„ธ์š”.
205
  ์ค‘๋ณต๋œ ๋‚ด์šฉ์ด๋‚˜ ๋น„์Šทํ•œ ๋ฌธ์žฅ์€ ๋งŒ๋“ค์ง€ ๋งˆ์„ธ์š”.
206
  """
207
 
208
- response = generator(prompt,
209
- max_new_tokens=200,
210
- do_sample=True,
211
- temperature=0.6,
212
- top_p = 0.9,
213
- return_full_text=False
 
214
  )[0]["generated_text"]
 
215
  print("\n[์„ฑ์ง„์šฐ ์‘๋‹ต]")
216
  print(response)
217
-
218
-
219
- # In[ ]:
220
-
221
-
222
-
223
-
224
-
225
- # ##
 
# coding: utf-8

# ## 1. Load the full webtoon dataset from TSV.
# The prints below reveal the expected columns: episode, scene_text, type.
import pandas as pd

df = pd.read_csv("sl_webtoon_full_data_sequential.tsv", sep="\t")

# Quick sanity check of the loaded frame.
print(df.head())
print("์ „์ฒด ๋ฌธ์žฅ ์ˆ˜:", len(df))
print("์ปฌ๋Ÿผ ๋ชฉ๋ก:", df.columns.tolist())
12
 
# Keep the original row index so every generated RAG sentence can be traced
# back to its source row in the TSV.
df['row_id'] = df.index

# Build the RAG sentence column: "[<episode>] #<row_id> <type> <scene_text>".
# NOTE(review): the episode column name below is reproduced byte-for-byte from
# the source (it appears mojibake-encoded); it must match the TSV header
# exactly — confirm against the actual file.
df['text'] = df.apply(
    lambda x: f"[{x['์—ํ”ผ์†Œ๋“œ']}] #{x['row_id']} {x['type']} {x['scene_text']}",
    axis=1
)
 
 
 
 
 
 
 
# Materialize the final list of RAG sentences.
texts = df['text'].tolist()
print("์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜:", len(texts))

# ## 2. Preview the generated RAG sentences.
print("์˜ˆ์‹œ 5๊ฐœ:")
for t in df['text'].head(5).tolist():
    print("-", t)
25
 
# ## 3. Korean embedding model + FAISS vector DB ("solo_leveling_faiss_ko").
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Restored line: the embedding-model definition was elided by the diff
# rendering (visible only in the hunk context) but is required by
# FAISS.from_texts below.
embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')

# Build the vector store from the RAG sentences and persist it locally.
db = FAISS.from_texts(texts, embedding_model)
print(" ๋ฒกํ„ฐDB ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๋ฌธ์žฅ ์ˆ˜:", len(texts))
db.save_local("solo_leveling_faiss_ko")

# Reload from disk. allow_dangerous_deserialization is acceptable here only
# because the pickled index was produced by this same script (trusted source).
db = FAISS.load_local("solo_leveling_faiss_ko", embedding_model, allow_dangerous_deserialization=True)

# Retrieval smoke test.
query = "๋งˆ๋‚˜์„์ด ๋ญ์ง€?"
docs = db.similarity_search(query, k=5)
for i, doc in enumerate(docs, 1):
    print(f"[{i}] {doc.page_content}")
43
 
# ## 4. Load the generation LLM (CPU only).
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
import torch

# Plain text-generation pipeline, pinned to CPU via device=-1.
generator = pipeline(
    "text-generation",
    model="kakaocorp/kanana-nano-2.1b-instruct",
    device=-1
)

# Embeddings + vector store that the RetrievalQA chain will query.
embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')
vectorstore = FAISS.load_local("solo_leveling_faiss_ko", embedding_model, allow_dangerous_deserialization=True)

# Load tokenizer/model explicitly so LangChain's pipeline wrapper can use them.
model_name = "kakaocorp/kanana-nano-2.1b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32  # float32: CPU has no fp16 fast path
).to("cpu")

llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
llm = HuggingFacePipeline(pipeline=llm_pipeline)
 
78
  retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
79
  chain_type="stuff",
80
  return_source_documents=True,
81
+ chain_type_kwargs={"prompt": custom_prompt}
 
82
  )
83
 
# Question smoke test against the RetrievalQA chain.
query = "์„ฑ์ง„์šฐ๋Š” ๋ช‡ ๊ธ‰ ํ—Œํ„ฐ์ง€?"
result = qa_chain({"query": query})
print("๋‹ต๋ณ€:", result["result"])
print("\n์ฐธ์กฐ ๋ฌธ์„œ:")
for doc in result["source_documents"]:
    print(doc.page_content)
91
 
92
+ # ## 5. ํ™ฉ๋™์„ ์—ํ”ผ์†Œ๋“œ
 
 
 
 
 
93
  choices = [
94
  "1: ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.",
95
  "2: ์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.",
 
print(f"\n[์‚ฌ์šฉ์ž ์„ ํƒ]: {user_choice}")

# Retrieve supporting passages for the user's selected choice.
result = qa_chain({"query": user_choice})
retrieved_context = "\n".join([doc.page_content for doc in result["source_documents"]])

print("\n[๊ฒ€์ƒ‰๋œ ๊ทผ๊ฑฐ ๋ฌธ์„œ ์˜ˆ์‹œ]")
print(retrieved_context[:600], "...")

# Restored line: the f-string opener `prompt = f"""` was elided by the diff
# rendering (visible only in the hunk context).
# NOTE(review): the second prompt line contains U+FFFD replacement characters
# from a broken encoding (a corrupted Korean word) — it is reproduced
# byte-for-byte here; confirm the intended text against the original file.
prompt = f"""
๋‹น์‹ ์€ ์›นํˆฐ '๋‚˜ ํ˜ผ์ž๋งŒ ๋ ˆ๋ฒจ์—…'์˜ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค.
ํ˜„๏ฟฝ๏ฟฝ๏ฟฝ ์ƒํ™ฉ:
{retrieved_context}
์‚ฌ์šฉ์ž ์„ ํƒ: {user_choice}
์„ฑ์ง„์šฐ์˜ ๋งํˆฌ๋กœ ๊ฐ„๊ฒฐํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ๋Œ€์‚ฌ๋ฅผ 1~2๋ฌธ์žฅ ์ƒ์„ฑํ•˜์„ธ์š”.
์ค‘๋ณต๋œ ๋‚ด์šฉ์ด๋‚˜ ๋น„์Šทํ•œ ๋ฌธ์žฅ์€ ๋งŒ๋“ค์ง€ ๋งˆ์„ธ์š”.
"""

# Sampled generation; return_full_text=False drops the prompt echo from the
# pipeline output.
response = generator(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    return_full_text=False
)[0]["generated_text"]

print("\n[์„ฑ์ง„์šฐ ์‘๋‹ต]")
print(response)