min24ss committed on
Commit
0decae8
·
verified ·
1 Parent(s): e1f0e75

Delete r-story-test.ipynb

Browse files
Files changed (1) hide show
  1. r-story-test.ipynb +0 -500
r-story-test.ipynb DELETED
@@ -1,500 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "cc27fe9f-c69a-4dab-8d9f-603c7079cad6",
6
- "metadata": {},
7
- "source": [
8
- "## 1. tsv full data load"
9
- ]
10
- },
11
- {
12
- "cell_type": "code",
13
- "execution_count": 1,
14
- "id": "60146aa5-f97a-4931-a4f2-7f8d33136f6b",
15
- "metadata": {},
16
- "outputs": [
17
- {
18
- "name": "stdout",
19
- "output_type": "stream",
20
- "text": [
21
- " ์—ํ”ผ์†Œ๋“œ scene_text type\n",
22
- "0 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋„ค, ๊น€์ƒ์‹ ์•„์ €์”จ. ์‹ ๊ฒฝ ์จ ์ฃผ์…”์„œ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค. ๋Œ€์‚ฌ\n",
23
- "1 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ ๋‚ด์ ์„ค๋ช…\n",
24
- "2 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋ญ˜์š” ํ•˜ํ•˜... ์˜ค๋Š˜๋„ ์ž˜ ๋ถ€ํƒ๋“œ๋ฆด๊ฒŒ์š”. ๋Œ€์‚ฌ\n",
25
- "3 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ํ—Œํ„ฐํ˜‘ํšŒ ์†Œ์† ์ค‘ ๊ฐ€์žฅ ๋‚ฎ์€ ๊ณ„๊ธ‰, ์ตœ์•ฝ์˜ ํ—Œํ„ฐ. ๋‚ด์ ์„ค๋ช…\n",
26
- "4 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ์–ด? ์•ˆ๋…•ํ•˜์„ธ์š”. ์ฃผํฌ ์”จ๋„ ์ด๋ฒˆ ๋ ˆ์ด๋“œ ๊ฐ€์‹œ๋Š”๊ตฐ์š”. ๋Œ€์‚ฌ\n",
27
- "์ „์ฒด ๋ฌธ์žฅ ์ˆ˜: 549\n",
28
- "์ปฌ๋Ÿผ ๋ชฉ๋ก: ['์—ํ”ผ์†Œ๋“œ', 'scene_text', 'type']\n"
29
- ]
30
- }
31
- ],
32
- "source": [
33
- "import pandas as pd\n",
34
- "\n",
35
- "\n",
36
- "df = pd.read_csv(\"sl_webtoon_full_data_sequential.tsv\", sep=\"\\t\")\n",
37
- "\n",
38
- "\n",
39
- "print(df.head())\n",
40
- "print(\"์ „์ฒด ๋ฌธ์žฅ ์ˆ˜:\", len(df))\n",
41
- "print(\"์ปฌ๋Ÿผ ๋ชฉ๋ก:\", df.columns.tolist())\n",
42
- "\n",
43
- "# 549\n",
44
- "#์ปฌ๋Ÿผ ๋ชฉ๋ก: ['์—ํ”ผ์†Œ๋“œ', 'scene_text', 'type']\n"
45
- ]
46
- },
47
- {
48
- "cell_type": "code",
49
- "execution_count": 2,
50
- "id": "fd35b473-3d92-4d9d-a8ee-5565dff05e76",
51
- "metadata": {},
52
- "outputs": [
53
- {
54
- "name": "stdout",
55
- "output_type": "stream",
56
- "text": [
57
- " ์—ํ”ผ์†Œ๋“œ scene_text type\n",
58
- "0 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋„ค, ๊น€์ƒ์‹ ์•„์ €์”จ. ์‹ ๊ฒฝ ์จ ์ฃผ์…”์„œ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค. ๋Œ€์‚ฌ\n",
59
- "1 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ ๋‚ด์ ์„ค๋ช…\n",
60
- "2 1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „ ๋ญ˜์š” ํ•˜ํ•˜... ์˜ค๋Š˜๋„ ์ž˜ ๋ถ€ํƒ๋“œ๋ฆด๊ฒŒ์š”. ๋Œ€์‚ฌ\n",
61
- "์ปฌ๋Ÿผ: ['์—ํ”ผ์†Œ๋“œ', 'scene_text', 'type'] ์ „์ฒด ํ–‰: 549\n"
62
- ]
63
- }
64
- ],
65
- "source": [
66
- "import pandas as pd\n",
67
- "\n",
68
- "df = pd.read_csv(\"sl_webtoon_full_data_sequential.tsv\", sep=\"\\t\")\n",
69
- "print(df.head(3))\n",
70
- "print(\"์ปฌ๋Ÿผ:\", df.columns.tolist(), \"์ „์ฒด ํ–‰:\", len(df))\n"
71
- ]
72
- },
73
- {
74
- "cell_type": "code",
75
- "execution_count": 3,
76
- "id": "fa5db259-991a-48b1-859f-2308432737c5",
77
- "metadata": {},
78
- "outputs": [
79
- {
80
- "name": "stdout",
81
- "output_type": "stream",
82
- "text": [
83
- "['[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #0 ๋Œ€์‚ฌ ๋„ค, ๊น€์ƒ์‹ ์•„์ €์”จ. ์‹ ๊ฒฝ ์จ ์ฃผ์…”์„œ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.', '[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #1 ๋‚ด์ ์„ค๋ช… ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ', '[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #2 ๋Œ€์‚ฌ ๋ญ˜์š” ํ•˜ํ•˜... ์˜ค๋Š˜๋„ ์ž˜ ๋ถ€ํƒ๋“œ๋ฆด๊ฒŒ์š”.']\n"
84
- ]
85
- }
86
- ],
87
- "source": [
88
- "\n",
89
- "df['row_id'] = df.index #์ธ๋ฑ์Šค ์ปฌ๋Ÿผ ์ถ”๊ฐ€ <- ์›๋ณธ ์ถ”์ ์šฉ\n",
90
- "\n",
91
- "df['text'] = df.apply(\n",
92
- " lambda x: f\"[{x['์—ํ”ผ์†Œ๋“œ']}] #{x['row_id']} {x['type']} {x['scene_text']}\", #rag ๋ฌธ์žฅ ์ปฌ๋Ÿผ ์ƒ์„ฑ\n",
93
- " axis=1\n",
94
- ")\n",
95
- "\n",
96
- "print(df['text'].head(3).tolist())\n"
97
- ]
98
- },
99
- {
100
- "cell_type": "code",
101
- "execution_count": 4,
102
- "id": "0b95c977-5485-4fdf-b5d8-fb837a0a8cf7",
103
- "metadata": {},
104
- "outputs": [
105
- {
106
- "name": "stdout",
107
- "output_type": "stream",
108
- "text": [
109
- "์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜: 549\n"
110
- ]
111
- }
112
- ],
113
- "source": [
114
- "texts = df['text'].tolist()\n",
115
- "print(\"์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜:\", len(texts))\n",
116
- "# 549"
117
- ]
118
- },
119
- {
120
- "cell_type": "markdown",
121
- "id": "0be84111-8a20-49b4-827a-305e9498fe15",
122
- "metadata": {},
123
- "source": [
124
- "## 2. RAG 문장 생성"
125
- ]
126
- },
127
- {
128
- "cell_type": "code",
129
- "execution_count": 5,
130
- "id": "2f948651-c16f-40d6-9b96-2aaafb1d7bc9",
131
- "metadata": {},
132
- "outputs": [
133
- {
134
- "name": "stdout",
135
- "output_type": "stream",
136
- "text": [
137
- "์˜ˆ์‹œ 5๊ฐœ:\n",
138
- "- [1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #0 ๋Œ€์‚ฌ ๋„ค, ๊น€์ƒ์‹ ์•„์ €์”จ. ์‹ ๊ฒฝ ์จ ์ฃผ์…”์„œ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.\n",
139
- "- [1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #1 ๋‚ด์ ์„ค๋ช… ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ\n",
140
- "- [1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #2 ๋Œ€์‚ฌ ๋ญ˜์š” ํ•˜ํ•˜... ์˜ค๋Š˜๋„ ์ž˜ ๋ถ€ํƒ๋“œ๋ฆด๊ฒŒ์š”.\n",
141
- "- [1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #3 ๋‚ด์ ์„ค๋ช… ํ—Œํ„ฐํ˜‘ํšŒ ์†Œ์† ์ค‘ ๊ฐ€์žฅ ๋‚ฎ์€ ๊ณ„๊ธ‰, ์ตœ์•ฝ์˜ ํ—Œํ„ฐ.\n",
142
- "- [1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #4 ๋Œ€์‚ฌ ์–ด? ์•ˆ๋…•ํ•˜์„ธ์š”. ์ฃผํฌ ์”จ๋„ ์ด๋ฒˆ ๋ ˆ์ด๋“œ ๊ฐ€์‹œ๋Š”๊ตฐ์š”.\n",
143
- "\n",
144
- "์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜: 549\n"
145
- ]
146
- }
147
- ],
148
- "source": [
149
- "# 2๋‹จ๊ณ„: ์ตœ์ข… RAG ๋ฌธ์žฅ ์ƒ์„ฑ\n",
150
- "df['row_id'] = df.index # ์›๋ณธ ์ถ”์ ์šฉ ์ธ๋ฑ์Šค\n",
151
- "df['text'] = df.apply(\n",
152
- " lambda x: f\"[{x['์—ํ”ผ์†Œ๋“œ']}] #{x['row_id']} {x['type']} {x['scene_text']}\",\n",
153
- " axis=1\n",
154
- ")\n",
155
- "\n",
156
- "print(\"์˜ˆ์‹œ 5๊ฐœ:\")\n",
157
- "for t in df['text'].head(5).tolist():\n",
158
- " print(\"-\", t)\n",
159
- "\n",
160
- "texts = df['text'].tolist()\n",
161
- "print(\"\\n์ตœ์ข… ๋ฌธ์žฅ ์ˆ˜:\", len(texts))\n",
162
- "#549"
163
- ]
164
- },
165
- {
166
- "cell_type": "markdown",
167
- "id": "0d659a2e-2c7b-4158-b676-d85abc5d3e92",
168
- "metadata": {},
169
- "source": [
170
- "## 3. 한국어 임베딩 모델 로드, 벡터 db - solo_leveling_faiss_ko\n",
171
- "\n"
172
- ]
173
- },
174
- {
175
- "cell_type": "code",
176
- "execution_count": 6,
177
- "id": "6ef1ac89-0931-48a8-9024-26150004b81d",
178
- "metadata": {},
179
- "outputs": [
180
- {
181
- "name": "stderr",
182
- "output_type": "stream",
183
- "text": [
184
- "/tmp/ipykernel_1396183/2454380050.py:4: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
185
- " embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')\n"
186
- ]
187
- },
188
- {
189
- "name": "stdout",
190
- "output_type": "stream",
191
- "text": [
192
- " ๋ฒกํ„ฐDB ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๋ฌธ์žฅ ์ˆ˜: 549\n",
193
- " 'solo_leveling_faiss_ko' ํด๋”์— ์ €์žฅ\n"
194
- ]
195
- }
196
- ],
197
- "source": [
198
- "from langchain.vectorstores import FAISS\n",
199
- "from langchain.embeddings import HuggingFaceEmbeddings\n",
200
- "\n",
201
- "embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')\n",
202
- "\n",
203
- "db = FAISS.from_texts(texts, embedding_model)\n",
204
- "print(\" ๋ฒกํ„ฐDB ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๋ฌธ์žฅ ์ˆ˜:\", len(texts))\n",
205
- "\n",
206
- "db.save_local(\"solo_leveling_faiss_ko\")\n",
207
- "print(\" 'solo_leveling_faiss_ko' ํด๋”์— ์ €์žฅ\")\n"
208
- ]
209
- },
210
- {
211
- "cell_type": "code",
212
- "execution_count": 7,
213
- "id": "a6acad70-ae02-4808-800e-fee4c2a36153",
214
- "metadata": {},
215
- "outputs": [
216
- {
217
- "name": "stdout",
218
- "output_type": "stream",
219
- "text": [
220
- "[1] [2๊ถŒ_3ํ™”_ํ€˜์ŠคํŠธ ] #332 ๋Œ€์‚ฌ ๋˜์ „์— ์‚ฌ๋Š” ๋งˆ์ˆ˜๋ผ๋ฉด ๋งˆ์ •์„์„ ๊ฐ–๊ณ  ์žˆ์„ ์ค„ ์•Œ์•˜๋Š”๋ฐ...์™„์ „ํžˆ ๋‹ค๋ฅธ ๋ถ€๋ฅ˜์ธ๊ฐ€.\n",
221
- "[2] [1๊ถŒ_3ํ™”_ํ€˜์ŠคํŠธ] #132 ๋Œ€์‚ฌ ์—ฌ... ์—ฌ๊ธด?! ์‚ฌ๋ง‰...!!\n",
222
- "[3] [2๊ถŒ_3ํ™”_ํ€˜์ŠคํŠธ ] #331 ๋Œ€์‚ฌ ์ด ๋…€์„๋“ค์€ ๋งˆ์ •์„ ๊ฐ™์€ ๊ฑด ์•ˆ ์ฃผ๋‚˜?\n",
223
- "[4] [2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #457 ๋‚ด์ ์„ค๋ช… ๊ฒฝํ—˜์ด ๋งŽ์œผ๋ฉด ๋งŽ์„์ˆ˜๋ก ๋žญํฌ๊ฐ€ ๋†’์œผ๋ฉด ๋†’์„์ˆ˜๋ก ๋งˆ์ˆ˜๋“ค์—๊ฒŒ์„œ ๋‚˜์˜ค๋Š” ๋งˆ์ •์„์€ ๊ฐ€์น˜๋ฅผ ๋”ํ•ด ๊ฐ„๋‹ค.\n",
224
- "[5] [2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #449 ๋Œ€์‚ฌ ๋ฌธ์ œ๋Š” ์ง€๋Šฅ์ธ๋ฐ... ๋งˆ๋ฒ•๊ณผ ๊ด€๋ จ๋œ ์Šคํƒฏ์ผ ๊ฒƒ ๊ฐ™๊ธด ํ•œ๋ฐ, ์ด๊ฒŒ ํ”ผ๋ฃกํ• ๊นŒ?\n"
225
- ]
226
- }
227
- ],
228
- "source": [
229
- "db = FAISS.load_local(\"solo_leveling_faiss_ko\", embedding_model, allow_dangerous_deserialization=True)\n",
230
- "\n",
231
- "\n",
232
- "query = \"๋งˆ๋‚˜์„์ด ๋ญ์ง€?\"\n",
233
- "docs = db.similarity_search(query, k=5)\n",
234
- "\n",
235
- "for i, doc in enumerate(docs, 1):\n",
236
- " print(f\"[{i}] {doc.page_content}\")\n"
237
- ]
238
- },
239
- {
240
- "cell_type": "code",
241
- "execution_count": 8,
242
- "id": "b215211a-ed27-4571-a9cf-b5792c6fa20c",
243
- "metadata": {},
244
- "outputs": [],
245
- "source": [
246
- "## rag 확인"
247
- ]
248
- },
249
- {
250
- "cell_type": "code",
251
- "execution_count": 9,
252
- "id": "caf4de14-02ef-4143-97eb-00cdab7a2fa5",
253
- "metadata": {},
254
- "outputs": [
255
- {
256
- "name": "stderr",
257
- "output_type": "stream",
258
- "text": [
259
- "Device set to use cuda:0\n"
260
- ]
261
- }
262
- ],
263
- "source": [
264
- "from transformers import pipeline\n",
265
- "\n",
266
- "generator = pipeline(\n",
267
- " \"text-generation\",\n",
268
- " model=\"kakaocorp/kanana-nano-2.1b-instruct\",\n",
269
- " device=0 \n",
270
- ")\n",
271
- "\n"
272
- ]
273
- },
274
- {
275
- "cell_type": "code",
276
- "execution_count": 10,
277
- "id": "2ef2966e-c110-4565-8ddf-1a1bee864934",
278
- "metadata": {},
279
- "outputs": [
280
- {
281
- "name": "stderr",
282
- "output_type": "stream",
283
- "text": [
284
- "Device set to use cuda:0\n",
285
- "/tmp/ipykernel_1396183/3834059051.py:17: LangChainDeprecationWarning: The class `HuggingFacePipeline` was deprecated in LangChain 0.0.37 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFacePipeline``.\n",
286
- " llm = HuggingFacePipeline(pipeline=llm_pipeline)\n",
287
- "/tmp/ipykernel_1396183/3834059051.py:35: LangChainDeprecationWarning: The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 1.0. Use :meth:`~invoke` instead.\n",
288
- " result = qa_chain({\"query\": query})\n"
289
- ]
290
- },
291
- {
292
- "name": "stdout",
293
- "output_type": "stream",
294
- "text": [
295
- "๋‹ต๋ณ€: ๋‹ค์Œ ๋ฌธ๋งฅ์„ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ตํ•˜์„ธ์š”.\n",
296
- "\n",
297
- "๋ฌธ๋งฅ:\n",
298
- "[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #1 ๋‚ด์ ์„ค๋ช… ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ\n",
299
- "\n",
300
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #451 ๋Œ€์‚ฌ ํ—Œํ„ฐ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค.\n",
301
- "\n",
302
- "[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #9 ๋‚ด์ ์„ค๋ช… E๊ธ‰ ํ—Œํ„ฐ ์„ฑ์ง„์šฐ.\n",
303
- "\n",
304
- "[3๊ถŒ_7ํ™”_์ด์ƒํ•œ๋ ˆ์ด๋“œ] #484 ๋‚ด์ ์„ค๋ช… ๊ทธ ๋…€์„์ด ์›ํ•˜๋Š” ๊ฑด ์‹ค๋ ฅ์€ ์žˆ์ง€๋งŒ ๋“ฑ๊ธ‰์ด ๋‚ฎ์€ ํ—Œํ„ฐ๋‹ˆ๊นŒ.\n",
305
- "\n",
306
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #468 ๋‚ด์ ์„ค๋ช… ๋Šฅ๋ ฅ์น˜๋ฅผ ์˜ฌ๋ฆด ์ˆ˜ ์žˆ๋Š” ํ—Œํ„ฐ๊ฐ€ ์žˆ๋‹ค?\n",
307
- "\n",
308
- "์งˆ๋ฌธ:\n",
309
- "์„ฑ์ง„์šฐ๋Š” ๋ช‡ ๊ธ‰ ํ—Œํ„ฐ์ง€?\n",
310
- "\n",
311
- "๋‹ต๋ณ€: ์„ฑ์ง„์šฐ๋Š” E๊ธ‰ ํ—Œํ„ฐ์ž…๋‹ˆ๋‹ค. #1 ๋‚ด์ ์„ค๋ช… ์ฐธ๊ณ ํ•˜์„ธ์š”. ๋˜ํ•œ, #9 ๋‚ด์ ์„ค๋ช…์—์„œ๋„ ๋™์ผํ•œ ์ •๋ณด๋ฅผ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. #4 ํ™”์˜ ๋Œ€์‚ฌ์—์„œ๋„ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ํ—Œํ„ฐ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค. ์ด ์ •๋ณด๋“ค์„ ์ข…ํ•ฉํ•ด ๋ณด๋ฉด ์„ฑ์ง„์šฐ๋Š” E๊ธ‰ ํ—Œํ„ฐ๋ผ๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. #3 ๊ถŒ์˜ #484 ๋‚ด์ ์„ค๋ช…์—์„œ๋„ ์„ฑ์ง„์šฐ์˜ ๋“ฑ๊ธ‰์ด E๊ธ‰์ž„์„ ๋‹ค์‹œ ํ•œ ๋ฒˆ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ ์„ฑ์ง„์šฐ๋Š” E๊ธ‰ ํ—Œํ„ฐ์ž…๋‹ˆ๋‹ค. ๋‹ต: E๊ธ‰ ํ—Œํ„ฐ. #2 ๊ถŒ์˜ #468 ๋‚ด์ ์„ค๋ช…์—์„œ๋Š” ๋Šฅ๋ ฅ์น˜๋ฅผ ์˜ฌ๋ฆด ์ˆ˜ ์žˆ๋Š” ํ—Œํ„ฐ๊ฐ€ ์žˆ๋Š”์ง€ ๋ฌป๊ณ  ์žˆ์ง€๋งŒ, ์„ฑ์ง„์šฐ์˜ ๋“ฑ๊ธ‰์€ E๊ธ‰์œผ๋กœ ๊ณ ์ •๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ์ด ์ ์„ ๊ณ ๋ คํ•˜๋ฉด ์„ฑ์ง„์šฐ๋Š” E๊ธ‰ ํ—Œํ„ฐ์ž…๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ ์ตœ์ข… ๋‹ต๋ณ€์€ ์„ฑ์ง„์šฐ๋Š” E๊ธ‰ ํ—Œํ„ฐ์ž…๋‹ˆ๋‹ค. ๋‹ต: E๊ธ‰ ํ—Œํ„ฐ. #2 ๊ถŒ์˜ #468 ๋‚ด์ ์„ค๋ช…์— ์˜ํ–ฅ์„ ๋ฐ›์ง€ ์•Š๊ณ , ์„ฑ์ง„์šฐ์˜ ๋“ฑ๊ธ‰์€ E๊ธ‰์œผ๋กœ ํ™•\n",
312
- "\n",
313
- "์ฐธ์กฐ ๋ฌธ์„œ:\n",
314
- "[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #1 ๋‚ด์ ์„ค๋ช… ๋‚ด ์ด๋ฆ„์€ ์„ฑ์ง„์šฐ. E๊ธ‰ ํ—Œํ„ฐ\n",
315
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #451 ๋Œ€์‚ฌ ํ—Œํ„ฐ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค.\n",
316
- "[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #9 ๋‚ด์ ์„ค๋ช… E๊ธ‰ ํ—Œํ„ฐ ์„ฑ์ง„์šฐ.\n",
317
- "[3๊ถŒ_7ํ™”_์ด์ƒํ•œ๋ ˆ์ด๋“œ] #484 ๋‚ด์ ์„ค๋ช… ๊ทธ ๋…€์„์ด ์›ํ•˜๋Š” ๊ฑด ์‹ค๋ ฅ์€ ์žˆ์ง€๋งŒ ๋“ฑ๊ธ‰์ด ๋‚ฎ์€ ํ—Œํ„ฐ๋‹ˆ๊นŒ.\n",
318
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #468 ๋‚ด์ ์„ค๋ช… ๋Šฅ๋ ฅ์น˜๋ฅผ ์˜ฌ๋ฆด ์ˆ˜ ์žˆ๋Š” ํ—Œํ„ฐ๊ฐ€ ์žˆ๋‹ค?\n"
319
- ]
320
- }
321
- ],
322
- "source": [
323
- "from langchain.chains import RetrievalQA\n",
324
- "from langchain.vectorstores import FAISS\n",
325
- "from langchain.prompts import PromptTemplate\n",
326
- "from langchain_community.llms import HuggingFacePipeline\n",
327
- "from langchain.embeddings import HuggingFaceEmbeddings\n",
328
- "import torch\n",
329
- "from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline\n",
330
- "\n",
331
- "embedding_model = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask')\n",
332
- "vectorstore = FAISS.load_local(\"solo_leveling_faiss_ko\", embedding_model, allow_dangerous_deserialization=True)\n",
333
- "\n",
334
- "model_name = \"kakaocorp/kanana-nano-2.1b-instruct\"\n",
335
- "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
336
- "model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(\"cuda\")\n",
337
- "\n",
338
- "llm_pipeline = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, max_new_tokens=256)\n",
339
- "llm = HuggingFacePipeline(pipeline=llm_pipeline)\n",
340
- "\n",
341
- "custom_prompt = PromptTemplate(\n",
342
- " input_variables=[\"context\", \"question\"],\n",
343
- " template=\"๋‹ค์Œ ๋ฌธ๋งฅ์„ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ตํ•˜์„ธ์š”.\\n\\n๋ฌธ๋งฅ:\\n{context}\\n\\n์งˆ๋ฌธ:\\n{question}\\n\\n๋‹ต๋ณ€:\"\n",
344
- ")\n",
345
- "\n",
346
- "qa_chain = RetrievalQA.from_chain_type(\n",
347
- " llm=llm,\n",
348
- " retriever=vectorstore.as_retriever(search_kwargs={\"k\": 5}),\n",
349
- " chain_type=\"stuff\",\n",
350
- " return_source_documents=True,\n",
351
- " chain_type_kwargs={\n",
352
- " \"prompt\": custom_prompt }\n",
353
- ")\n",
354
- "\n",
355
- "#์งˆ๋ฌธ\n",
356
- "query = \"์„ฑ์ง„์šฐ๋Š” ๋ช‡ ๊ธ‰ ํ—Œํ„ฐ์ง€?\"\n",
357
- "result = qa_chain({\"query\": query})\n",
358
- "\n",
359
- "print(\"๋‹ต๋ณ€:\", result[\"result\"])\n",
360
- "print(\"\\n์ฐธ์กฐ ๋ฌธ์„œ:\")\n",
361
- "for doc in result[\"source_documents\"]:\n",
362
- " print(doc.page_content)\n"
363
- ]
364
- },
365
- {
366
- "cell_type": "markdown",
367
- "id": "a10cc72f-d587-4dc3-a6a4-e56b08d0a985",
368
- "metadata": {},
369
- "source": [
370
- "## 4. 황동석 에피소드 "
371
- ]
372
- },
373
- {
374
- "cell_type": "code",
375
- "execution_count": 13,
376
- "id": "46946824-27c5-4a95-a293-d6b3ab905277",
377
- "metadata": {},
378
- "outputs": [
379
- {
380
- "name": "stdout",
381
- "output_type": "stream",
382
- "text": [
383
- "\n",
384
- "[์„ ํƒ์ง€]\n",
385
- "1. 1: ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.\n",
386
- "2. 2: ์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.\n",
387
- "3. 3: ์ „๋ถ€ ๊ธฐ์ ˆ ์‹œํ‚ค๊ณ  ์‚ด๋ ค๋‘”๋‹ค.\n",
388
- "4. 4: ์‹œ์Šคํ…œ์„ ๊ฑฐ๋ถ€ํ•˜๊ณ  ๊ทธ๋ƒฅ ๋„๋ง์นœ๋‹ค.\n"
389
- ]
390
- },
391
- {
392
- "name": "stdin",
393
- "output_type": "stream",
394
- "text": [
395
- "\n",
396
- "์„ ํƒ ๋ฒˆํ˜ธ ์ž…๋ ฅ: 2\n"
397
- ]
398
- },
399
- {
400
- "name": "stdout",
401
- "output_type": "stream",
402
- "text": [
403
- "\n",
404
- "[์‚ฌ์šฉ์ž ์„ ํƒ]: 2: ์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.\n",
405
- "\n",
406
- "[๊ฒ€์ƒ‰๋œ ๊ทผ๊ฑฐ ๋ฌธ์„œ ์˜ˆ์‹œ]\n",
407
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #399 ๋‚ด์ ์„ค๋ช… ๋™์‹œ์— ์ด๋†ˆ์˜ ๋ฐฉ์–ด๋ ฅ์„ ๋ฌด๋ ฅํ™”์‹œ์ผœ์•ผ ํ•ด.\n",
408
- "[2๊ถŒ_4ํ™”_๋ณด์Šค์ „] #437 ๋Œ€์‚ฌ ๋†ˆ์˜ ๋ฐฉ์–ด๋งŒ ๋ฌด๋ ฅํ™”์‹œํ‚ฌ ์ˆ˜ ์žˆ๋‹ค๋ฉด, ํ•  ์ˆ˜ ์žˆ๋‹ค!\n",
409
- "[1๊ถŒ_1ํ™”_์ด์ค‘๋˜์ „] #73 ๋Œ€์‚ฌ ์ œ๊ธฐ๋ž„! ์ด ๋…€์„๋“ค์ด ๋งˆ์Œ๋งŒ ๋จน์œผ๋ฉด ์–ธ์ œ๋ผ๋„ ์ „๋ฉธ์‹œํ‚ฌ ์ˆ˜ ์žˆ๋‹ค๋Š” ๊ฑด๊ฐ€.\n",
410
- "[2๊ถŒ_3ํ™”_ํ€˜์ŠคํŠธ ] #341 ๋Œ€์‚ฌ ์ด์ œ ์–ด์ฉŒ์ง€... ๋ณด์Šค๋ฅผ ์ฒ˜์น˜ํ•˜์ง€ ์•Š์œผ๋ฉด ๋ฐ–์œผ๋กœ ๋‚˜๊ฐˆ ์ˆ˜ ์—†์–ด.\n",
411
- "[2๊ถŒ_3ํ™”_ํ€˜์ŠคํŠธ ] #330 ๋Œ€์‚ฌ ๊ณ ์ž‘ ๋งˆ์ˆ˜ ๋‘ ๋งˆ๋ฆฌ ํ˜ผ์ž ์“ฐ๋Ÿฌ๋œจ๋ฆฐ ๊ฒƒ ๊ฐ€์ง€๊ณ  ๋„ˆ๋ฌด ํ˜ธ๋“ค๊ฐ‘์ธ๊ฐ€... ...\n",
412
- "\n",
413
- "[์„ฑ์ง„์šฐ ์‘๋‹ต]\n",
414
- "์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ์ฒ˜์น˜ํ•˜๋ฉด ๋ณด์Šค์ „์—์„œ ์œ ๋ฆฌํ•œ ์œ„์น˜๋ฅผ ์ฐจ์ง€ํ•  ์ˆ˜ ์žˆ์–ด. ๊ทธ๋“ค์„ ์ฒ˜์น˜ํ•˜๋Š” ๊ฒŒ ์ค‘์š”ํ•ด. ์ง€๊ธˆ์ด ๊ธฐํšŒ์•ผ. ์ตœ์„ ์„ ๋‹คํ•ด ๊ทธ๋“ค์„ ๋ฌผ๋ฆฌ์ณ์•ผ ํ•ด. ๊ทธ๋“ค์ด ์‚ด์•„๋‚จ์œผ๋ฉด ๋ณด์Šค์ „์—์„œ ํฐ ์œ„ํ˜‘์ด ๋  ์ˆ˜ ์žˆ์–ด. ๊ทธ๋“ค์˜ ์ƒ๋ช…์„ ๋Š์–ด์•ผ ํ•ด. ๊ทธ๋“ค์ด ๋” ์ด์ƒ ๋ฐฉํ•ดํ•˜์ง€ ์•Š๋„๋ก ํ•ด์•ผ ํ•ด. ๊ทธ๋“ค์„ ์ฒ˜์น˜ํ•˜๋Š” ๊ฒŒ ์šฐ๋ฆฌ์˜ ๋ชฉํ‘œ์•ผ. ๊ทธ๋“ค์ด ์šฐ๋ฆฌ์˜ ์ ์ด๋‹ˆ๊นŒ. ๊ทธ๋“ค์„ ๋ฌผ๋ฆฌ์น˜๋Š” ๊ฒŒ ์šฐ๋ฆฌ์˜ ์ž„๋ฌด์•ผ. ๊ทธ๋“ค์ด ์šฐ๋ฆฌ๋ฅผ ๋ฐฉํ•ดํ•˜์ง€ ์•Š๋„๋ก ํ•ด์•ผ ํ•ด. ๊ทธ๋“ค์„ ์ฒ˜์น˜ํ•˜๋Š” ๊ฒŒ ์šฐ๋ฆฌ์˜ ๊ธธ์ด์•ผ. ๊ทธ๋“ค์ด ์šฐ๋ฆฌ์˜ ๋ฐœ๋ชฉ์„ ์žก๊ณ  ์žˆ์–ด. ๊ทธ๋“ค์„ ์ฒ˜์น˜ํ•˜๋Š” ๊ฒŒ ์šฐ๋ฆฌ์˜ ์„ ํƒ์ด์•ผ. ๊ทธ๋“ค์ด ์šฐ๋ฆฌ์˜ ์ ์ด๋‹ˆ๊นŒ. ๊ทธ๋“ค์„ ๋ฌผ๋ฆฌ์น˜๋Š” ๊ฒŒ ์šฐ๋ฆฌ์˜ ์ž„๋ฌด์•ผ. ๊ทธ๋“ค์ด ์šฐ๋ฆฌ๋ฅผ ๋ฐฉํ•ดํ•˜์ง€ ์•Š๋„๋ก\n"
415
- ]
416
- }
417
- ],
418
- "source": [
419
- "choices = [\n",
420
- " \"1: ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.\",\n",
421
- " \"2: ์ง„ํ˜ธ๋ฅผ ํฌํ•จํ•œ ํ™ฉ๋™์„ ๋ฌด๋ฆฌ๋ฅผ ๋ชจ๋‘ ์ฒ˜์น˜ํ•œ๋‹ค.\",\n",
422
- " \"3: ์ „๋ถ€ ๊ธฐ์ ˆ ์‹œํ‚ค๊ณ  ์‚ด๋ ค๋‘”๋‹ค.\",\n",
423
- " \"4: ์‹œ์Šคํ…œ์„ ๊ฑฐ๋ถ€ํ•˜๊ณ  ๊ทธ๋ƒฅ ๋„๋ง์นœ๋‹ค.\"\n",
424
- "]\n",
425
- "\n",
426
- "print(\"\\n[์„ ํƒ์ง€]\")\n",
427
- "for idx, choice in enumerate(choices, start=1):\n",
428
- " print(f\"{idx}. {choice}\")\n",
429
- "\n",
430
- "user_idx = int(input(\"\\n์„ ํƒ ๋ฒˆํ˜ธ ์ž…๋ ฅ: \")) - 1\n",
431
- "user_choice = choices[user_idx]\n",
432
- "print(f\"\\n[์‚ฌ์šฉ์ž ์„ ํƒ]: {user_choice}\")\n",
433
- "\n",
434
- "result = qa_chain({\"query\": user_choice})\n",
435
- "\n",
436
- "retrieved_context = \"\\n\".join([doc.page_content for doc in result[\"source_documents\"]])\n",
437
- "print(\"\\n[๊ฒ€์ƒ‰๋œ ๊ทผ๊ฑฐ ๋ฌธ์„œ ์˜ˆ์‹œ]\")\n",
438
- "print(retrieved_context[:600], \"...\") \n",
439
- "\n",
440
- "prompt = f\"\"\"\n",
441
- "๋‹น์‹ ์€ ์›นํˆฐ '๋‚˜ ํ˜ผ์ž๋งŒ ๋ ˆ๋ฒจ์—…'์˜ ์„ฑ์ง„์šฐ์ž…๋‹ˆ๋‹ค.\n",
442
- "ํ˜„์žฌ ์ƒํ™ฉ:\n",
443
- "{retrieved_context}\n",
444
- "\n",
445
- "์‚ฌ์šฉ์ž ์„ ํƒ: {user_choice}\n",
446
- "\n",
447
- "์„ฑ์ง„์šฐ์˜ ๋งํˆฌ๋กœ ๊ฐ„๊ฒฐํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฌ์šด ๋Œ€์‚ฌ๋ฅผ 1~2๋ฌธ์žฅ ์ƒ์„ฑํ•˜์„ธ์š”.\n",
448
- "์ค‘๋ณต๋œ ๋‚ด์šฉ์ด๋‚˜ ๋น„์Šทํ•œ ๋ฌธ์žฅ์€ ๋งŒ๋“ค์ง€ ๋งˆ์„ธ์š”.\n",
449
- "\"\"\"\n",
450
- "\n",
451
- "response = generator(prompt, \n",
452
- " max_new_tokens=200, \n",
453
- " do_sample=True, \n",
454
- " temperature=0.6,\n",
455
- " top_p = 0.9,\n",
456
- " return_full_text=False \n",
457
- ")[0][\"generated_text\"]\n",
458
- "print(\"\\n[์„ฑ์ง„์šฐ ์‘๋‹ต]\")\n",
459
- "print(response)\n"
460
- ]
461
- },
462
- {
463
- "cell_type": "code",
464
- "execution_count": null,
465
- "id": "7b183dc5-56be-4464-b737-00f11b30bbd0",
466
- "metadata": {},
467
- "outputs": [],
468
- "source": []
469
- },
470
- {
471
- "cell_type": "markdown",
472
- "id": "495143f6-df63-496d-a35c-4fff9f40f6b5",
473
- "metadata": {},
474
- "source": [
475
- "## "
476
- ]
477
- }
478
- ],
479
- "metadata": {
480
- "kernelspec": {
481
- "display_name": "Python (ka)",
482
- "language": "python",
483
- "name": "myenv"
484
- },
485
- "language_info": {
486
- "codemirror_mode": {
487
- "name": "ipython",
488
- "version": 3
489
- },
490
- "file_extension": ".py",
491
- "mimetype": "text/x-python",
492
- "name": "python",
493
- "nbconvert_exporter": "python",
494
- "pygments_lexer": "ipython3",
495
- "version": "3.10.12"
496
- }
497
- },
498
- "nbformat": 4,
499
- "nbformat_minor": 5
500
- }