ginipick committed on
Commit
d2a7d2b
·
verified ·
1 Parent(s): 825c5f8

Delete app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +0 -332
app-backup.py DELETED
@@ -1,332 +0,0 @@
1
import io
import json
import os
import traceback
from typing import Dict, Iterator, List, Optional, Tuple

import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
10
# Module-level inference client used by respond(); the access token is taken
# from the HF_TOKEN environment variable (None when the variable is unset).
hf_client = InferenceClient(
    model="CohereForAI/c4ai-command-r-plus-08-2024",
    token=os.getenv("HF_TOKEN"),
)
14
-
15
def load_code(filename: str) -> str:
    """Return the UTF-8 text stored in ``filename``.

    Errors are reported through the return value rather than raised: a
    missing file yields a message that starts with the file name, and any
    other failure yields a generic message that embeds the exception text.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as source:
            contents = source.read()
    except FileNotFoundError:
        return f"{filename} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
    except Exception as err:
        return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {err}"
    return contents
23
-
24
def load_parquet(filename: str) -> str:
    """Return a Markdown table previewing the first 10 rows of a Parquet file.

    The file is read with the ``pyarrow`` engine. Failures are returned as
    message strings instead of being raised: one message for a missing file,
    and a generic one (embedding the exception text) for anything else.
    """
    try:
        frame = pd.read_parquet(filename, engine='pyarrow')
        preview_rows = frame.head(10)
        return preview_rows.to_markdown(index=False)
    except FileNotFoundError:
        return f"{filename} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
    except Exception as err:
        return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {err}"
32
-
33
-
34
def respond(
    message: str,
    history: List[Dict[str, str]],
    system_message: str = "",
    max_tokens: int = 4000,
    temperature: float = 0.5,
    top_p: float = 0.9,
    parquet_data: Optional[str] = None,
) -> Iterator[str]:
    """Stream the model's answer to ``message`` as a progressively longer string.

    This is a generator. A plain-text prompt is assembled from the system
    prompt, the earlier turns and the new message, and sent to the module-level
    ``hf_client`` with streaming enabled.

    Args:
        message: The new user message; it is appended to the prompt here, so
            ``history`` should hold only the earlier turns.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
            ``None`` is treated as an empty history.
        system_message: System prompt used when no data is supplied; a
            built-in default replaces an empty value.
        max_tokens: Forwarded to the client as ``max_new_tokens``.
        temperature: Sampling temperature forwarded to the client.
        top_p: Nucleus-sampling value forwarded to the client.
        parquet_data: JSON text of the uploaded dataset. When given, a fixed
            data-analysis system prompt plus a ``DataFrame.describe`` summary
            is used and ``system_message`` is ignored.

    Yields:
        The text accumulated so far after every non-empty streamed chunk, or a
        single error message (including the traceback) if inference fails.
    """
    # Choose the system prompt.
    if parquet_data:
        system_prefix = """λ°˜λ“œμ‹œ ν•œκΈ€λ‘œ λ‹΅λ³€ν•  것. λ„ˆλŠ” μ—…λ‘œλ“œλœ 데이터λ₯Ό 기반으둜 μ§ˆλ¬Έμ— λ‹΅λ³€ν•˜λŠ” 역할을 ν•œλ‹€. 데이터λ₯Ό λΆ„μ„ν•˜μ—¬ μ‚¬μš©μžμ—κ²Œ 도움이 λ˜λŠ” 정보λ₯Ό μ œκ³΅ν•˜λΌ. 데이터λ₯Ό ν™œμš©ν•˜μ—¬ μƒμ„Έν•˜κ³  μ •ν™•ν•œ 닡변을 μ œκ³΅ν•˜λ˜, λ―Όκ°ν•œ μ •λ³΄λ‚˜ 개인 정보λ₯Ό λ…ΈμΆœν•˜μ§€ 마라."""
        try:
            df = pd.read_json(io.StringIO(parquet_data))
            # Give the model a statistical summary instead of the raw rows.
            data_summary = df.describe(include='all').to_string()
            system_prefix += f"\n\nμ—…λ‘œλ“œλœ λ°μ΄ν„°μ˜ μš”μ•½ 정보:\n{data_summary}"
        except Exception as e:
            # Best effort: keep answering even when the data cannot be parsed.
            print(f"데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
            system_prefix += "\n\n데이터λ₯Ό λ‘œλ“œν•˜λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."
    else:
        system_prefix = system_message or "λ„ˆλŠ” AI μ‘°μ–Έμž 역할이닀."

    # Assemble the prompt from parts and join once instead of repeated +=.
    prompt_parts = [system_prefix, "\n\n"]
    for chat in history or []:
        if chat['role'] == 'user':
            prompt_parts.append(f"μ‚¬μš©μž: {chat['content']}\n")
        else:
            prompt_parts.append(f"AI: {chat['content']}\n")
    prompt_parts.append(f"μ‚¬μš©μž: {message}\nAI:")
    prompt = "".join(prompt_parts)

    try:
        # Send the prompt and relay the streamed chunks as they arrive.
        response = ""
        stream = hf_client.text_generation(
            prompt=prompt,
            max_new_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        for msg in stream:
            if msg:
                response += msg
                yield response
    except Exception as e:
        error_message = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        yield error_message
84
-
85
-
86
def upload_csv(file_path: str) -> Tuple[str, str]:
    """Validate a CSV file and convert it to a Parquet file.

    The CSV must contain the columns ``id``, ``text``, ``label`` and
    ``metadata``. Duplicate rows are dropped, missing values become empty
    strings, and the columns are cast to compact dtypes before the frame is
    written (snappy-compressed, pyarrow engine) to the current working
    directory under the CSV's base name with a ``.parquet`` extension.

    Args:
        file_path: Path of the CSV file to convert.

    Returns:
        ``(status_message, parquet_filename)``. On any failure, including
        missing columns, the second element is an empty string. Nothing is
        raised.
    """
    try:
        # Read the CSV file.
        df = pd.read_csv(file_path, sep=',')
        # Check the required columns; sorting keeps the message deterministic
        # (joining a set directly gives an arbitrary order).
        required_columns = {'id', 'text', 'label', 'metadata'}
        missing_columns = sorted(required_columns - set(df.columns))
        if missing_columns:
            return f"CSV νŒŒμΌμ— λ‹€μŒ ν•„μˆ˜ 컬럼이 λˆ„λ½λ˜μ—ˆμŠ΅λ‹ˆλ‹€: {', '.join(missing_columns)}", ""
        # Clean the data.
        df = df.drop_duplicates()
        df = df.fillna('')
        # Use compact dtypes.
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
        # Write the Parquet file next to the running app.
        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        return f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ μ—…λ‘œλ“œλ˜κ³  λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_filename
    except Exception as e:
        return f"CSV 파일 μ—…λ‘œλ“œ 및 λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", ""
107
-
108
def upload_parquet(file_path: str) -> Tuple[str, str, str]:
    """Load a Parquet file and return a status, a preview and its JSON form.

    Returns:
        ``(status_message, markdown_preview, records_json)`` where the preview
        covers the first 10 rows and the JSON holds every row
        (``orient='records'``, non-ASCII characters kept as-is). If anything
        fails, the last two elements are empty strings and the message embeds
        the exception text.
    """
    try:
        frame = pd.read_parquet(file_path, engine='pyarrow')
        # Markdown table used as the on-screen preview.
        preview_md = frame.head(10).to_markdown(index=False)
        # Full dataset serialised as a JSON string.
        records_json = frame.to_json(orient='records', force_ascii=False)
    except Exception as err:
        return f"Parquet 파일 μ—…λ‘œλ“œ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {err}", "", ""
    return "Parquet 파일이 μ„±κ³΅μ μœΌλ‘œ μ—…λ‘œλ“œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.", preview_md, records_json
119
-
120
def text_to_parquet(text: str) -> Tuple[str, str, str]:
    """Convert comma-separated text rows into ``text_to_parquet.parquet``.

    Each non-blank line must hold exactly four comma-separated fields, in the
    order ``id,text,label,metadata``; ``id`` must parse as an integer.

    Args:
        text: The raw multi-line input. Blank lines are ignored and Windows
            line endings are accepted.

    Returns:
        ``(status_message, markdown_preview, parquet_filename)``. On failure
        the last two elements are empty strings and the message embeds the
        exception text. Nothing is raised.
    """
    try:
        # Convert the text to rows (one row per line, fields split on commas).
        # Drop the '\r' left by CRLF line endings and skip blank lines, both
        # of which would otherwise corrupt or abort the conversion.
        data = [
            line.rstrip('\r').split(',')
            for line in text.strip().split('\n')
            if line.strip()
        ]
        if not data:
            raise ValueError("no data rows found in the input text")
        df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
        # Use compact dtypes.
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
        # Write the Parquet file. NOTE(review): the file name is fixed, so
        # concurrent conversions overwrite each other's output.
        parquet_filename = 'text_to_parquet.parquet'
        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
        # Preview the file that was just written.
        parquet_content = load_parquet(parquet_filename)
        return f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_content, parquet_filename
    except Exception as e:
        return f"ν…μŠ€νŠΈ λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", "", ""
135
-
136
- # CSS settings
137
- css = """
138
- footer {
139
- visibility: hidden;
140
- }
141
- #chatbot-container, #chatbot-data-upload {
142
- height: 700px;
143
- overflow-y: scroll;
144
- }
145
- #chatbot-container .message, #chatbot-data-upload .message {
146
- font-size: 14px;
147
- }
148
- /* μž…λ ₯μ°½ 배경색 및 κΈ€μžμƒ‰ λ³€κ²½ */
149
- textarea, input[type="text"] {
150
- background-color: #ffffff; /* 흰색 λ°°κ²½ */
151
- color: #000000; /* 검정색 κΈ€μž */
152
- }
153
- /* 파일 μ—…λ‘œλ“œ μ˜μ—­ 높이 쑰절 */
154
- #parquet-upload-area {
155
- max-height: 150px;
156
- overflow-y: auto;
157
- }
158
- /* 초기 μ„€λͺ… 글씨 크기 쑰절 */
159
- #initial-description {
160
- font-size: 14px;
161
- }
162
- """
163
-
164
-
165
- # Set up the Gradio Blocks interface
166
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
167
- gr.Markdown("# My RAG: LLM이 λ‚˜λ§Œμ˜ λ°μ΄ν„°λ‘œ ν•™μŠ΅ν•œ μ½˜ν…μΈ  생성/λ‹΅λ³€", elem_id="initial-description")
168
- gr.Markdown(
169
- "### 1) λ‚˜λ§Œμ˜ 데이터λ₯Ό μž…λ ₯ λ˜λŠ” CSV μ—…λ‘œλ“œλ‘œ Parquet 데이터셋 μžλ™ λ³€ν™˜ 2) Parquet 데이터셋을 μ—…λ‘œλ“œν•˜λ©΄, LLM이 맞좀 ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬ 응닡\n"
170
- "### Tip) '예제'λ₯Ό 톡해 λ‹€μ–‘ν•œ ν™œμš© 방법을 μ²΄ν—˜ν•˜κ³  μ‘μš©ν•΄ λ³΄μ„Έμš”, 데이터셋 μ—…λ‘œλ“œμ‹œ λ―Έλ¦¬λ³΄κΈ°λŠ” 10건만 좜λ ₯",
171
- elem_id="initial-description"
172
- )
173
-
174
- # First tab: chatbot with data upload (tab renamed to "My dataset+LLM")
175
- with gr.Tab("My 데이터셋+LLM"):
176
- gr.Markdown("### LLMκ³Ό λŒ€ν™”ν•˜κΈ°")
177
- chatbot_data_upload = gr.Chatbot(label="챗봇", type="messages", elem_id="chatbot-data-upload")
178
- msg_data_upload = gr.Textbox(label="λ©”μ‹œμ§€ μž…λ ₯", placeholder="여기에 λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”...")
179
- send_data_upload = gr.Button("전솑")
180
-
181
- with gr.Accordion("μ‹œμŠ€ν…œ ν”„λ‘¬ν”„νŠΈ 및 μ˜΅μ…˜ μ„€μ •", open=False):
182
- system_message = gr.Textbox(label="System Message", value="λ„ˆλŠ” AI μ‘°μ–Έμž 역할이닀.")
183
- max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
184
- temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
185
- top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
186
-
187
- parquet_data_state = gr.State()
188
-
189
def handle_message_data_upload(
    message: str,
    history: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    parquet_data: str
):
    """Stream one chat turn into the chatbot and clear the input box.

    Generator used as the click handler of the send button. Every yielded
    value is ``(chat_history, "")``: the updated message list for the chatbot
    and an empty string for the message textbox.

    Args:
        message: Text the user just submitted.
        history: Current chatbot messages (dicts with ``role`` and
            ``content``); ``None`` is treated as empty.
        system_message, max_tokens, temperature, top_p: Forwarded unchanged
            to ``respond``.
        parquet_data: JSON text of the uploaded dataset, or a falsy value when
            nothing has been uploaded.
    """
    history = history or []
    try:
        # respond() appends `message` to the prompt itself, so it must only be
        # given the earlier turns; passing the list that already contains the
        # new message would put that message into the prompt twice.
        earlier_turns = list(history)
        # Add the user's message to the visible history.
        history.append({"role": "user", "content": message})
        # Generate the response.
        response_gen = respond(
            message, earlier_turns, system_message, max_tokens, temperature, top_p, parquet_data
        )
        partial_response = ""
        for partial in response_gen:
            partial_response = partial
            # Show the conversation with the answer produced so far.
            display_history = history + [
                {"role": "assistant", "content": partial_response}
            ]
            yield display_history, ""
        # Record the assistant's final answer in the history.
        history.append({"role": "assistant", "content": partial_response})
        # Emit the final state so the UI updates even if nothing was streamed.
        yield history, ""
    except Exception as e:
        response = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
        history.append({"role": "assistant", "content": response})
        yield history, ""
220
-
221
- send_data_upload.click(
222
- handle_message_data_upload,
223
- inputs=[
224
- msg_data_upload,
225
- chatbot_data_upload,
226
- system_message,
227
- max_tokens,
228
- temperature,
229
- top_p,
230
- parquet_data_state, # parquet_data_stateλ₯Ό μ‚¬μš©ν•˜μ—¬ μ—…λ‘œλ“œλœ 데이터λ₯Ό 전달
231
- ],
232
- outputs=[chatbot_data_upload, msg_data_upload],
233
- queue=True
234
- )
235
-
236
- # Add examples
237
- with gr.Accordion("예제", open=False):
238
- gr.Examples(
239
- examples=[
240
- ["μ—…λ‘œλ“œλœ 데이터셋에 λŒ€ν•΄ μš”μ•½ μ„€λͺ…ν•˜λΌ."],
241
- ["μ—…λ‘œλ“œλœ 데이터셋 νŒŒμΌμ„ ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬, λ³Έ μ„œλΉ„μŠ€λ₯Ό SEO μ΅œμ ν™”ν•˜μ—¬ λΈ”λ‘œκ·Έ 포슀트(κ°œμš”, λ°°κ²½ 및 ν•„μš”μ„±, κΈ°μ‘΄ μœ μ‚¬ μ œν’ˆ/μ„œλΉ„μŠ€μ™€ λΉ„κ΅ν•˜μ—¬ 특μž₯점, ν™œμš©μ²˜, κ°€μΉ˜, κΈ°λŒ€νš¨κ³Ό, 결둠을 포함)둜 4000 토큰 이상 μž‘μ„±ν•˜λΌ"],
242
- ["μ—…λ‘œλ“œλœ 데이터셋 νŒŒμΌμ„ ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬, μ‚¬μš© 방법과 차별점, νŠΉμ§•, 강점을 μ€‘μ‹¬μœΌλ‘œ 4000 토큰 이상 유튜브 μ˜μƒ 슀크립트 ν˜•νƒœλ‘œ μž‘μ„±ν•˜λΌ"],
243
- ["μ—…λ‘œλ“œλœ 데이터셋 νŒŒμΌμ„ ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬, μ œν’ˆ 상세 νŽ˜μ΄μ§€ ν˜•μ‹μ˜ λ‚΄μš©μ„ 4000 토큰 이상 μžμ„Ένžˆ μ„€λͺ…ν•˜λΌ"],
244
- ["μ—…λ‘œλ“œλœ 데이터셋 νŒŒμΌμ„ ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬, FAQ 20건을 μƒμ„Έν•˜κ²Œ μž‘μ„±ν•˜λΌ. 4000토큰 이상 μ‚¬μš©ν•˜λΌ."],
245
- ["μ—…λ‘œλ“œλœ 데이터셋 νŒŒμΌμ„ ν•™μŠ΅ λ°μ΄ν„°λ‘œ ν™œμš©ν•˜μ—¬, νŠΉν—ˆ μΆœμ›μ— ν™œμš©ν•  기술 및 λΉ„μ¦ˆλ‹ˆμŠ€ λͺ¨λΈ 츑면을 ν¬ν•¨ν•˜μ—¬ νŠΉν—ˆ μΆœμ›μ„œ ꡬ성에 맞게 ν˜μ‹ μ μΈ 창의 발λͺ… λ‚΄μš©μ„ μ€‘μ‹¬μœΌλ‘œ 4000 토큰 이상 μž‘μ„±ν•˜λΌ."],
246
- ],
247
- inputs=msg_data_upload,
248
- label="예제 선택",
249
- )
250
-
251
- # Parquet file upload, moved to the bottom of the screen
252
- gr.Markdown("### Parquet 파일 μ—…λ‘œλ“œ")
253
- with gr.Row():
254
- with gr.Column():
255
- parquet_upload = gr.File(
256
- label="Parquet 파일 μ—…λ‘œλ“œ", type="filepath", elem_id="parquet-upload-area"
257
- )
258
- parquet_upload_button = gr.Button("μ—…λ‘œλ“œ")
259
- parquet_upload_status = gr.Textbox(label="μ—…λ‘œλ“œ μƒνƒœ", interactive=False)
260
- parquet_preview_chat = gr.Markdown(label="Parquet 파일 미리보기")
261
-
262
def handle_parquet_upload(file_path: str):
    """Load an uploaded Parquet file for the chat tab.

    Returns ``(status_message, markdown_preview, records_json)``; when
    ``upload_parquet`` produces no JSON, the preview and JSON are returned as
    empty strings alongside its status message.
    """
    status, preview_md, records_json = upload_parquet(file_path)
    if not records_json:
        return status, "", ""
    return status, preview_md, records_json
268
-
269
- parquet_upload_button.click(
270
- handle_parquet_upload,
271
- inputs=parquet_upload,
272
- outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
273
- )
274
-
275
- # 두 번째 νƒ­: 데이터 λ³€ν™˜ (νƒ­ 이름 λ³€κ²½: "CSV to My 데이터셋")
276
- with gr.Tab("CSV to My 데이터셋"):
277
- gr.Markdown("### CSV 파일 μ—…λ‘œλ“œ 및 Parquet λ³€ν™˜")
278
- with gr.Row():
279
- with gr.Column():
280
- csv_file = gr.File(label="CSV 파일 μ—…λ‘œλ“œ", type="filepath")
281
- upload_button = gr.Button("μ—…λ‘œλ“œ 및 λ³€ν™˜")
282
- upload_status = gr.Textbox(label="μ—…λ‘œλ“œ μƒνƒœ", interactive=False)
283
- parquet_preview = gr.Markdown(label="Parquet 파일 미리보기")
284
- download_button = gr.File(label="Parquet 파일 λ‹€μš΄λ‘œλ“œ", interactive=False)
285
-
286
def handle_csv_upload(file_path: str):
    """Convert an uploaded CSV to Parquet and preview the result.

    Returns ``(status_message, markdown_preview, parquet_filename)``. When
    ``upload_csv`` reports no output file, the preview is an empty string and
    the file name is ``None``.
    """
    status, saved_name = upload_csv(file_path)
    if not saved_name:
        return status, "", None
    preview_md = load_parquet(saved_name)
    return status, preview_md, saved_name
293
-
294
- upload_button.click(
295
- handle_csv_upload,
296
- inputs=csv_file,
297
- outputs=[upload_status, parquet_preview, download_button]
298
- )
299
-
300
- # Third tab: text to CSV to Parquet conversion (tab renamed to "Text to My dataset")
301
- with gr.Tab("Text to My 데이터셋"):
302
- gr.Markdown("### ν…μŠ€νŠΈλ₯Ό μž…λ ₯ν•˜λ©΄ CSV둜 λ³€ν™˜ ν›„ Parquet으둜 μžλ™ μ „ν™˜λ©λ‹ˆλ‹€.")
303
- with gr.Row():
304
- with gr.Column():
305
- text_input = gr.Textbox(
306
- label="ν…μŠ€νŠΈ μž…λ ₯ (각 행은 `id,text,label,metadata` ν˜•μ‹μœΌλ‘œ μž…λ ₯)",
307
- lines=10,
308
- placeholder="예: 1,μ΄μˆœμ‹ ,μž₯κ΅°,거뢁선\n2,원균,μž₯κ΅°,λͺ¨ν•¨\n3,μ„ μ‘°,μ™•,μ‹œκΈ°\n4,λ„μš”ν† λ―Έ νžˆλ°μš”μ‹œ,μ™•,침랡"
309
- )
310
- convert_button = gr.Button("λ³€ν™˜ 및 λ‹€μš΄λ‘œλ“œ")
311
- convert_status = gr.Textbox(label="λ³€ν™˜ μƒνƒœ", interactive=False)
312
- parquet_preview_convert = gr.Markdown(label="Parquet 파일 미리보기")
313
- download_parquet_convert = gr.File(label="Parquet 파일 λ‹€μš΄λ‘œλ“œ", interactive=False)
314
-
315
def handle_text_to_parquet(text: str):
    """Convert the entered text to Parquet for the text-conversion tab.

    Returns ``(status_message, markdown_preview, parquet_filename)``. When
    ``text_to_parquet`` reports no output file, the preview is an empty string
    and the file name is ``None``.
    """
    status, preview_md, saved_name = text_to_parquet(text)
    if not saved_name:
        return status, "", None
    return status, preview_md, saved_name
321
-
322
- convert_button.click(
323
- handle_text_to_parquet,
324
- inputs=text_input,
325
- outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
326
- )
327
-
328
- gr.Markdown("### [email protected]", elem_id="initial-description")
329
-
330
- if __name__ == "__main__":
331
- demo.launch()
332
-