ginipick committed on
Commit
b50c10b
•
1 Parent(s): 6182aa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -50
app.py CHANGED
@@ -145,49 +145,56 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
145
  return error_message, "", ""
146
 
147
  def preprocess_text_with_llm(input_text: str) -> str:
148
- # LLM์—๊ฒŒ ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜๋„๋ก ์š”์ฒญ
149
- system_prompt = """๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ๊ธด ํ…์ŠคํŠธ๋ฅผ ์•„๋ž˜์™€ ๊ฐ™์€ ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ์ „์ฒ˜๋ฆฌํ•˜์„ธ์š”:
150
-
151
- - **๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹:** `id,text,label,metadata`
152
- - **๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„๋˜๊ณ **, ํ•„๋“œ๋Š” ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„๋ฉ๋‹ˆ๋‹ค.
153
- - **ํ…์ŠคํŠธ๋‚˜ ๋‹ค๋ฅธ ํ•„๋“œ ๋‚ด์— ์‰ผํ‘œ๊ฐ€ ์žˆ์„ ๊ฒฝ์šฐ**, ํ•ด๋‹น ํ•„๋“œ๋ฅผ ํฐ๋”ฐ์˜ดํ‘œ(")๋กœ ๊ฐ์‹ธ์„ธ์š”.
154
- - **ํ•„๋“œ ๋‚ด์— ํฐ๋”ฐ์˜ดํ‘œ๊ฐ€ ์žˆ์„ ๊ฒฝ์šฐ**, ๋ฐฑ์Šฌ๋ž˜์‹œ(\\)๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌํ•˜์„ธ์š”. ์˜ˆ: \"
155
- - ํ…์ŠคํŠธ๋ฅผ **์˜๋ฏธ ๋‹จ์œ„๋กœ ๋ถ„ํ• **ํ•˜๊ณ , ๊ฐ ๋ฌธ์žฅ์— ๋Œ€ํ•ด **1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์—ฐ์†๋œ id**๋ฅผ ๋ถ€์—ฌํ•˜์„ธ์š”.
156
- - ๊ฐ ๋ฌธ์žฅ์— ๋Œ€ํ•ด **์ ์ ˆํ•œ label(์นดํ…Œ๊ณ ๋ฆฌ)**์„ ์ง€์ •ํ•˜์„ธ์š”. ์˜ˆ: "๊ธฐ์ˆ ", "์‚ฌํšŒ", "๊ฒฝ์ œ"
157
- - **metadata**์—๋Š” ์ถœ์ฒ˜๋‚˜ ๋‚ ์งœ ๋“ฑ์˜ ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ํฌํ•จํ•˜์„ธ์š”.
158
- - ์ตœ์ข… ๊ฒฐ๊ณผ๋Š” **๊ฐ ํ–‰์ด `id,text,label,metadata` ํ˜•์‹์˜ CSV**๊ฐ€ ๋˜๋„๋ก ํ•˜์„ธ์š”.
159
-
160
- **์˜ˆ์‹œ:**
 
 
 
 
161
 
162
  ์ž…๋ ฅ ํ…์ŠคํŠธ:
 
163
 
164
- "์˜ค๋Š˜์€ ๋‚ ์”จ๊ฐ€ ์ข‹๋‹ค. ๋‚ด์ผ์€ ๋น„๊ฐ€ ์˜ฌ ์˜ˆ์ •์ด๋‹ค."
165
-
166
- ์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹:
167
- 1,"์˜ค๋Š˜์€ ๋‚ ์”จ๊ฐ€ ์ข‹๋‹ค.","๋‚ ์”จ","2023-10-05"
168
- 2,"๋‚ด์ผ์€ ๋น„๊ฐ€ ์˜ฌ ์˜ˆ์ •์ด๋‹ค.","๋‚ ์”จ","2023-10-05"
169
-
170
- **์ด์ œ ์•„๋ž˜์˜ ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์ฒ˜๋ฆฌํ•˜์„ธ์š”:**
171
-
172
- """ + input_text
173
 
174
- # LLM ํ˜ธ์ถœ ๋ฐ ์‘๋‹ต ์ฒ˜๋ฆฌ
175
  try:
176
  response = ""
177
  stream = hf_client.text_generation(
178
- prompt=system_prompt,
179
- max_new_tokens=2000,
180
- temperature=0.5,
181
  top_p=0.9,
182
  stream=True,
183
  )
 
184
  for msg in stream:
185
  if msg:
186
  response += msg
187
- # ๋””๋ฒ„๊น…: LLM์˜ ์‘๋‹ต ์ถœ๋ ฅ
188
- print("LLM ์‘๋‹ต:\n", response)
189
  processed_text = response.strip()
190
- return processed_text
 
 
 
 
 
 
 
 
 
 
191
  except Exception as e:
192
  error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}\n{traceback.format_exc()}"
193
  print(error_message)
@@ -221,8 +228,6 @@ textarea, input[type="text"] {
221
  }
222
  """
223
 
224
-
225
-
226
  # Gradio Blocks ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ •
227
  with gr.Blocks(css=css) as demo:
228
  gr.Markdown("# My RAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
@@ -232,6 +237,8 @@ with gr.Blocks(css=css) as demo:
232
  elem_id="initial-description"
233
  )
234
 
 
 
235
  # ์ฒซ ๋ฒˆ์งธ ํƒญ: ์ฑ—๋ด‡ ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "My ๋ฐ์ดํ„ฐ์…‹+LLM")
236
  with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
237
  gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
@@ -386,7 +393,7 @@ with gr.Blocks(css=css) as demo:
386
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
387
  )
388
 
389
- # ๋„ค ๋ฒˆ์งธ ํƒญ: ํ…์ŠคํŠธ๋ฅผ ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ์ „์ฒ˜๋ฆฌ (ํƒญ ์ด๋ฆ„: "Text Preprocessing with LLM")
390
  with gr.Tab("Text Preprocessing with LLM"):
391
  gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
392
  with gr.Row():
@@ -396,37 +403,98 @@ with gr.Blocks(css=css) as demo:
396
  lines=15,
397
  placeholder="์—ฌ๊ธฐ์— ์ „์ฒ˜๋ฆฌํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”..."
398
  )
399
- preprocess_button = gr.Button("์ „์ฒ˜๋ฆฌ ์‹คํ–‰")
400
- preprocess_status = gr.Textbox(label="์ „์ฒ˜๋ฆฌ ์ƒํƒœ", interactive=False)
 
 
 
 
 
 
 
 
 
401
  processed_text_output = gr.Textbox(
402
  label="์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ์ถœ๋ ฅ",
403
  lines=15,
404
  interactive=False
405
  )
 
 
 
 
 
 
 
 
406
 
407
  def handle_text_preprocessing(input_text: str):
408
- preprocess_status.value = "์ „์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š”..."
409
- processed_text = preprocess_text_with_llm(input_text)
410
- preprocess_status.value = "์ „์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
411
- return preprocess_status.value, processed_text
412
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  preprocess_button.click(
414
  handle_text_preprocessing,
415
  inputs=[raw_text_input],
416
- outputs=[preprocess_status, processed_text_output]
 
417
  )
418
 
419
- gr.Markdown("### [email protected]", elem_id="initial-description")
420
-
421
- if __name__ == "__main__":
422
- demo.launch()
423
-
424
-
425
-
426
-
427
-
428
-
429
 
 
 
 
 
 
430
 
 
 
 
 
 
 
 
 
 
 
431
 
 
432
 
 
 
 
145
  return error_message, "", ""
146
 
147
  def preprocess_text_with_llm(input_text: str) -> str:
148
+ if not input_text.strip():
149
+ return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
150
+
151
+ system_prompt = """๋‹น์‹ ์€ ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์ž…๋ ฅ๋œ ํ…์ŠคํŠธ๋ฅผ CSV ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜์„ธ์š”.
152
+
153
+ ๊ทœ์น™:
154
+ 1. ์ถœ๋ ฅ ํ˜•์‹: id,text,label,metadata
155
+ 2. id: 1๋ถ€ํ„ฐ ์‹œ์ž‘ํ•˜๋Š” ์ˆœ์ฐจ์  ๋ฒˆํ˜ธ
156
+ 3. text: ์˜๋ฏธ ์žˆ๋Š” ๋‹จ์œ„๋กœ ๋ถ„๋ฆฌ๋œ ํ…์ŠคํŠธ
157
+ 4. label: ํ…์ŠคํŠธ์˜ ์ฃผ์ œ๋‚˜ ์นดํ…Œ๊ณ ๋ฆฌ
158
+ 5. metadata: ์ถ”๊ฐ€ ์ •๋ณด(๋‚ ์งœ, ์ถœ์ฒ˜ ๋“ฑ)
159
+
160
+ ์ฃผ์˜์‚ฌํ•ญ:
161
+ - ํ…์ŠคํŠธ์— ์‰ผํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํฐ๋”ฐ์˜ดํ‘œ๋กœ ๊ฐ์‹ธ๊ธฐ
162
+ - ํฐ๋”ฐ์˜ดํ‘œ๋Š” ๋ฐฑ์Šฌ๋ž˜์‹œ๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
163
+ - ๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„
164
+ - ๋ชจ๋“  ํ•„๋“œ๋Š” ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„
165
 
166
  ์ž…๋ ฅ ํ…์ŠคํŠธ:
167
+ """
168
 
169
+ full_prompt = f"{system_prompt}\n\n{input_text}\n\n์ถœ๋ ฅ:"
 
 
 
 
 
 
 
 
170
 
 
171
  try:
172
  response = ""
173
  stream = hf_client.text_generation(
174
+ prompt=full_prompt,
175
+ max_new_tokens=4000, # ํ† ํฐ ์ˆ˜ ์ฆ๊ฐ€
176
+ temperature=0.3, # ๋” ๊ฒฐ์ •์ ์ธ ์ถœ๋ ฅ์„ ์œ„ํ•ด ๋‚ฎ์ถค
177
  top_p=0.9,
178
  stream=True,
179
  )
180
+
181
  for msg in stream:
182
  if msg:
183
  response += msg
184
+
185
+ # ์‘๋‹ต ์ •์ œ
186
  processed_text = response.strip()
187
+
188
+ # CSV ํ˜•์‹ ๊ฒ€์ฆ
189
+ try:
190
+ # StringIO๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ CSV ํ˜•์‹ ๊ฒ€์ฆ
191
+ from io import StringIO
192
+ import csv
193
+ csv.reader(StringIO(processed_text))
194
+ return processed_text
195
+ except csv.Error:
196
+ return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
197
+
198
  except Exception as e:
199
  error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}\n{traceback.format_exc()}"
200
  print(error_message)
 
228
  }
229
  """
230
 
 
 
231
  # Gradio Blocks ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ •
232
  with gr.Blocks(css=css) as demo:
233
  gr.Markdown("# My RAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
 
237
  elem_id="initial-description"
238
  )
239
 
240
+
241
+
242
  # ์ฒซ ๋ฒˆ์งธ ํƒญ: ์ฑ—๋ด‡ ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "My ๋ฐ์ดํ„ฐ์…‹+LLM")
243
  with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
244
  gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
 
393
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
394
  )
395
 
396
+ # ๋„ค ๋ฒˆ์งธ ํƒญ: ํ…์ŠคํŠธ๋ฅผ ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ์ „์ฒ˜๋ฆฌ (๊ฐœ์„ ๋œ ๋ฒ„์ „)
397
  with gr.Tab("Text Preprocessing with LLM"):
398
  gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
399
  with gr.Row():
 
403
  lines=15,
404
  placeholder="์—ฌ๊ธฐ์— ์ „์ฒ˜๋ฆฌํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”..."
405
  )
406
+
407
+ with gr.Row():
408
+ preprocess_button = gr.Button("์ „์ฒ˜๋ฆฌ ์‹คํ–‰", variant="primary")
409
+ clear_button = gr.Button("์ดˆ๊ธฐํ™”")
410
+
411
+ preprocess_status = gr.Textbox(
412
+ label="์ „์ฒ˜๋ฆฌ ์ƒํƒœ",
413
+ interactive=False,
414
+ value="๋Œ€๊ธฐ ์ค‘..."
415
+ )
416
+
417
  processed_text_output = gr.Textbox(
418
  label="์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ์ถœ๋ ฅ",
419
  lines=15,
420
  interactive=False
421
  )
422
+
423
+ # Parquet ๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ ์„น์…˜
424
+ with gr.Row():
425
+ convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜", visible=True)
426
+ download_parquet = gr.File(
427
+ label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ",
428
+ visible=False
429
+ )
430
 
431
  def handle_text_preprocessing(input_text: str):
432
+ if not input_text.strip():
433
+ return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", ""
434
+
435
+ try:
436
+ preprocess_status_msg = "์ „์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."
437
+ yield preprocess_status_msg, ""
438
+
439
+ processed_text = preprocess_text_with_llm(input_text)
440
+
441
+ if processed_text:
442
+ preprocess_status_msg = "์ „์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
443
+ yield preprocess_status_msg, processed_text
444
+ else:
445
+ preprocess_status_msg = "์ „์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."
446
+ yield preprocess_status_msg, ""
447
+
448
+ except Exception as e:
449
+ error_msg = f"์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
450
+ yield error_msg, ""
451
+
452
+ def clear_inputs():
453
+ return "", "๋Œ€๊ธฐ ์ค‘...", ""
454
+
455
+ def convert_to_parquet_file(processed_text: str):
456
+ if not processed_text.strip():
457
+ return "๋ณ€ํ™˜ํ•  ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", None
458
+
459
+ try:
460
+ message, parquet_content, parquet_filename = text_to_parquet(processed_text)
461
+ if parquet_filename:
462
+ return message, parquet_filename
463
+ return message, None
464
+ except Exception as e:
465
+ return f"Parquet ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}", None
466
+
467
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ ์—ฐ๊ฒฐ
468
  preprocess_button.click(
469
  handle_text_preprocessing,
470
  inputs=[raw_text_input],
471
+ outputs=[preprocess_status, processed_text_output],
472
+ queue=True
473
  )
474
 
475
+ clear_button.click(
476
+ clear_inputs,
477
+ outputs=[raw_text_input, preprocess_status, processed_text_output]
478
+ )
 
 
 
 
 
 
479
 
480
+ convert_to_parquet_button.click(
481
+ convert_to_parquet_file,
482
+ inputs=[processed_text_output],
483
+ outputs=[preprocess_status, download_parquet]
484
+ )
485
 
486
+ # ์˜ˆ์ œ ํ…์ŠคํŠธ ์ถ”๊ฐ€
487
+ with gr.Accordion("์˜ˆ์ œ ํ…์ŠคํŠธ", open=False):
488
+ gr.Examples(
489
+ examples=[
490
+ ["์ด์ˆœ์‹ ์€ ์กฐ์„  ์ค‘๊ธฐ์˜ ๋ฌด์‹ ์ด๋‹ค. ๊ทธ๋Š” ์ž„์ง„์™œ๋ž€ ๋‹น์‹œ ํ•ด๊ตฐ์„ ์ด๋Œ์—ˆ๋‹ค. ๊ฑฐ๋ถ์„ ์„ ๋งŒ๋“ค์–ด ์™œ๊ตฐ๊ณผ ์‹ธ์› ๋‹ค."],
491
+ ["์ธ๊ณต์ง€๋Šฅ์€ ์ปดํ“จํ„ฐ ๊ณผํ•™์˜ ํ•œ ๋ถ„์•ผ์ด๋‹ค. ๊ธฐ๊ณ„ํ•™์Šต์€ ์ธ๊ณต์ง€๋Šฅ์˜ ํ•˜์œ„ ๋ถ„์•ผ์ด๋‹ค. ๋”ฅ๋Ÿฌ๋‹์€ ๊ธฐ๊ณ„ํ•™์Šต์˜ ํ•œ ๋ฐฉ๋ฒ•์ด๋‹ค."]
492
+ ],
493
+ inputs=raw_text_input,
494
+ label="์˜ˆ์ œ ์„ ํƒ"
495
+ )
496
 
497
+ gr.Markdown("### [email protected]", elem_id="initial-description")
498
 
499
+ if __name__ == "__main__":
500
+ demo.launch(share=True)