ginipick committed on
Commit
79997b0
•
1 Parent(s): dc6bd76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -6
app.py CHANGED
@@ -5,8 +5,8 @@ import pandas as pd
5
  from typing import List, Dict, Tuple
6
  import json
7
  import io
8
-
9
  import traceback
 
10
  # 추론 API 클라이언트 설정
11
  hf_client = InferenceClient(
12
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
@@ -30,7 +30,6 @@ def load_parquet(filename: str) -> str:
30
  except Exception as e:
31
  return f"ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
32
 
33
-
34
  def respond(
35
  message: str,
36
  history: List[Dict[str, str]],
@@ -82,7 +81,6 @@ def respond(
82
  print(error_message)
83
  yield error_message
84
 
85
-
86
  def upload_csv(file_path: str) -> Tuple[str, str]:
87
  try:
88
  # CSV 파일 읽기
@@ -120,7 +118,7 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
120
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
121
  try:
122
  # ํ…์ŠคํŠธ๋ฅผ DataFrame์œผ๋กœ ๋ณ€ํ™˜ (๊ฐ ํ–‰์€ ์ฝค๋งˆ๋กœ ๊ตฌ๋ถ„)
123
- data = [line.split(',') for line in text.strip().split('\n')]
124
  df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
125
  # 데이터 유형 최적화
126
  df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
@@ -133,6 +131,37 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
133
  except Exception as e:
134
  return f"ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}", "", ""
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  # CSS 설정
137
  css = """
138
  footer {
@@ -161,9 +190,8 @@ textarea, input[type="text"] {
161
  }
162
  """
163
 
164
-
165
  # Gradio Blocks 인터페이스 설정
166
- with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
167
  gr.Markdown("# My RAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
168
  gr.Markdown(
169
  "### 1) ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ž…๋ ฅ ๋˜๋Š” CSV ์—…๋กœ๋“œ๋กœ Parquet ๋ฐ์ดํ„ฐ์…‹ ์ž๋™ ๋ณ€ํ™˜ 2) Parquet ๋ฐ์ดํ„ฐ์…‹์„ ์—…๋กœ๋“œํ•˜๋ฉด, LLM์ด ๋งž์ถค ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ ์‘๋‹ต\n"
@@ -325,8 +353,39 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
325
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
326
  )
327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  gr.Markdown("### [email protected]", elem_id="initial-description")
329
 
330
  if __name__ == "__main__":
331
  demo.launch()
332
 
 
 
5
  from typing import List, Dict, Tuple
6
  import json
7
  import io
 
8
  import traceback
9
+
10
  # 추론 API 클라이언트 설정
11
  hf_client = InferenceClient(
12
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
 
30
  except Exception as e:
31
  return f"ํŒŒ์ผ์„ ์ฝ๋Š” ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
32
 
 
33
  def respond(
34
  message: str,
35
  history: List[Dict[str, str]],
 
81
  print(error_message)
82
  yield error_message
83
 
 
84
  def upload_csv(file_path: str) -> Tuple[str, str]:
85
  try:
86
  # CSV 파일 읽기
 
118
  def text_to_parquet(text: str) -> Tuple[str, str, str]:
119
  try:
120
  # ํ…์ŠคํŠธ๋ฅผ DataFrame์œผ๋กœ ๋ณ€ํ™˜ (๊ฐ ํ–‰์€ ์ฝค๋งˆ๋กœ ๊ตฌ๋ถ„)
121
+ data = [line.strip().split(',') for line in text.strip().split('\n')]
122
  df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
123
  # 데이터 유형 최적화
124
  df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
 
131
  except Exception as e:
132
  return f"ํ…์ŠคํŠธ ๋ณ€ํ™˜ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}", "", ""
133
 
134
+ def preprocess_text_with_llm(input_text: str) -> str:
135
+ # LLM์—๊ฒŒ ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜๋„๋ก ์š”์ฒญ
136
+ system_prompt = """๋‹น์‹ ์€ ์ž…๋ ฅ๋œ ๊ธด ํ…์ŠคํŠธ๋ฅผ ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜๋Š” ์—ญํ• ์„ ํ•ฉ๋‹ˆ๋‹ค.
137
+ - ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์€ id,text,label,metadata์ž…๋‹ˆ๋‹ค.
138
+ - ๊ฐ ํ–‰์€ ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„๋˜๋ฉฐ, ํ…์ŠคํŠธ ๋‚ด์— ์‰ผํ‘œ๊ฐ€ ์žˆ์„ ๊ฒฝ์šฐ ์ œ๊ฑฐํ•˜๊ฑฐ๋‚˜ ๋‹ค๋ฅธ ๋ฌธ์ž๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค.
139
+ - ํ…์ŠคํŠธ๋ฅผ ์˜๋ฏธ ๋‹จ์œ„๋กœ ๋ถ„ํ• ํ•˜๊ณ , ์ ์ ˆํžˆ ๋ฌธ์žฅ์„ ์žฌ๊ตฌ์„ฑํ•˜๊ณ  ํŽธ์ง‘ํ•˜์—ฌ ์ตœ์ ํ™”๋œ ๋ฌธ์žฅ์œผ๋กœ ๋งŒ๋“ญ๋‹ˆ๋‹ค.
140
+ - ๊ฐ ๋ฌธ์žฅ์— ๋Œ€ํ•ด id๋ฅผ ๋ถ€์—ฌํ•˜๊ณ , ์ ์ ˆํ•œ label(์นดํ…Œ๊ณ ๋ฆฌ)์„ ์ง€์ •ํ•ฉ๋‹ˆ๋‹ค.
141
+ - metadata์—๋Š” ์ถœ์ฒ˜๋‚˜ ๋‚ ์งœ ๋“ฑ์˜ ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ํฌํ•จํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
142
+ - ์ตœ์ข… ๊ฒฐ๊ณผ๋Š” ๊ฐ ํ–‰์ด 'id,text,label,metadata' ํ˜•์‹์˜ CSV ํ˜•ํƒœ๊ฐ€ ๋˜๋„๋ก ํ•ฉ๋‹ˆ๋‹ค.
143
+ """
144
+ prompt = system_prompt + "\n\n์ž…๋ ฅ ํ…์ŠคํŠธ:\n" + input_text + "\n\n์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹:"
145
+ try:
146
+ response = ""
147
+ stream = hf_client.text_generation(
148
+ prompt=prompt,
149
+ max_new_tokens=2000,
150
+ temperature=0.5,
151
+ top_p=0.9,
152
+ stream=True,
153
+ )
154
+ for msg in stream:
155
+ if msg:
156
+ response += msg
157
+ # ์‘๋‹ต์—์„œ ์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ๋ถ€๋ถ„๋งŒ ์ถ”์ถœ
158
+ processed_text = response.strip()
159
+ return processed_text
160
+ except Exception as e:
161
+ error_message = f"์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}\n{traceback.format_exc()}"
162
+ print(error_message)
163
+ return error_message
164
+
165
  # CSS 설정
166
  css = """
167
  footer {
 
190
  }
191
  """
192
 
 
193
  # Gradio Blocks 인터페이스 설정
194
+ with gr.Blocks(css=css) as demo:
195
  gr.Markdown("# My RAG: LLM์ด ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋กœ ํ•™์Šตํ•œ ์ฝ˜ํ…์ธ  ์ƒ์„ฑ/๋‹ต๋ณ€", elem_id="initial-description")
196
  gr.Markdown(
197
  "### 1) ๋‚˜๋งŒ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ž…๋ ฅ ๋˜๋Š” CSV ์—…๋กœ๋“œ๋กœ Parquet ๋ฐ์ดํ„ฐ์…‹ ์ž๋™ ๋ณ€ํ™˜ 2) Parquet ๋ฐ์ดํ„ฐ์…‹์„ ์—…๋กœ๋“œํ•˜๋ฉด, LLM์ด ๋งž์ถค ํ•™์Šต ๋ฐ์ดํ„ฐ๋กœ ํ™œ์šฉํ•˜์—ฌ ์‘๋‹ต\n"
 
353
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
354
  )
355
 
356
+ # 네 번째 탭: 텍스트를 데이터셋 형식으로 전처리 (탭 이름: "Text Preprocessing with LLM")
357
+ with gr.Tab("Text Preprocessing with LLM"):
358
+ gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
359
+ with gr.Row():
360
+ with gr.Column():
361
+ raw_text_input = gr.Textbox(
362
+ label="ํ…์ŠคํŠธ ์ž…๋ ฅ",
363
+ lines=15,
364
+ placeholder="์—ฌ๊ธฐ์— ์ „์ฒ˜๋ฆฌํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”..."
365
+ )
366
+ preprocess_button = gr.Button("์ „์ฒ˜๋ฆฌ ์‹คํ–‰")
367
+ preprocess_status = gr.Textbox(label="์ „์ฒ˜๋ฆฌ ์ƒํƒœ", interactive=False)
368
+ processed_text_output = gr.Textbox(
369
+ label="์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ์…‹ ์ถœ๋ ฅ",
370
+ lines=15,
371
+ interactive=False
372
+ )
373
+
374
def handle_text_preprocessing(input_text: str):
    """Run LLM preprocessing for the button click and report the outcome.

    Args:
        input_text: Raw text taken from the input textbox.

    Returns:
        A ``(status_message, processed_text)`` tuple, matching the two output
        components wired to the click event.
    """
    # The original assigned to `preprocess_status.value` before and after the
    # call. Mutating a component inside a handler does not refresh the UI (the
    # interim "in progress" text was never rendered) and it touches state
    # shared by every session, so the status is delivered purely through the
    # return value instead.
    # NOTE(review): to actually display a progress message, turn this into a
    # generator that yields the interim status first — confirm queue settings.
    # NOTE(review): the Korean status literal was reconstructed from
    # mis-encoded text; confirm against the original app.py.
    processed_text = preprocess_text_with_llm(input_text)
    return "전처리가 완료되었습니다.", processed_text
379
+
380
+ preprocess_button.click(
381
+ handle_text_preprocessing,
382
+ inputs=raw_text_input,
383
+ outputs=[preprocess_status, processed_text_output]
384
+ )
385
+
386
  gr.Markdown("### [email protected]", elem_id="initial-description")
387
 
388
  if __name__ == "__main__":
389
  demo.launch()
390
 
391
+