Update app.py
Browse files
app.py
CHANGED
@@ -5,8 +5,8 @@ import pandas as pd
|
|
5 |
from typing import List, Dict, Tuple
|
6 |
import json
|
7 |
import io
|
8 |
-
|
9 |
import traceback
|
|
|
10 |
# 추론 API 클라이언트 설정
|
11 |
hf_client = InferenceClient(
|
12 |
"CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
|
@@ -30,7 +30,6 @@ def load_parquet(filename: str) -> str:
|
|
30 |
except Exception as e:
|
31 |
return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
32 |
|
33 |
-
|
34 |
def respond(
|
35 |
message: str,
|
36 |
history: List[Dict[str, str]],
|
@@ -82,7 +81,6 @@ def respond(
|
|
82 |
print(error_message)
|
83 |
yield error_message
|
84 |
|
85 |
-
|
86 |
def upload_csv(file_path: str) -> Tuple[str, str]:
|
87 |
try:
|
88 |
# CSV ํ์ผ ์ฝ๊ธฐ
|
@@ -120,7 +118,7 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
|
|
120 |
def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
121 |
try:
|
122 |
# ํ
์คํธ๋ฅผ DataFrame์ผ๋ก ๋ณํ (๊ฐ ํ์ ์ฝค๋ง๋ก ๊ตฌ๋ถ)
|
123 |
-
data = [line.split(',') for line in text.strip().split('\n')]
|
124 |
df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
|
125 |
# ๋ฐ์ดํฐ ์ ํ ์ต์ ํ
|
126 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
|
@@ -133,6 +131,37 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
|
133 |
except Exception as e:
|
134 |
return f"ํ
์คํธ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}", "", ""
|
135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
# CSS 설정
|
137 |
css = """
|
138 |
footer {
|
@@ -161,9 +190,8 @@ textarea, input[type="text"] {
|
|
161 |
}
|
162 |
"""
|
163 |
|
164 |
-
|
165 |
# Gradio Blocks 인터페이스 설정
|
166 |
-
with gr.Blocks(
|
167 |
gr.Markdown("# My RAG: LLM์ด ๋๋ง์ ๋ฐ์ดํฐ๋ก ํ์ตํ ์ฝํ
์ธ ์์ฑ/๋ต๋ณ", elem_id="initial-description")
|
168 |
gr.Markdown(
|
169 |
"### 1) ๋๋ง์ ๋ฐ์ดํฐ๋ฅผ ์
๋ ฅ ๋๋ CSV ์
๋ก๋๋ก Parquet ๋ฐ์ดํฐ์
์๋ ๋ณํ 2) Parquet ๋ฐ์ดํฐ์
์ ์
๋ก๋ํ๋ฉด, LLM์ด ๋ง์ถค ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ ์๋ต\n"
|
@@ -325,8 +353,39 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
|
|
325 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
326 |
)
|
327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
gr.Markdown("### [email protected]", elem_id="initial-description")
|
329 |
|
330 |
if __name__ == "__main__":
|
331 |
demo.launch()
|
332 |
|
|
|
|
5 |
from typing import List, Dict, Tuple
|
6 |
import json
|
7 |
import io
|
|
|
8 |
import traceback
|
9 |
+
|
10 |
# 추론 API 클라이언트 설정
|
11 |
hf_client = InferenceClient(
|
12 |
"CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
|
|
|
30 |
except Exception as e:
|
31 |
return f"ํ์ผ์ ์ฝ๋ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
32 |
|
|
|
33 |
def respond(
|
34 |
message: str,
|
35 |
history: List[Dict[str, str]],
|
|
|
81 |
print(error_message)
|
82 |
yield error_message
|
83 |
|
|
|
84 |
def upload_csv(file_path: str) -> Tuple[str, str]:
|
85 |
try:
|
86 |
# CSV ํ์ผ ์ฝ๊ธฐ
|
|
|
118 |
def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
119 |
try:
|
120 |
# ํ
์คํธ๋ฅผ DataFrame์ผ๋ก ๋ณํ (๊ฐ ํ์ ์ฝค๋ง๋ก ๊ตฌ๋ถ)
|
121 |
+
data = [line.strip().split(',') for line in text.strip().split('\n')]
|
122 |
df = pd.DataFrame(data, columns=['id', 'text', 'label', 'metadata'])
|
123 |
# ๋ฐ์ดํฐ ์ ํ ์ต์ ํ
|
124 |
df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
|
|
|
131 |
except Exception as e:
|
132 |
return f"ํ
์คํธ ๋ณํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}", "", ""
|
133 |
|
134 |
+
def preprocess_text_with_llm(input_text: str) -> str:
# Sends `input_text` to the module-level `hf_client` together with instructions
# asking for rows in the `id,text,label,metadata` CSV layout, and returns the
# model's streamed reply with surrounding whitespace stripped.
# On any exception nothing is raised: the error text plus traceback is printed
# and returned as the result string.
|
135 |
+
# (English: ask the LLM to preprocess the input text)
# LLM์๊ฒ ์
๋ ฅ ํ
์คํธ๋ฅผ ์ ์ฒ๋ฆฌํ๋๋ก ์์ฒญ
|
136 |
+
# Instruction block placed ahead of the user's text; it names the target
# columns id,text,label,metadata and asks for CSV-shaped rows.
system_prompt = """๋น์ ์ ์
๋ ฅ๋ ๊ธด ํ
์คํธ๋ฅผ ๋ฐ์ดํฐ์
 ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ๋ ์ญํ ์ ํฉ๋๋ค.
|
137 |
+
- ๋ฐ์ดํฐ์
 ํ์์ id,text,label,metadata์
๋๋ค.
|
138 |
+
- ๊ฐ ํ์ ์ผํ๋ก ๊ตฌ๋ถ๋๋ฉฐ, ํ
์คํธ ๋ด์ ์ผํ๊ฐ ์์ ๊ฒฝ์ฐ ์ ๊ฑฐํ๊ฑฐ๋ ๋ค๋ฅธ ๋ฌธ์๋ก ๋์ฒดํฉ๋๋ค.
|
139 |
+
- ํ
์คํธ๋ฅผ ์๋ฏธ ๋จ์๋ก ๋ถํ ํ๊ณ , ์ ์ ํ ๋ฌธ์ฅ์ ์ฌ๊ตฌ์ฑํ๊ณ ํธ์งํ์ฌ ์ต์ ํ๋ ๋ฌธ์ฅ์ผ๋ก ๋ง๋ญ๋๋ค.
|
140 |
+
- ๊ฐ ๋ฌธ์ฅ์ ๋ํด id๋ฅผ ๋ถ์ฌํ๊ณ , ์ ์ ํ label(์นดํ
๊ณ ๋ฆฌ)์ ์ง์ ํฉ๋๋ค.
|
141 |
+
- metadata์๋ ์ถ์ฒ๋ ๋ ์ง ๋ฑ์ ์ถ๊ฐ ์ ๋ณด๋ฅผ ํฌํจํ ์ ์์ต๋๋ค.
|
142 |
+
- ์ต์ข
 ๊ฒฐ๊ณผ๋ ๊ฐ ํ์ด 'id,text,label,metadata' ํ์์ CSV ํํ๊ฐ ๋๋๋ก ํฉ๋๋ค.
|
143 |
+
"""
|
144 |
+
# Final prompt = instructions + the raw input + a trailing cue line for the
# model to continue from.
prompt = system_prompt + "\n\n์
๋ ฅ ํ
์คํธ:\n" + input_text + "\n\n์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
:"
|
145 |
+
try:
|
146 |
+
response = ""
|
147 |
+
# Streamed generation: up to 2000 new tokens, temperature 0.5, top_p 0.9.
stream = hf_client.text_generation(
|
148 |
+
prompt=prompt,
|
149 |
+
max_new_tokens=2000,
|
150 |
+
temperature=0.5,
|
151 |
+
top_p=0.9,
|
152 |
+
stream=True,
|
153 |
+
)
|
154 |
+
# Append each non-empty streamed piece to the accumulated response.
for msg in stream:
|
155 |
+
if msg:
|
156 |
+
response += msg
|
157 |
+
# (English: extract only the preprocessed dataset part of the response)
# ์๋ต์์ ์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
 ๋ถ๋ถ๋ง ์ถ์ถ
|
158 |
+
processed_text = response.strip()
|
159 |
+
return processed_text
|
160 |
+
# Failures are not re-raised: the message (exception text + traceback) is
# printed and handed back to the caller as the return value.
except Exception as e:
|
161 |
+
error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}\n{traceback.format_exc()}"
|
162 |
+
print(error_message)
|
163 |
+
return error_message
|
164 |
+
|
165 |
# CSS 설정
|
166 |
css = """
|
167 |
footer {
|
|
|
190 |
}
|
191 |
"""
|
192 |
|
|
|
193 |
# Gradio Blocks 인터페이스 설정
|
194 |
+
with gr.Blocks(css=css) as demo:
|
195 |
gr.Markdown("# My RAG: LLM์ด ๋๋ง์ ๋ฐ์ดํฐ๋ก ํ์ตํ ์ฝํ
์ธ ์์ฑ/๋ต๋ณ", elem_id="initial-description")
|
196 |
gr.Markdown(
|
197 |
"### 1) ๋๋ง์ ๋ฐ์ดํฐ๋ฅผ ์
๋ ฅ ๋๋ CSV ์
๋ก๋๋ก Parquet ๋ฐ์ดํฐ์
์๋ ๋ณํ 2) Parquet ๋ฐ์ดํฐ์
์ ์
๋ก๋ํ๋ฉด, LLM์ด ๋ง์ถค ํ์ต ๋ฐ์ดํฐ๋ก ํ์ฉํ์ฌ ์๋ต\n"
|
|
|
353 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
354 |
)
|
355 |
|
356 |
+
# 네 번째 탭: 텍스트를 데이터셋 형식으로 전처리 (탭 이름: "Text Preprocessing with LLM")
|
357 |
+
with gr.Tab("Text Preprocessing with LLM"):
|
358 |
+
gr.Markdown("### ํ
์คํธ๋ฅผ ์
๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์
ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
|
359 |
+
with gr.Row():
|
360 |
+
with gr.Column():
|
361 |
+
raw_text_input = gr.Textbox(
|
362 |
+
label="ํ
์คํธ ์
๋ ฅ",
|
363 |
+
lines=15,
|
364 |
+
placeholder="์ฌ๊ธฐ์ ์ ์ฒ๋ฆฌํ ํ
์คํธ๋ฅผ ์
๋ ฅํ์ธ์..."
|
365 |
+
)
|
366 |
+
preprocess_button = gr.Button("์ ์ฒ๋ฆฌ ์คํ")
|
367 |
+
preprocess_status = gr.Textbox(label="์ ์ฒ๋ฆฌ ์ํ", interactive=False)
|
368 |
+
processed_text_output = gr.Textbox(
|
369 |
+
label="์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
์ถ๋ ฅ",
|
370 |
+
lines=15,
|
371 |
+
interactive=False
|
372 |
+
)
|
373 |
+
|
374 |
+
def handle_text_preprocessing(input_text: str):
    """Run LLM preprocessing for the "Text Preprocessing with LLM" tab.

    Args:
        input_text: Raw text taken from the ``raw_text_input`` textbox.

    Returns:
        A ``(status, processed_text)`` tuple, in the same order as the click
        handler's ``outputs=[preprocess_status, processed_text_output]``.
    """
    # The status lives in a local instead of being written to
    # ``preprocess_status.value``: that component object is shared by the
    # whole app, and only the values returned from the handler are delivered
    # to the output components, so an interim assignment was never displayed.
    processed_text = preprocess_text_with_llm(input_text)
    # NOTE(review): preprocess_text_with_llm returns its error message as an
    # ordinary string, so a failed run is still reported as completed here.
    status = "์ ์ฒ๋ฆฌ๊ฐ ์๋ฃ๋์์ต๋๋ค."
    return status, processed_text
|
379 |
+
|
380 |
+
preprocess_button.click(
|
381 |
+
handle_text_preprocessing,
|
382 |
+
inputs=raw_text_input,
|
383 |
+
outputs=[preprocess_status, processed_text_output]
|
384 |
+
)
|
385 |
+
|
386 |
gr.Markdown("### [email protected]", elem_id="initial-description")
|
387 |
|
388 |
if __name__ == "__main__":
|
389 |
demo.launch()
|
390 |
|
391 |
+
|