Update app.py
Browse files
app.py
CHANGED
@@ -145,49 +145,56 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
|
|
145 |
return error_message, "", ""
|
146 |
|
147 |
def preprocess_text_with_llm(input_text: str) -> str:
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
161 |
|
162 |
์
๋ ฅ ํ
์คํธ:
|
|
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
:
|
167 |
-
1,"์ค๋์ ๋ ์จ๊ฐ ์ข๋ค.","๋ ์จ","2023-10-05"
|
168 |
-
2,"๋ด์ผ์ ๋น๊ฐ ์ฌ ์์ ์ด๋ค.","๋ ์จ","2023-10-05"
|
169 |
-
|
170 |
-
**์ด์ ์๋์ ์
๋ ฅ ํ
์คํธ๋ฅผ ์ฒ๋ฆฌํ์ธ์:**
|
171 |
-
|
172 |
-
""" + input_text
|
173 |
|
174 |
-
# LLM ํธ์ถ ๋ฐ ์๋ต ์ฒ๋ฆฌ
|
175 |
try:
|
176 |
response = ""
|
177 |
stream = hf_client.text_generation(
|
178 |
-
prompt=
|
179 |
-
max_new_tokens=
|
180 |
-
temperature=0.
|
181 |
top_p=0.9,
|
182 |
stream=True,
|
183 |
)
|
|
|
184 |
for msg in stream:
|
185 |
if msg:
|
186 |
response += msg
|
187 |
-
|
188 |
-
|
189 |
processed_text = response.strip()
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
except Exception as e:
|
192 |
error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}\n{traceback.format_exc()}"
|
193 |
print(error_message)
|
@@ -221,8 +228,6 @@ textarea, input[type="text"] {
|
|
221 |
}
|
222 |
"""
|
223 |
|
224 |
-
|
225 |
-
|
226 |
# Gradio Blocks ์ธํฐํ์ด์ค ์ค์
|
227 |
with gr.Blocks(css=css) as demo:
|
228 |
gr.Markdown("# My RAG: LLM์ด ๋๋ง์ ๋ฐ์ดํฐ๋ก ํ์ตํ ์ฝํ
์ธ ์์ฑ/๋ต๋ณ", elem_id="initial-description")
|
@@ -232,6 +237,8 @@ with gr.Blocks(css=css) as demo:
|
|
232 |
elem_id="initial-description"
|
233 |
)
|
234 |
|
|
|
|
|
235 |
# ์ฒซ ๋ฒ์งธ ํญ: ์ฑ๋ด ๋ฐ์ดํฐ ์
๋ก๋ (ํญ ์ด๋ฆ ๋ณ๊ฒฝ: "My ๋ฐ์ดํฐ์
+LLM")
|
236 |
with gr.Tab("My ๋ฐ์ดํฐ์
+LLM"):
|
237 |
gr.Markdown("### LLM๊ณผ ๋ํํ๊ธฐ")
|
@@ -386,7 +393,7 @@ with gr.Blocks(css=css) as demo:
|
|
386 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
387 |
)
|
388 |
|
389 |
-
# ๋ค ๋ฒ์งธ ํญ: ํ
์คํธ๋ฅผ ๋ฐ์ดํฐ์
ํ์์ผ๋ก ์ ์ฒ๋ฆฌ (
|
390 |
with gr.Tab("Text Preprocessing with LLM"):
|
391 |
gr.Markdown("### ํ
์คํธ๋ฅผ ์
๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์
ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
|
392 |
with gr.Row():
|
@@ -396,37 +403,98 @@ with gr.Blocks(css=css) as demo:
|
|
396 |
lines=15,
|
397 |
placeholder="์ฌ๊ธฐ์ ์ ์ฒ๋ฆฌํ ํ
์คํธ๋ฅผ ์
๋ ฅํ์ธ์..."
|
398 |
)
|
399 |
-
|
400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
processed_text_output = gr.Textbox(
|
402 |
label="์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
์ถ๋ ฅ",
|
403 |
lines=15,
|
404 |
interactive=False
|
405 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
def handle_text_preprocessing(input_text: str):
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
preprocess_button.click(
|
414 |
handle_text_preprocessing,
|
415 |
inputs=[raw_text_input],
|
416 |
-
outputs=[preprocess_status, processed_text_output]
|
|
|
417 |
)
|
418 |
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
|
|
|
|
|
|
|
|
|
|
|
430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
431 |
|
|
|
432 |
|
|
|
|
|
|
145 |
return error_message, "", ""
|
146 |
|
147 |
def preprocess_text_with_llm(input_text: str) -> str:
|
148 |
+
if not input_text.strip():
|
149 |
+
return "์
๋ ฅ ํ
์คํธ๊ฐ ๋น์ด์์ต๋๋ค."
|
150 |
+
|
151 |
+
system_prompt = """๋น์ ์ ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ ๋ฌธ๊ฐ์
๋๋ค. ์
๋ ฅ๋ ํ
์คํธ๋ฅผ CSV ๋ฐ์ดํฐ์
ํ์์ผ๋ก ๋ณํํ์ธ์.
|
152 |
+
|
153 |
+
๊ท์น:
|
154 |
+
1. ์ถ๋ ฅ ํ์: id,text,label,metadata
|
155 |
+
2. id: 1๋ถํฐ ์์ํ๋ ์์ฐจ์ ๋ฒํธ
|
156 |
+
3. text: ์๋ฏธ ์๋ ๋จ์๋ก ๋ถ๋ฆฌ๋ ํ
์คํธ
|
157 |
+
4. label: ํ
์คํธ์ ์ฃผ์ ๋ ์นดํ
๊ณ ๋ฆฌ
|
158 |
+
5. metadata: ์ถ๊ฐ ์ ๋ณด(๋ ์ง, ์ถ์ฒ ๋ฑ)
|
159 |
+
|
160 |
+
์ฃผ์์ฌํญ:
|
161 |
+
- ํ
์คํธ์ ์ผํ๊ฐ ์์ผ๋ฉด ํฐ๋ฐ์ดํ๋ก ๊ฐ์ธ๊ธฐ
|
162 |
+
- ํฐ๋ฐ์ดํ๋ ๋ฐฑ์ฌ๋์๋ก ์ด์ค์ผ์ดํ ์ฒ๋ฆฌ
|
163 |
+
- ๊ฐ ํ์ ์๋ก์ด ์ค๋ก ๊ตฌ๋ถ
|
164 |
+
- ๋ชจ๋ ํ๋๋ ์ผํ๋ก ๊ตฌ๋ถ
|
165 |
|
166 |
์
๋ ฅ ํ
์คํธ:
|
167 |
+
"""
|
168 |
|
169 |
+
full_prompt = f"{system_prompt}\n\n{input_text}\n\n์ถ๋ ฅ:"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
|
|
171 |
try:
|
172 |
response = ""
|
173 |
stream = hf_client.text_generation(
|
174 |
+
prompt=full_prompt,
|
175 |
+
max_new_tokens=4000, # ํ ํฐ ์ ์ฆ๊ฐ
|
176 |
+
temperature=0.3, # ๋ ๊ฒฐ์ ์ ์ธ ์ถ๋ ฅ์ ์ํด ๋ฎ์ถค
|
177 |
top_p=0.9,
|
178 |
stream=True,
|
179 |
)
|
180 |
+
|
181 |
for msg in stream:
|
182 |
if msg:
|
183 |
response += msg
|
184 |
+
|
185 |
+
# ์๋ต ์ ์
|
186 |
processed_text = response.strip()
|
187 |
+
|
188 |
+
# CSV ํ์ ๊ฒ์ฆ
|
189 |
+
try:
|
190 |
+
# StringIO๋ฅผ ์ฌ์ฉํ์ฌ CSV ํ์ ๊ฒ์ฆ
|
191 |
+
from io import StringIO
|
192 |
+
import csv
|
193 |
+
csv.reader(StringIO(processed_text))
|
194 |
+
return processed_text
|
195 |
+
except csv.Error:
|
196 |
+
return "LLM์ด ์ฌ๋ฐ๋ฅธ CSV ํ์์ ์์ฑํ์ง ๋ชปํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์."
|
197 |
+
|
198 |
except Exception as e:
|
199 |
error_message = f"์ ์ฒ๋ฆฌ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}\n{traceback.format_exc()}"
|
200 |
print(error_message)
|
|
|
228 |
}
|
229 |
"""
|
230 |
|
|
|
|
|
231 |
# Gradio Blocks ์ธํฐํ์ด์ค ์ค์
|
232 |
with gr.Blocks(css=css) as demo:
|
233 |
gr.Markdown("# My RAG: LLM์ด ๋๋ง์ ๋ฐ์ดํฐ๋ก ํ์ตํ ์ฝํ
์ธ ์์ฑ/๋ต๋ณ", elem_id="initial-description")
|
|
|
237 |
elem_id="initial-description"
|
238 |
)
|
239 |
|
240 |
+
|
241 |
+
|
242 |
# ์ฒซ ๋ฒ์งธ ํญ: ์ฑ๋ด ๋ฐ์ดํฐ ์
๋ก๋ (ํญ ์ด๋ฆ ๋ณ๊ฒฝ: "My ๋ฐ์ดํฐ์
+LLM")
|
243 |
with gr.Tab("My ๋ฐ์ดํฐ์
+LLM"):
|
244 |
gr.Markdown("### LLM๊ณผ ๋ํํ๊ธฐ")
|
|
|
393 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
394 |
)
|
395 |
|
396 |
+
# ๋ค ๋ฒ์งธ ํญ: ํ
์คํธ๋ฅผ ๋ฐ์ดํฐ์
ํ์์ผ๋ก ์ ์ฒ๋ฆฌ (๊ฐ์ ๋ ๋ฒ์ )
|
397 |
with gr.Tab("Text Preprocessing with LLM"):
|
398 |
gr.Markdown("### ํ
์คํธ๋ฅผ ์
๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์
ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
|
399 |
with gr.Row():
|
|
|
403 |
lines=15,
|
404 |
placeholder="์ฌ๊ธฐ์ ์ ์ฒ๋ฆฌํ ํ
์คํธ๋ฅผ ์
๋ ฅํ์ธ์..."
|
405 |
)
|
406 |
+
|
407 |
+
with gr.Row():
|
408 |
+
preprocess_button = gr.Button("์ ์ฒ๋ฆฌ ์คํ", variant="primary")
|
409 |
+
clear_button = gr.Button("์ด๊ธฐํ")
|
410 |
+
|
411 |
+
preprocess_status = gr.Textbox(
|
412 |
+
label="์ ์ฒ๋ฆฌ ์ํ",
|
413 |
+
interactive=False,
|
414 |
+
value="๋๊ธฐ ์ค..."
|
415 |
+
)
|
416 |
+
|
417 |
processed_text_output = gr.Textbox(
|
418 |
label="์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐ์
์ถ๋ ฅ",
|
419 |
lines=15,
|
420 |
interactive=False
|
421 |
)
|
422 |
+
|
423 |
+
# Parquet ๋ณํ ๋ฐ ๋ค์ด๋ก๋ ์น์
|
424 |
+
with gr.Row():
|
425 |
+
convert_to_parquet_button = gr.Button("Parquet์ผ๋ก ๋ณํ", visible=True)
|
426 |
+
download_parquet = gr.File(
|
427 |
+
label="๋ณํ๋ Parquet ํ์ผ ๋ค์ด๋ก๋",
|
428 |
+
visible=False
|
429 |
+
)
|
430 |
|
431 |
def handle_text_preprocessing(input_text: str):
    """Stream ``(status, processed_text)`` updates while text is preprocessed.

    This is a generator: each yielded tuple is one update for the two
    outputs bound to the preprocess button (status box, result box).

    Args:
        input_text: Raw text to hand to ``preprocess_text_with_llm``.
            ``None`` and whitespace-only input are reported as empty.

    Yields:
        ``(status_message, processed_text)``; ``processed_text`` is ``""``
        until a non-empty result is available, and on any failure.
    """
    # Inside a generator, `return value` only ends iteration (the value is
    # tucked into StopIteration), so the message must be yielded to be shown.
    if not input_text or not input_text.strip():
        yield "입력 텍스트가 없습니다.", ""
        return

    try:
        # First update: tell the user work has started before the slow call.
        yield "전처리를 시작합니다...", ""

        processed_text = preprocess_text_with_llm(input_text)

        if processed_text:
            yield "전처리가 완료되었습니다.", processed_text
        else:
            yield "전처리 결과가 없습니다.", ""
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        yield f"처리 중 오류가 발생했습니다: {str(e)}", ""
|
451 |
+
|
452 |
+
def clear_inputs():
    """Return reset values for the clear button's three outputs.

    Returns:
        ``(raw_text, status, processed_text)``: empty input text, the idle
        status message, and empty output text, in the order the clear
        button's ``outputs`` list expects them.
    """
    # Status literal restored from its mis-encoded form (UTF-8 bytes that had
    # been decoded as a Thai code page); it reads "waiting...".
    return "", "대기 중...", ""
|
454 |
+
|
455 |
+
def convert_to_parquet_file(processed_text: str):
    """Convert preprocessed text to a Parquet file for download.

    Args:
        processed_text: Text taken from the processed-output box. ``None``
            and whitespace-only input are rejected with a status message.

    Returns:
        ``(status_message, parquet_filename_or_None)``. The second item is
        ``None`` whenever there is nothing to download (empty input, an
        empty filename from ``text_to_parquet``, or an exception).
    """
    # Guard against None as well as blank text so `.strip()` cannot raise.
    if not processed_text or not processed_text.strip():
        return "변환할 텍스트가 없습니다.", None

    try:
        # The middle element (parquet content) is not needed here.
        message, _, parquet_filename = text_to_parquet(processed_text)
        # An empty filename means no file was produced; report None instead.
        return message, parquet_filename or None
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"Parquet 변환 중 오류 발생: {str(e)}", None
|
466 |
+
|
467 |
+
# ์ด๋ฒคํธ ํธ๋ค๋ฌ ์ฐ๊ฒฐ
|
468 |
preprocess_button.click(
|
469 |
handle_text_preprocessing,
|
470 |
inputs=[raw_text_input],
|
471 |
+
outputs=[preprocess_status, processed_text_output],
|
472 |
+
queue=True
|
473 |
)
|
474 |
|
475 |
+
clear_button.click(
|
476 |
+
clear_inputs,
|
477 |
+
outputs=[raw_text_input, preprocess_status, processed_text_output]
|
478 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
479 |
|
480 |
+
convert_to_parquet_button.click(
|
481 |
+
convert_to_parquet_file,
|
482 |
+
inputs=[processed_text_output],
|
483 |
+
outputs=[preprocess_status, download_parquet]
|
484 |
+
)
|
485 |
|
486 |
+
# ์์ ํ
์คํธ ์ถ๊ฐ
|
487 |
+
with gr.Accordion("์์ ํ
์คํธ", open=False):
|
488 |
+
gr.Examples(
|
489 |
+
examples=[
|
490 |
+
["์ด์์ ์ ์กฐ์ ์ค๊ธฐ์ ๋ฌด์ ์ด๋ค. ๊ทธ๋ ์์ง์๋ ๋น์ ํด๊ตฐ์ ์ด๋์๋ค. ๊ฑฐ๋ถ์ ์ ๋ง๋ค์ด ์๊ตฐ๊ณผ ์ธ์ ๋ค."],
|
491 |
+
["์ธ๊ณต์ง๋ฅ์ ์ปดํจํฐ ๊ณผํ์ ํ ๋ถ์ผ์ด๋ค. ๊ธฐ๊ณํ์ต์ ์ธ๊ณต์ง๋ฅ์ ํ์ ๋ถ์ผ์ด๋ค. ๋ฅ๋ฌ๋์ ๊ธฐ๊ณํ์ต์ ํ ๋ฐฉ๋ฒ์ด๋ค."]
|
492 |
+
],
|
493 |
+
inputs=raw_text_input,
|
494 |
+
label="์์ ์ ํ"
|
495 |
+
)
|
496 |
|
497 |
+
gr.Markdown("### [email protected]", elem_id="initial-description")
|
498 |
|
499 |
+
if __name__ == "__main__":
|
500 |
+
demo.launch(share=True)
|