Update app.py
Browse files
app.py
CHANGED
@@ -187,34 +187,36 @@ def preprocess_text_with_llm(input_text: str) -> str:
|
|
187 |
1. 출력 형식: id,text,label,metadata
|
188 |
2. id: 1부터 시작하는 순차적 번호
|
189 |
3. text: 의미 있는 단위로 분리된 텍스트
|
190 |
-
4. label: 텍스트의 주제나 카테고리를
|
191 |
-
-
|
192 |
-
-
|
193 |
-
-
|
194 |
-
- Health (건강)
|
195 |
-
- Entertainment (엔터테인먼트)
|
196 |
-
- Business (비즈니스)
|
197 |
-
- Safety (안전)
|
198 |
-
- Culture (문화)
|
199 |
- Politics (정치)
|
200 |
-
-
|
201 |
5. metadata: 날짜, 출처 등 추가 정보
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
주의사항:
|
204 |
- text์ ์ผํ๊ฐ ์์ผ๋ฉด ํฐ๋ฐ์ดํ๋ก ๊ฐ์ธ๊ธฐ
|
205 |
- 큰따옴표는 백슬래시로 이스케이프 처리
|
206 |
- 각 행은 새로운 줄로 구분
|
207 |
-
-
|
208 |
-
- text์ label์ ์๋ก ๋ค๋ฅธ ๋ด์ฉ์ด์ด์ผ ํจ"""
|
209 |
|
210 |
-
full_prompt = f"{system_prompt}\n\n{input_text}\n\n출력:"
|
211 |
|
212 |
try:
|
213 |
response = ""
|
214 |
stream = hf_client.text_generation(
|
215 |
prompt=full_prompt,
|
216 |
max_new_tokens=4000,
|
217 |
-
temperature=0.
|
218 |
top_p=0.9,
|
219 |
stream=True,
|
220 |
)
|
@@ -223,11 +225,24 @@ def preprocess_text_with_llm(input_text: str) -> str:
|
|
223 |
if msg:
|
224 |
response += msg
|
225 |
|
226 |
-
# <EOS_TOKEN> 이전까지만
|
227 |
if "<EOS_TOKEN>" in response:
|
228 |
processed_text = response.split("<EOS_TOKEN>")[0].strip()
|
229 |
else:
|
230 |
processed_text = response.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
# CSV 형식 검증
|
233 |
try:
|
@@ -436,7 +451,7 @@ with gr.Blocks(css=css) as demo:
|
|
436 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
437 |
)
|
438 |
|
439 |
-
#
|
440 |
with gr.Tab("Text Preprocessing with LLM"):
|
441 |
gr.Markdown("### ํ
์คํธ๋ฅผ ์
๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์
ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
|
442 |
with gr.Row():
|
@@ -450,7 +465,7 @@ with gr.Blocks(css=css) as demo:
|
|
450 |
with gr.Row():
|
451 |
preprocess_button = gr.Button("전처리 실행", variant="primary")
|
452 |
clear_button = gr.Button("초기화")
|
453 |
-
|
454 |
preprocess_status = gr.Textbox(
|
455 |
label="전처리 상태",
|
456 |
interactive=False,
|
@@ -464,12 +479,11 @@ with gr.Blocks(css=css) as demo:
|
|
464 |
)
|
465 |
|
466 |
# Parquet 변환 및 다운로드 섹션
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
)
|
473 |
|
474 |
def handle_text_preprocessing(input_text: str):
|
475 |
if not input_text.strip():
|
|
|
187 |
1. 출력 형식: id,text,label,metadata
|
188 |
2. id: 1부터 시작하는 순차적 번호
|
189 |
3. text: 의미 있는 단위로 분리된 텍스트
|
190 |
+
4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
|
191 |
+
- Historical_Figure (역사적 인물)
|
192 |
+
- Military_History (군사 역사)
|
193 |
+
- Technology (기술)
|
|
|
|
|
|
|
|
|
|
|
194 |
- Politics (정치)
|
195 |
+
- Culture (문화)
|
196 |
5. metadata: 날짜, 출처 등 추가 정보
|
197 |
|
198 |
+
중요:
|
199 |
+
- 동일한 텍스트를 반복해서 출력하지 말 것
|
200 |
+
- ๊ฐ ํ
์คํธ๋ ํ ๋ฒ๋ง ์ฒ๋ฆฌํ์ฌ ๊ฐ์ฅ ์ ํฉํ label์ ์ ํํ ๊ฒ
|
201 |
+
- 입력 텍스트를 의미 단위로 적절히 분리할 것
|
202 |
+
|
203 |
+
예시:
|
204 |
+
1,"이순신은 조선 중기의 무신이다.","Historical_Figure","조선시대, 위키백과"
|
205 |
+
|
206 |
주의사항:
|
207 |
- text์ ์ผํ๊ฐ ์์ผ๋ฉด ํฐ๋ฐ์ดํ๋ก ๊ฐ์ธ๊ธฐ
|
208 |
- 큰따옴표는 백슬래시로 이스케이프 처리
|
209 |
- 각 행은 새로운 줄로 구분
|
210 |
+
- 불필요한 반복 출력 금지"""
|
|
|
211 |
|
212 |
+
full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
|
213 |
|
214 |
try:
|
215 |
response = ""
|
216 |
stream = hf_client.text_generation(
|
217 |
prompt=full_prompt,
|
218 |
max_new_tokens=4000,
|
219 |
+
temperature=0.1, # 더 결정적인 출력을 위해 낮춤
|
220 |
top_p=0.9,
|
221 |
stream=True,
|
222 |
)
|
|
|
225 |
if msg:
|
226 |
response += msg
|
227 |
|
228 |
+
# <EOS_TOKEN> 이전까지만 추출하고 정제
|
229 |
if "<EOS_TOKEN>" in response:
|
230 |
processed_text = response.split("<EOS_TOKEN>")[0].strip()
|
231 |
else:
|
232 |
processed_text = response.strip()
|
233 |
+
|
234 |
+
# 중복 출력 제거
|
235 |
+
lines = processed_text.split('\n')
|
236 |
+
unique_lines = []
|
237 |
+
seen_texts = set()
|
238 |
+
|
239 |
+
for line in lines:
|
240 |
+
line = line.strip()
|
241 |
+
if line and '출력:' not in line and line not in seen_texts:
|
242 |
+
unique_lines.append(line)
|
243 |
+
seen_texts.add(line)
|
244 |
+
|
245 |
+
processed_text = '\n'.join(unique_lines)
|
246 |
|
247 |
# CSV 형식 검증
|
248 |
try:
|
|
|
451 |
outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
|
452 |
)
|
453 |
|
454 |
+
# 네번째 탭의 UI 부분 수정
|
455 |
with gr.Tab("Text Preprocessing with LLM"):
|
456 |
gr.Markdown("### ํ
์คํธ๋ฅผ ์
๋ ฅํ๋ฉด LLM์ด ๋ฐ์ดํฐ์
ํ์์ ๋ง๊ฒ ์ ์ฒ๋ฆฌํ์ฌ ์ถ๋ ฅํฉ๋๋ค.")
|
457 |
with gr.Row():
|
|
|
465 |
with gr.Row():
|
466 |
preprocess_button = gr.Button("전처리 실행", variant="primary")
|
467 |
clear_button = gr.Button("초기화")
|
468 |
+
|
469 |
preprocess_status = gr.Textbox(
|
470 |
label="전처리 상태",
|
471 |
interactive=False,
|
|
|
479 |
)
|
480 |
|
481 |
# Parquet 변환 및 다운로드 섹션
|
482 |
+
convert_to_parquet_button = gr.Button("Parquet์ผ๋ก ๋ณํ")
|
483 |
+
download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
|
484 |
+
|
485 |
+
|
486 |
+
|
|
|
487 |
|
488 |
def handle_text_preprocessing(input_text: str):
|
489 |
if not input_text.strip():
|