ginipick committed on
Commit
56d3d16
•
1 Parent(s): 2ff5289

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -23
app.py CHANGED
@@ -187,34 +187,36 @@ def preprocess_text_with_llm(input_text: str) -> str:
187
  1. 출력 형식: id,text,label,metadata
188
  2. id: 1부터 시작하는 순차적 번호
189
  3. text: 의미 있는 단위로 분리된 텍스트
190
- 4. label: 텍스트의 주제나 카테고리를 다음 중에서 선택
191
- - AI_Technology (AI 기술 관련)
192
- - Social_Issue (사회 문제)
193
- - Education (교육)
194
- - Health (건강)
195
- - Entertainment (엔터테인먼트)
196
- - Business (비즈니스)
197
- - Safety (안전)
198
- - Culture (문화)
199
  - Politics (정치)
200
- - Environment (환경)
201
  5. metadata: 날짜, 출처 등 추가 정보
202
 
 
 
 
 
 
 
 
 
203
  ์ฃผ์˜์‚ฌํ•ญ:
204
  - text์— ์‰ผํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํฐ๋”ฐ์˜ดํ‘œ๋กœ ๊ฐ์‹ธ๊ธฐ
205
  - ํฐ๋”ฐ์˜ดํ‘œ๋Š” ๋ฐฑ์Šฌ๋ž˜์‹œ๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
206
  - ๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„
207
- - label์€ ๋ฐ˜๋“œ์‹œ ์œ„ ์นดํ…Œ๊ณ ๋ฆฌ ์ค‘ ํ•˜๋‚˜๋ฅผ ์„ ํƒ
208
- - text์™€ label์€ ์„œ๋กœ ๋‹ค๋ฅธ ๋‚ด์šฉ์ด์–ด์•ผ ํ•จ"""
209
 
210
- full_prompt = f"{system_prompt}\n\n{input_text}\n\n출력:"
211
 
212
  try:
213
  response = ""
214
  stream = hf_client.text_generation(
215
  prompt=full_prompt,
216
  max_new_tokens=4000,
217
- temperature=0.3,
218
  top_p=0.9,
219
  stream=True,
220
  )
@@ -223,11 +225,24 @@ def preprocess_text_with_llm(input_text: str) -> str:
223
  if msg:
224
  response += msg
225
 
226
- # <EOS_TOKEN> 이전까지만 추출
227
  if "<EOS_TOKEN>" in response:
228
  processed_text = response.split("<EOS_TOKEN>")[0].strip()
229
  else:
230
  processed_text = response.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  # CSV 형식 검증
233
  try:
@@ -436,7 +451,7 @@ with gr.Blocks(css=css) as demo:
436
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
437
  )
438
 
439
- # ๋„ค ๋ฒˆ์งธ ํƒญ: ํ…์ŠคํŠธ๋ฅผ ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์œผ๋กœ ์ „์ฒ˜๋ฆฌ (๊ฐœ์„ ๋œ ๋ฒ„์ „)
440
  with gr.Tab("Text Preprocessing with LLM"):
441
  gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
442
  with gr.Row():
@@ -450,7 +465,7 @@ with gr.Blocks(css=css) as demo:
450
  with gr.Row():
451
  preprocess_button = gr.Button("전처리 실행", variant="primary")
452
  clear_button = gr.Button("초기화")
453
-
454
  preprocess_status = gr.Textbox(
455
  label="전처리 상태",
456
  interactive=False,
@@ -464,12 +479,11 @@ with gr.Blocks(css=css) as demo:
464
  )
465
 
466
  # Parquet ๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ ์„น์…˜
467
- with gr.Row():
468
- convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜", visible=True)
469
- download_parquet = gr.File(
470
- label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ",
471
- visible=False
472
- )
473
 
474
  def handle_text_preprocessing(input_text: str):
475
  if not input_text.strip():
 
187
  1. 출력 형식: id,text,label,metadata
188
  2. id: 1부터 시작하는 순차적 번호
189
  3. text: 의미 있는 단위로 분리된 텍스트
190
+ 4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
191
+ - Historical_Figure (역사적 인물)
192
+ - Military_History (군사 역사)
193
+ - Technology (기술)
 
 
 
 
 
194
  - Politics (정치)
195
+ - Culture (문화)
196
  5. metadata: 날짜, 출처 등 추가 정보
197
 
198
+ ์ค‘์š”:
199
+ - ๋™์ผํ•œ ํ…์ŠคํŠธ๋ฅผ ๋ฐ˜๋ณตํ•ด์„œ ์ถœ๋ ฅํ•˜์ง€ ๋ง ๊ฒƒ
200
+ - ๊ฐ ํ…์ŠคํŠธ๋Š” ํ•œ ๋ฒˆ๋งŒ ์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฐ€์žฅ ์ ํ•ฉํ•œ label์„ ์„ ํƒํ•  ๊ฒƒ
201
+ - ์ž…๋ ฅ ํ…์ŠคํŠธ๋ฅผ ์˜๋ฏธ ๋‹จ์œ„๋กœ ์ ์ ˆํžˆ ๋ถ„๋ฆฌํ•  ๊ฒƒ
202
+
203
+ 예시:
204
+ 1,"이순신은 조선 중기의 무신이다.","Historical_Figure","조선시대, 위키백과"
205
+
206
  ์ฃผ์˜์‚ฌํ•ญ:
207
  - text์— ์‰ผํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํฐ๋”ฐ์˜ดํ‘œ๋กœ ๊ฐ์‹ธ๊ธฐ
208
  - ํฐ๋”ฐ์˜ดํ‘œ๋Š” ๋ฐฑ์Šฌ๋ž˜์‹œ๋กœ ์ด์Šค์ผ€์ดํ”„ ์ฒ˜๋ฆฌ
209
  - ๊ฐ ํ–‰์€ ์ƒˆ๋กœ์šด ์ค„๋กœ ๊ตฌ๋ถ„
210
+ - ๋ถˆํ•„์š”ํ•œ ๋ฐ˜๋ณต ์ถœ๋ ฅ ๊ธˆ์ง€"""
 
211
 
212
+ full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
213
 
214
  try:
215
  response = ""
216
  stream = hf_client.text_generation(
217
  prompt=full_prompt,
218
  max_new_tokens=4000,
219
+ temperature=0.1, # 더 결정적인 출력을 위해 낮춤
220
  top_p=0.9,
221
  stream=True,
222
  )
 
225
  if msg:
226
  response += msg
227
 
228
+ # <EOS_TOKEN> 이전까지만 추출하고 정제
229
  if "<EOS_TOKEN>" in response:
230
  processed_text = response.split("<EOS_TOKEN>")[0].strip()
231
  else:
232
  processed_text = response.strip()
233
+
234
+ # 중복 출력 제거
235
+ lines = processed_text.split('\n')
236
+ unique_lines = []
237
+ seen_texts = set()
238
+
239
+ for line in lines:
240
+ line = line.strip()
241
+ if line and '출력:' not in line and line not in seen_texts:
242
+ unique_lines.append(line)
243
+ seen_texts.add(line)
244
+
245
+ processed_text = '\n'.join(unique_lines)
246
 
247
  # CSV 형식 검증
248
  try:
 
451
  outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
452
  )
453
 
454
+ # ๋„ค๋ฒˆ์งธ ํƒญ์˜ UI ๋ถ€๋ถ„ ์ˆ˜์ •
455
  with gr.Tab("Text Preprocessing with LLM"):
456
  gr.Markdown("### ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด LLM์ด ๋ฐ์ดํ„ฐ์…‹ ํ˜•์‹์— ๋งž๊ฒŒ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.")
457
  with gr.Row():
 
465
  with gr.Row():
466
  preprocess_button = gr.Button("전처리 실행", variant="primary")
467
  clear_button = gr.Button("초기화")
468
+
469
  preprocess_status = gr.Textbox(
470
  label="전처리 상태",
471
  interactive=False,
 
479
  )
480
 
481
  # Parquet ๋ณ€ํ™˜ ๋ฐ ๋‹ค์šด๋กœ๋“œ ์„น์…˜
482
+ convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜")
483
+ download_parquet = gr.File(label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
484
+
485
+
486
+
 
487
 
488
  def handle_text_preprocessing(input_text: str):
489
  if not input_text.strip():