ginipick commited on
Commit
4f03165
•
1 Parent(s): 6998afd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -73
app.py CHANGED
@@ -65,51 +65,6 @@ def preprocess_single_chunk(chunk: str) -> str:
65
  except Exception as e:
66
  return f"청크 처리 중 오류 발생: {str(e)}"
67
 
68
- def preprocess_text_with_llm(input_text: str) -> str:
69
- if not input_text.strip():
70
- return "입력 텍스트가 비어있습니다."
71
-
72
- try:
73
- # 텍스트를 청크로 분할
74
- chunks = chunk_text(input_text)
75
-
76
- # 병렬 처리로 청크들을 처리
77
- with ThreadPoolExecutor(max_workers=3) as executor:
78
- processed_chunks = list(executor.map(cached_preprocess, chunks))
79
-
80
- # 결과 병합 및 중복 제거
81
- all_lines = []
82
- seen_texts = set()
83
- current_id = 1
84
-
85
- for chunk_result in processed_chunks:
86
- lines = chunk_result.split('\n')
87
- for line in lines:
88
- line = line.strip()
89
- if line and '출력:' not in line and line not in seen_texts:
90
- # ID 재할당
91
- parts = line.split(',', 1)
92
- if len(parts) > 1:
93
- new_line = f"{current_id},{parts[1]}"
94
- all_lines.append(new_line)
95
- seen_texts.add(new_line)
96
- current_id += 1
97
-
98
- processed_text = '\n'.join(all_lines)
99
-
100
- # CSV 형식 검증
101
- try:
102
- from io import StringIO
103
- import csv
104
- csv.reader(StringIO(processed_text))
105
- return processed_text
106
- except csv.Error:
107
- return "LLM์ด ์˜ฌ๋ฐ”๋ฅธ CSV ํ˜•์‹์„ ์ƒ์„ฑํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”."
108
-
109
- except Exception as e:
110
- error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
111
- print(error_message)
112
- return error_message
113
 
114
 
115
 
@@ -278,6 +233,7 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
278
  print(f"{error_message}\n{traceback.format_exc()}")
279
  return error_message, "", ""
280
 
 
281
  def preprocess_text_with_llm(input_text: str) -> str:
282
  if not input_text.strip():
283
  return "입력 텍스트가 비어있습니다."
@@ -310,40 +266,50 @@ def preprocess_text_with_llm(input_text: str) -> str:
310
  - 각 행은 새로운 줄로 구분
311
  - 불필요한 반복 출력 금지"""
312
 
313
- full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
314
-
315
  try:
316
- response = ""
317
- stream = hf_client.text_generation(
318
- prompt=full_prompt,
319
- max_new_tokens=4000,
320
- temperature=0.1, # 더 결정적인 출력을 위해 낮춤
321
- top_p=0.9,
322
- stream=True,
323
- )
324
 
325
- for msg in stream:
326
- if msg:
327
- response += msg
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
- # <EOS_TOKEN> 이전까지만 추출하고 정제
330
- if "<EOS_TOKEN>" in response:
331
- processed_text = response.split("<EOS_TOKEN>")[0].strip()
332
- else:
333
- processed_text = response.strip()
334
-
335
- # 중복 출력 제거
336
- lines = processed_text.split('\n')
337
- unique_lines = []
338
  seen_texts = set()
 
339
 
340
- for line in lines:
341
- line = line.strip()
342
- if line and '출력:' not in line and line not in seen_texts:
343
- unique_lines.append(line)
344
- seen_texts.add(line)
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
- processed_text = '\n'.join(unique_lines)
347
 
348
  # CSV 형식 검증
349
  try:
@@ -359,6 +325,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
359
  print(error_message)
360
  return error_message
361
 
 
362
  # CSS 설정
363
  css = """
364
  footer {
 
65
  except Exception as e:
66
  return f"청크 처리 중 오류 발생: {str(e)}"
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
 
70
 
 
233
  print(f"{error_message}\n{traceback.format_exc()}")
234
  return error_message, "", ""
235
 
236
+
237
  def preprocess_text_with_llm(input_text: str) -> str:
238
  if not input_text.strip():
239
  return "입력 텍스트가 비어있습니다."
 
266
  - 각 행은 새로운 줄로 구분
267
  - 불필요한 반복 출력 금지"""
268
 
 
 
269
  try:
270
+ # 텍스트를 청크로 분할
271
+ chunks = chunk_text(input_text)
 
 
 
 
 
 
272
 
273
+ # 병렬 처리로 청크들을 처리
274
+ with ThreadPoolExecutor(max_workers=3) as executor:
275
+ processed_chunks = []
276
+ for chunk in chunks:
277
+ # 각 청크에 대한 프롬프트 생성
278
+ chunk_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
279
+ future = executor.submit(
280
+ hf_client.text_generation,
281
+ prompt=chunk_prompt,
282
+ max_new_tokens=2000,
283
+ temperature=0.1,
284
+ top_p=0.5,
285
+ stream=False
286
+ )
287
+ processed_chunks.append(future.result())
288
 
289
+ # 결과 병합 및 중복 제거
290
+ all_lines = []
 
 
 
 
 
 
 
291
  seen_texts = set()
292
+ current_id = 1
293
 
294
+ for chunk_result in processed_chunks:
295
+ # EOS_TOKEN 처리
296
+ if "<EOS_TOKEN>" in chunk_result:
297
+ chunk_result = chunk_result.split("<EOS_TOKEN>")[0]
298
+
299
+ lines = chunk_result.strip().split('\n')
300
+ for line in lines:
301
+ line = line.strip()
302
+ if line and '출력:' not in line and line not in seen_texts:
303
+ # ID 재할당
304
+ parts = line.split(',', 1)
305
+ if len(parts) > 1:
306
+ new_line = f"{current_id},{parts[1]}"
307
+ if new_line not in seen_texts: # 추가적인 중복 검사
308
+ all_lines.append(new_line)
309
+ seen_texts.add(new_line)
310
+ current_id += 1
311
 
312
+ processed_text = '\n'.join(all_lines)
313
 
314
  # CSV 형식 검증
315
  try:
 
325
  print(error_message)
326
  return error_message
327
 
328
+
329
  # CSS 설정
330
  css = """
331
  footer {