ginipick committed on
Commit 2a5d06b • 1 Parent(s): f450551

Update app.py

Files changed (1)
  1. app.py +59 -47
app.py CHANGED
@@ -11,10 +11,12 @@ from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor
 import math
 import nltk
-nltk.download('punkt')
 from nltk.tokenize import sent_tokenize
 from transformers import AutoTokenizer
 
+# NLTK 데이터 λ‹€μš΄λ‘œλ“œ
+nltk.download('punkt')
+
 # μΆ”λ‘  API ν΄λΌμ΄μ–ΈνŠΈ μ„€μ •
 hf_client = InferenceClient(
     "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
@@ -63,13 +65,13 @@ def preprocess_single_chunk(chunk: str) -> str:
             prompt=full_prompt,
             max_new_tokens=2000, # 토큰 수 μ œν•œ
             temperature=0.1, # 더 결정적인 좜λ ₯
-            top_p=0.5, # 더 μ§‘μ€‘λœ 좜λ ₯
-            stream=False # 슀트리밍 λΉ„ν™œμ„±ν™”
+            top_p=0.5, # 더 μ§‘μ€‘λœ 좜λ ₯
+            stream=False # 슀트리밍 λΉ„ν™œμ„±ν™”
         )
-
         return response.strip()
     except Exception as e:
-        return f"청크 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
+        print(f"청크 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
+        return "청크 처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
 
 def load_code(filename: str) -> str:
     try:
@@ -78,7 +80,8 @@ def load_code(filename: str) -> str:
     except FileNotFoundError:
         return f"{filename} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
     except Exception as e:
-        return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
+        print(f"파일 읽기 였λ₯˜: {str(e)}\n{traceback.format_exc()}")
+        return "νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
 
 def load_parquet(filename: str) -> str:
     try:
@@ -87,7 +90,8 @@ def load_parquet(filename: str) -> str:
     except FileNotFoundError:
         return f"{filename} νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
     except Exception as e:
-        return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
+        print(f"Parquet 파일 λ‘œλ“œ 였λ₯˜: {str(e)}\n{traceback.format_exc()}")
+        return "νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
 
 def respond(
     message: str,
@@ -136,9 +140,9 @@ def respond(
             response += msg
             yield response
     except Exception as e:
-        error_message = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}\n{traceback.format_exc()}"
+        error_message = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
         print(error_message)
-        yield error_message
+        yield "μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
 
 def upload_csv(file_path: str) -> Tuple[str, str]:
     try:
@@ -160,7 +164,8 @@ def upload_csv(file_path: str) -> Tuple[str, str]:
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         return f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ μ—…λ‘œλ“œλ˜κ³  λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_filename
     except Exception as e:
-        return f"CSV 파일 μ—…λ‘œλ“œ 및 λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", ""
+        print(f"CSV 파일 μ—…λ‘œλ“œ 및 λ³€ν™˜ 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
+        return "CSV 파일 μ—…λ‘œλ“œ 및 λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”.", ""
 
 def upload_parquet(file_path: str) -> Tuple[str, str, str]:
     try:
@@ -172,25 +177,26 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
         parquet_json = df.to_json(orient='records', force_ascii=False)
         return "Parquet 파일이 μ„±κ³΅μ μœΌλ‘œ μ—…λ‘œλ“œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_content, parquet_json
     except Exception as e:
-        return f"Parquet 파일 μ—…λ‘œλ“œ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", "", ""
+        print(f"Parquet 파일 μ—…λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
+        return "Parquet 파일 μ—…λ‘œλ“œ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”.", "", ""
 
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         from io import StringIO
         import csv
-
+
         # μž…λ ₯ ν…μŠ€νŠΈ μ •μ œ
         lines = text.strip().split('\n')
         cleaned_lines = []
-
+
         for line in lines:
             # 빈 쀄 κ±΄λ„ˆλ›°κΈ°
             if not line.strip():
                 continue
-
+
             # μŒλ”°μ˜΄ν‘œ μ •κ·œν™”
             line = line.replace('""', '"') # 쀑볡 μŒλ”°μ˜΄ν‘œ 처리
-
+
             # CSV νŒŒμ‹±μ„ μœ„ν•œ μž„μ‹œ StringIO 객체 생성
             temp_buffer = StringIO(line)
             try:
@@ -201,14 +207,14 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
                 # 각 ν•„λ“œλ₯Ό 적절히 ν¬λ§·νŒ…
                 formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
                 cleaned_lines.append(formatted_line)
-            except:
+            except Exception as e:
                 continue
             finally:
                 temp_buffer.close()
-
+
         # μ •μ œλœ CSV 데이터 생성
         cleaned_csv = '\n'.join(cleaned_lines)
-
+
         # DataFrame 생성
         df = pd.read_csv(
             StringIO(cleaned_csv),
@@ -217,28 +223,28 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
             escapechar='\\',
             names=['id', 'text', 'label', 'metadata']
         )
-
+
         # 데이터 μœ ν˜• μ΅œμ ν™”
        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
-
+
         # Parquet 파일둜 λ³€ν™˜
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
-
+
         # Parquet 파일 λ‚΄μš© 미리보기
         parquet_content = load_parquet(parquet_filename)
-
+
         return f"{parquet_filename} 파일이 μ„±κ³΅μ μœΌλ‘œ λ³€ν™˜λ˜μ—ˆμŠ΅λ‹ˆλ‹€.", parquet_content, parquet_filename
-
+
     except Exception as e:
         error_message = f"ν…μŠ€νŠΈ λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
         print(f"{error_message}\n{traceback.format_exc()}")
-        return error_message, "", ""
+        return "ν…μŠ€νŠΈ λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”.", "", ""
 
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
-        return "μž…λ ₯ ν…μŠ€νŠΈκ°€ λΉ„μ–΄μžˆμŠ΅λ‹ˆλ‹€."
-
+        return "μž…λ ₯ ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€."
+
     system_prompt = """당신은 데이터 μ „μ²˜λ¦¬ μ „λ¬Έκ°€μž…λ‹ˆλ‹€. μž…λ ₯된 ν…μŠ€νŠΈλ₯Ό CSV 데이터셋 ν˜•μ‹μœΌλ‘œ λ³€ν™˜ν•˜μ„Έμš”.
 
 κ·œμΉ™:
@@ -270,7 +276,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
     try:
         # ν…μŠ€νŠΈλ₯Ό 청크둜 λΆ„ν• 
         chunks = chunk_text(input_text)
-
+
         # 병렬 처리둜 청크듀을 처리
         with ThreadPoolExecutor(max_workers=3) as executor:
             futures = []
@@ -286,18 +292,19 @@ def preprocess_text_with_llm(input_text: str) -> str:
                     stream=False
                 )
                 futures.append(future)
+
             processed_chunks = [future.result() for future in futures]
-
+
         # κ²°κ³Ό 병합 및 쀑볡 제거
         all_lines = []
         seen_texts = set()
         current_id = 1
-
+
         for chunk_result in processed_chunks:
             # EOS_TOKEN 처리
             if "<EOS_TOKEN>" in chunk_result:
                 chunk_result = chunk_result.split("<EOS_TOKEN>")[0]
-
+
             lines = chunk_result.strip().split('\n')
             for line in lines:
                 line = line.strip()
@@ -310,9 +317,9 @@ def preprocess_text_with_llm(input_text: str) -> str:
                     all_lines.append(new_line)
                     seen_texts.add(new_line)
                     current_id += 1
-
+
         processed_text = '\n'.join(all_lines)
-
+
         # CSV ν˜•μ‹ 검증
         try:
             from io import StringIO
@@ -321,11 +328,11 @@ def preprocess_text_with_llm(input_text: str) -> str:
             return processed_text
         except csv.Error:
             return "LLM이 μ˜¬λ°”λ₯Έ CSV ν˜•μ‹μ„ μƒμ„±ν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€. λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
-
+
     except Exception as e:
         error_message = f"μ „μ²˜λ¦¬ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
         print(error_message)
-        return error_message
+        return "μ „μ²˜λ¦¬ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
 
 # CSS μ„€μ •
 css = """
@@ -407,7 +414,8 @@ with gr.Blocks(css=css) as demo:
                 # μ–΄μ‹œμŠ€ν„΄νŠΈμ˜ 응닡을 νžˆμŠ€ν† λ¦¬μ— μΆ”κ°€
                 history.append({"role": "assistant", "content": partial_response})
             except Exception as e:
-                response = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
+                print(f"λ©”μ‹œμ§€ 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
+                response = "λ©”μ‹œμ§€ 처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
                 history.append({"role": "assistant", "content": response})
                 yield history, ""
 
@@ -528,46 +536,48 @@ with gr.Blocks(css=css) as demo:
                 lines=15,
                 placeholder="여기에 μ „μ²˜λ¦¬ν•  ν…μŠ€νŠΈλ₯Ό μž…λ ₯ν•˜μ„Έμš”..."
             )
-
+
             with gr.Row():
                 preprocess_button = gr.Button("μ „μ²˜λ¦¬ μ‹€ν–‰", variant="primary")
                 clear_button = gr.Button("μ΄ˆκΈ°ν™”")
-
+
             preprocess_status = gr.Textbox(
                 label="μ „μ²˜λ¦¬ μƒνƒœ",
                 interactive=False,
                 value="λŒ€κΈ° 쀑..."
             )
-
+
             processed_text_output = gr.Textbox(
                 label="μ „μ²˜λ¦¬λœ 데이터셋 좜λ ₯",
                 lines=15,
                 interactive=False
            )
-
+
             # Parquet λ³€ν™˜ 및 λ‹€μš΄λ‘œλ“œ μ„Ήμ…˜
             convert_to_parquet_button = gr.Button("Parquet으둜 λ³€ν™˜")
             download_parquet = gr.File(label="λ³€ν™˜λœ Parquet 파일 λ‹€μš΄λ‘œλ“œ")
 
             def handle_text_preprocessing(input_text: str):
                 if not input_text.strip():
-                    return "μž…λ ₯ ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.", ""
-
+                    yield "μž…λ ₯ ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.", ""
+                    return
+
                 try:
                     preprocess_status_msg = "μ „μ²˜λ¦¬λ₯Ό μ‹œμž‘ν•©λ‹ˆλ‹€..."
                     yield preprocess_status_msg, ""
-
+
                     processed_text = preprocess_text_with_llm(input_text)
-
+
                     if processed_text:
                         preprocess_status_msg = "μ „μ²˜λ¦¬κ°€ μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€."
                         yield preprocess_status_msg, processed_text
                     else:
                         preprocess_status_msg = "μ „μ²˜λ¦¬ κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€."
                        yield preprocess_status_msg, ""
-
+
                 except Exception as e:
-                    error_msg = f"처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
+                    error_msg = "μ „μ²˜λ¦¬ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”."
+                    print(f"μ „μ²˜λ¦¬ 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
                     yield error_msg, ""
 
             def clear_inputs():
@@ -576,14 +586,15 @@ with gr.Blocks(css=css) as demo:
             def convert_to_parquet_file(processed_text: str):
                 if not processed_text.strip():
                     return "λ³€ν™˜ν•  ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.", None
-
+
                 try:
                     message, parquet_content, parquet_filename = text_to_parquet(processed_text)
                     if parquet_filename:
                         return message, parquet_filename
                     return message, None
                 except Exception as e:
-                    return f"Parquet λ³€ν™˜ 쀑 였λ₯˜ λ°œμƒ: {str(e)}", None
+                    print(f"Parquet λ³€ν™˜ 쀑 였λ₯˜ λ°œμƒ: {str(e)}\n{traceback.format_exc()}")
+                    return "Parquet λ³€ν™˜ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. κ΄€λ¦¬μžμ—κ²Œ λ¬Έμ˜ν•˜μ„Έμš”.", None
 
             # 이벀트 ν•Έλ“€λŸ¬ μ—°κ²°
             preprocess_button.click(
@@ -619,3 +630,4 @@ with gr.Blocks(css=css) as demo:
 
 if __name__ == "__main__":
     demo.launch(share=True) # μ½”λ“œμƒμ˜ 였λ₯˜λ‚˜ κ°œμ„ μ΄ ν•„μš”ν•œ 사항을 μΆ”λ‘ ν•˜μ—¬ λ³΄κ³ ν•˜λΌ
+