ginipick commited on
Commit
aca9376
•
1 Parent(s): 4f03165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -25
app.py CHANGED
@@ -7,34 +7,40 @@ import json
7
  import io
8
  import traceback
9
  import csv
 
 
 
 
 
 
 
10
 
11
  # 추론 API 클라이언트 설정
12
  hf_client = InferenceClient(
13
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
14
  )
15
 
16
- from functools import lru_cache
17
- from concurrent.futures import ThreadPoolExecutor
18
- import math
19
-
20
  def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
21
  """텍스트를 더 작은 청크로 분할"""
22
- sentences = text.split('.')
 
23
  chunks = []
24
  current_chunk = []
25
  current_length = 0
26
-
27
  for sentence in sentences:
28
- sentence = sentence.strip() + '.'
29
- if current_length + len(sentence) > chunk_size:
 
 
30
  if current_chunk:
31
  chunks.append(' '.join(current_chunk))
32
  current_chunk = [sentence]
33
- current_length = len(sentence)
34
  else:
35
  current_chunk.append(sentence)
36
- current_length += len(sentence)
37
-
38
  if current_chunk:
39
  chunks.append(' '.join(current_chunk))
40
  return chunks
@@ -65,10 +71,6 @@ def preprocess_single_chunk(chunk: str) -> str:
65
  except Exception as e:
66
  return f"์ฒญํฌ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
67
 
68
-
69
-
70
-
71
-
72
  def load_code(filename: str) -> str:
73
  try:
74
  with open(filename, 'r', encoding='utf-8') as file:
@@ -233,7 +235,6 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
233
  print(f"{error_message}\n{traceback.format_exc()}")
234
  return error_message, "", ""
235
 
236
-
237
  def preprocess_text_with_llm(input_text: str) -> str:
238
  if not input_text.strip():
239
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
@@ -272,7 +273,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
272
 
273
  # 병렬 처리로 청크들을 처리
274
  with ThreadPoolExecutor(max_workers=3) as executor:
275
- processed_chunks = []
276
  for chunk in chunks:
277
  # 각 청크에 대한 프롬프트 생성
278
  chunk_prompt = f"{system_prompt}\n\n์ž…๋ ฅํ…์ŠคํŠธ:\n{chunk}\n\n์ถœ๋ ฅ:"
@@ -284,7 +285,8 @@ def preprocess_text_with_llm(input_text: str) -> str:
284
  top_p=0.5,
285
  stream=False
286
  )
287
- processed_chunks.append(future.result())
 
288
 
289
  # 결과 병합 및 중복 제거
290
  all_lines = []
@@ -325,7 +327,6 @@ def preprocess_text_with_llm(input_text: str) -> str:
325
  print(error_message)
326
  return error_message
327
 
328
-
329
  # CSS 설정
330
  css = """
331
  footer {
@@ -363,8 +364,6 @@ with gr.Blocks(css=css) as demo:
363
  elem_id="initial-description"
364
  )
365
 
366
-
367
-
368
  # ์ฒซ ๋ฒˆ์งธ ํƒญ: ์ฑ—๋ด‡ ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "My ๋ฐ์ดํ„ฐ์…‹+LLM")
369
  with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
370
  gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
@@ -550,9 +549,6 @@ with gr.Blocks(css=css) as demo:
550
  convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜")
551
  download_parquet = gr.File(label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
552
 
553
-
554
-
555
-
556
  def handle_text_preprocessing(input_text: str):
557
  if not input_text.strip():
558
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", ""
@@ -622,4 +618,4 @@ with gr.Blocks(css=css) as demo:
622
  gr.Markdown("### [email protected]", elem_id="initial-description")
623
 
624
  if __name__ == "__main__":
625
- demo.launch(share=True)
 
7
  import io
8
  import traceback
9
  import csv
10
+ from functools import lru_cache
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ import math
13
+ import nltk
14
+ nltk.download('punkt')
15
+ from nltk.tokenize import sent_tokenize
16
+ from transformers import AutoTokenizer
17
 
18
  # 추론 API 클라이언트 설정
19
  hf_client = InferenceClient(
20
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
21
  )
22
 
 
 
 
 
23
  def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
24
  """텍스트를 더 작은 청크로 분할"""
25
+ tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus-08-2024")
26
+ sentences = sent_tokenize(text)
27
  chunks = []
28
  current_chunk = []
29
  current_length = 0
30
+
31
  for sentence in sentences:
32
+ sentence = sentence.strip()
33
+ tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=False)
34
+ sentence_length = len(tokenized_sentence)
35
+ if current_length + sentence_length > chunk_size:
36
  if current_chunk:
37
  chunks.append(' '.join(current_chunk))
38
  current_chunk = [sentence]
39
+ current_length = sentence_length
40
  else:
41
  current_chunk.append(sentence)
42
+ current_length += sentence_length
43
+
44
  if current_chunk:
45
  chunks.append(' '.join(current_chunk))
46
  return chunks
 
71
  except Exception as e:
72
  return f"์ฒญํฌ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
73
 
 
 
 
 
74
  def load_code(filename: str) -> str:
75
  try:
76
  with open(filename, 'r', encoding='utf-8') as file:
 
235
  print(f"{error_message}\n{traceback.format_exc()}")
236
  return error_message, "", ""
237
 
 
238
  def preprocess_text_with_llm(input_text: str) -> str:
239
  if not input_text.strip():
240
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค."
 
273
 
274
  # 병렬 처리로 청크들을 처리
275
  with ThreadPoolExecutor(max_workers=3) as executor:
276
+ futures = []
277
  for chunk in chunks:
278
  # 각 청크에 대한 프롬프트 생성
279
  chunk_prompt = f"{system_prompt}\n\n์ž…๋ ฅํ…์ŠคํŠธ:\n{chunk}\n\n์ถœ๋ ฅ:"
 
285
  top_p=0.5,
286
  stream=False
287
  )
288
+ futures.append(future)
289
+ processed_chunks = [future.result() for future in futures]
290
 
291
  # 결과 병합 및 중복 제거
292
  all_lines = []
 
327
  print(error_message)
328
  return error_message
329
 
 
330
  # CSS 설정
331
  css = """
332
  footer {
 
364
  elem_id="initial-description"
365
  )
366
 
 
 
367
  # ์ฒซ ๋ฒˆ์งธ ํƒญ: ์ฑ—๋ด‡ ๋ฐ์ดํ„ฐ ์—…๋กœ๋“œ (ํƒญ ์ด๋ฆ„ ๋ณ€๊ฒฝ: "My ๋ฐ์ดํ„ฐ์…‹+LLM")
368
  with gr.Tab("My ๋ฐ์ดํ„ฐ์…‹+LLM"):
369
  gr.Markdown("### LLM๊ณผ ๋Œ€ํ™”ํ•˜๊ธฐ")
 
549
  convert_to_parquet_button = gr.Button("Parquet์œผ๋กœ ๋ณ€ํ™˜")
550
  download_parquet = gr.File(label="๋ณ€ํ™˜๋œ Parquet ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
551
 
 
 
 
552
  def handle_text_preprocessing(input_text: str):
553
  if not input_text.strip():
554
  return "์ž…๋ ฅ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.", ""
 
618
  gr.Markdown("### [email protected]", elem_id="initial-description")
619
 
620
  if __name__ == "__main__":
621
+ demo.launch(share=True) # 코드상의 오류나 개선이 필요한 사항을 추론하여 보고하라