myezrag

Running

App Files Files Community

ginipick committed on Oct 25, 2024

Commit

aca9376

verified ·

1 Parent(s): 4f03165

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -25

app.py CHANGED Viewed

@@ -7,34 +7,40 @@ import json
 import io
 import traceback
 import csv
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
     "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
 )
-from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor
-import math
 def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
     """텍스트를 더 작은 청크로 분할"""
-    sentences = text.split('.')
     chunks = []
     current_chunk = []
     current_length = 0
     for sentence in sentences:
-        sentence = sentence.strip() + '.'
-        if current_length + len(sentence) > chunk_size:
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
             current_chunk = [sentence]
-            current_length = len(sentence)
         else:
             current_chunk.append(sentence)
-            current_length += len(sentence)
     if current_chunk:
         chunks.append(' '.join(current_chunk))
     return chunks
@@ -65,10 +71,6 @@ def preprocess_single_chunk(chunk: str) -> str:
     except Exception as e:
         return f"청크 처리 중 오류 발생: {str(e)}"
 def load_code(filename: str) -> str:
     try:
         with open(filename, 'r', encoding='utf-8') as file:
@@ -233,7 +235,6 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
         return "입력 텍스트가 비어있습니다."
@@ -272,7 +273,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
         # 병렬 처리로 청크들을 처리
         with ThreadPoolExecutor(max_workers=3) as executor:
-            processed_chunks = []
             for chunk in chunks:
                 # 각 청크에 대한 프롬프트 생성
                 chunk_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
@@ -284,7 +285,8 @@ def preprocess_text_with_llm(input_text: str) -> str:
                     top_p=0.5,
                     stream=False
                 )
-                processed_chunks.append(future.result())
         # 결과 병합 및 중복 제거
         all_lines = []
@@ -325,7 +327,6 @@ def preprocess_text_with_llm(input_text: str) -> str:
         print(error_message)
         return error_message
 # CSS 설정
 css = """
 footer {
@@ -363,8 +364,6 @@ with gr.Blocks(css=css) as demo:
         elem_id="initial-description"
     )
     # 첫 번째 탭: 챗봇 데이터 업로드 (탭 이름 변경: "My 데이터셋+LLM")
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
@@ -550,9 +549,6 @@ with gr.Blocks(css=css) as demo:
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
                 def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():
                         return "입력 텍스트가 없습니다.", ""
@@ -622,4 +618,4 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
-    demo.launch(share=True)

 import io
 import traceback
 import csv
+from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
+import math
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+from transformers import AutoTokenizer
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
     "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
 )
 @lru_cache(maxsize=1)
 def _get_chunk_tokenizer():
     """Load the tokenizer used for chunk length counting once and reuse it.

     Loading a pretrained tokenizer is expensive, so the instance is cached
     for the lifetime of the process instead of being rebuilt on every
     ``chunk_text`` call.
     """
     return AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus-08-2024")


 def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
     """Split text into smaller chunks along sentence boundaries.

     Sentences are produced by ``sent_tokenize`` and packed greedily: a
     sentence is added to the current chunk as long as the running token
     count stays within ``chunk_size``; otherwise the current chunk is closed
     and the sentence starts a new one.

     Args:
         text: The input text to split.
         chunk_size: Maximum number of tokenizer tokens per chunk. Token
             counts are taken per sentence without special tokens.

     Returns:
         A list of chunks, each being its sentences joined by single spaces.
         A single sentence longer than ``chunk_size`` is not split further and
         is returned as its own oversized chunk.
     """
     # NOTE(review): recent NLTK releases appear to look up the 'punkt_tab'
     # resource rather than 'punkt' for sent_tokenize — confirm that the
     # module-level nltk.download call fetches the resource actually needed.
     tokenizer = _get_chunk_tokenizer()
     chunks = []
     current_chunk = []
     current_length = 0
     for sentence in sent_tokenize(text):
         sentence = sentence.strip()
         if not sentence:
             # Whitespace-only sentences would only add stray spaces on join.
             continue
         sentence_length = len(tokenizer.encode(sentence, add_special_tokens=False))
         if current_length + sentence_length > chunk_size:
             # Budget exceeded: flush what has been collected so far and start
             # a new chunk with this sentence.
             if current_chunk:
                 chunks.append(' '.join(current_chunk))
             current_chunk = [sentence]
             current_length = sentence_length
         else:
             current_chunk.append(sentence)
             current_length += sentence_length
     # Flush the trailing partial chunk.
     if current_chunk:
         chunks.append(' '.join(current_chunk))
     return chunks
     except Exception as e:
         return f"청크 처리 중 오류 발생: {str(e)}"
 def load_code(filename: str) -> str:
     try:
         with open(filename, 'r', encoding='utf-8') as file:
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
         return "입력 텍스트가 비어있습니다."
         # 병렬 처리로 청크들을 처리
         with ThreadPoolExecutor(max_workers=3) as executor:
+            futures = []
             for chunk in chunks:
                 # 각 청크에 대한 프롬프트 생성
                 chunk_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
                     top_p=0.5,
                     stream=False
                 )
+                futures.append(future)
+            processed_chunks = [future.result() for future in futures]
         # 결과 병합 및 중복 제거
         all_lines = []
         print(error_message)
         return error_message
 # CSS 설정
 css = """
 footer {
         elem_id="initial-description"
     )
     # 첫 번째 탭: 챗봇 데이터 업로드 (탭 이름 변경: "My 데이터셋+LLM")
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
                 def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():
                         return "입력 텍스트가 없습니다.", ""
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
+    demo.launch(share=True)  # 코드상의 오류나 개선이 필요한 사항을 추론하여 보고하라