ginipick committed on
Commit
6998afd
โ€ข
1 Parent(s): b3bb461

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py CHANGED
@@ -13,6 +13,107 @@ hf_client = InferenceClient(
13
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
14
  )
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def load_code(filename: str) -> str:
17
  try:
18
  with open(filename, 'r', encoding='utf-8') as file:
 
13
  "CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
14
  )
15
 
16
+ from functools import lru_cache
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import math
19
+
20
def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Split text into chunks of at most ~chunk_size characters.

    The text is split on '.' sentence boundaries and sentences are packed
    greedily into chunks. A single sentence longer than chunk_size becomes
    its own (oversized) chunk. Note the length accounting does not include
    the joining spaces, so a chunk may slightly exceed chunk_size.

    Args:
        text: Raw input text.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for sentence in text.split('.'):
        sentence = sentence.strip()
        if not sentence:
            # Skip empty fragments (e.g. the fragment after a trailing '.');
            # the previous version turned these into spurious "." sentences.
            continue
        sentence += '.'
        if current_length + len(sentence) > chunk_size:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += len(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
41
+
42
def cached_preprocess(text: str) -> str:
    """Memoized front-end for preprocess_single_chunk.

    Results for up to 100 distinct inputs are cached so repeated chunks
    are not re-sent to the model.
    """
    return preprocess_single_chunk(text)


# Apply the LRU cache explicitly (equivalent to using it as a decorator).
cached_preprocess = lru_cache(maxsize=100)(cached_preprocess)
46
+
47
def preprocess_single_chunk(chunk: str) -> str:
    """Run the LLM preprocessing prompt on one text chunk.

    Sends a system prompt plus the chunk to the inference client and
    returns the model output stripped of surrounding whitespace, or a
    (Korean) error-message string if the call fails.
    """
    # System prompt (Korean): "You are a data preprocessing expert.
    # Quickly convert the input text into CSV dataset format.
    # [same rules as before]"
    system_prompt = """당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 빠르게 변환하세요.
[기존 규칙 동일]"""

    # Assemble the prompt: system prompt, labelled input text, output label.
    full_prompt = "\n\n".join([system_prompt, f"입력텍스트:\n{chunk}", "출력:"])

    try:
        # Non-streaming call; low temperature/top_p for deterministic,
        # focused CSV output, with a 2000-token cap on the response.
        result = hf_client.text_generation(
            prompt=full_prompt,
            max_new_tokens=2000,
            temperature=0.1,
            top_p=0.5,
            stream=False,
        )
        return result.strip()
    except Exception as err:
        # Best-effort boundary: report the failure as a string rather
        # than propagating (Korean: "error occurred while processing chunk").
        return f"청크 처리 중 오류 발생: {str(err)}"
67
+
68
def preprocess_text_with_llm(input_text: str) -> str:
    """Preprocess raw text into a deduplicated CSV dataset via the LLM.

    Pipeline: chunk the text on sentence boundaries, process chunks in
    parallel through the cached LLM call, merge the per-chunk CSV lines
    while dropping prompt-echo lines and duplicate rows, renumber the ID
    column from 1, and validate the merged result as CSV.

    Args:
        input_text: Raw input text.

    Returns:
        The merged CSV text, or a (Korean) error-message string on empty
        input, invalid CSV output, or any processing failure.
    """
    if not input_text.strip():
        return "입력 텍스트가 비어있습니다."

    try:
        # Split into ~500-char chunks on '.' boundaries.
        chunks = chunk_text(input_text)

        # Fan out to the LLM; cached_preprocess short-circuits repeats.
        with ThreadPoolExecutor(max_workers=3) as executor:
            processed_chunks = list(executor.map(cached_preprocess, chunks))

        # Merge results: skip blanks and prompt-echo lines ('출력:'),
        # dedupe on the row CONTENT after the first comma (the old code
        # compared the raw line against already-renumbered lines, so
        # duplicates were never actually removed), and reassign IDs.
        all_lines = []
        seen_texts = set()
        current_id = 1

        for chunk_result in processed_chunks:
            for line in chunk_result.split('\n'):
                line = line.strip()
                if not line or '출력:' in line:
                    continue
                parts = line.split(',', 1)
                if len(parts) < 2:
                    continue
                content = parts[1]
                if content in seen_texts:
                    continue
                seen_texts.add(content)
                all_lines.append(f"{current_id},{content}")
                current_id += 1

        processed_text = '\n'.join(all_lines)

        # Validate the merged output actually parses as CSV. csv.reader is
        # lazy — the old code only constructed it, so csv.Error could never
        # fire; consuming the reader forces the parse.
        try:
            from io import StringIO
            import csv
            list(csv.reader(StringIO(processed_text)))
            return processed_text
        except csv.Error:
            return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."

    except Exception as e:
        # Top-level boundary for the Gradio handler: log and return the
        # error message instead of raising.
        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
        print(error_message)
        return error_message
113
+
114
+
115
+
116
+
117
  def load_code(filename: str) -> str:
118
  try:
119
  with open(filename, 'r', encoding='utf-8') as file: