myezrag

Running

App Files Files Community

ginipick committed on Oct 25, 2024

Commit

4f03165

verified ·

1 Parent(s): 6998afd

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -73

app.py CHANGED Viewed

@@ -65,51 +65,6 @@ def preprocess_single_chunk(chunk: str) -> str:
     except Exception as e:
         return f"청크 처리 중 오류 발생: {str(e)}"
-def preprocess_text_with_llm(input_text: str) -> str:
-    if not input_text.strip():
-        return "입력 텍스트가 비어있습니다."
-    try:
-        # 텍스트를 청크로 분할
-        chunks = chunk_text(input_text)
-        # 병렬 처리로 청크들을 처리
-        with ThreadPoolExecutor(max_workers=3) as executor:
-            processed_chunks = list(executor.map(cached_preprocess, chunks))
-        # 결과 병합 및 중복 제거
-        all_lines = []
-        seen_texts = set()
-        current_id = 1
-        for chunk_result in processed_chunks:
-            lines = chunk_result.split('\n')
-            for line in lines:
-                line = line.strip()
-                if line and '출력:' not in line and line not in seen_texts:
-                    # ID 재할당
-                    parts = line.split(',', 1)
-                    if len(parts) > 1:
-                        new_line = f"{current_id},{parts[1]}"
-                        all_lines.append(new_line)
-                        seen_texts.add(new_line)
-                        current_id += 1
-        processed_text = '\n'.join(all_lines)
-        # CSV 형식 검증
-        try:
-            from io import StringIO
-            import csv
-            csv.reader(StringIO(processed_text))
-            return processed_text
-        except csv.Error:
-            return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
-    except Exception as e:
-        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
-        print(error_message)
-        return error_message
@@ -278,6 +233,7 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
         return "입력 텍스트가 비어있습니다."
@@ -310,40 +266,50 @@ def preprocess_text_with_llm(input_text: str) -> str:
 - 각 행은 새로운 줄로 구분
 - 불필요한 반복 출력 금지"""
-    full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
     try:
-        response = ""
-        stream = hf_client.text_generation(
-            prompt=full_prompt,
-            max_new_tokens=4000,
-            temperature=0.1,  # 더 결정적인 출력을 위해 낮춤
-            top_p=0.9,
-            stream=True,
-        )
-        for msg in stream:
-            if msg:
-                response += msg
-        # <EOS_TOKEN> 이전까지만 추출하고 정제
-        if "<EOS_TOKEN>" in response:
-            processed_text = response.split("<EOS_TOKEN>")[0].strip()
-        else:
-            processed_text = response.strip()
-        # 중복 출력 제거
-        lines = processed_text.split('\n')
-        unique_lines = []
         seen_texts = set()
-        for line in lines:
-            line = line.strip()
-            if line and '출력:' not in line and line not in seen_texts:
-                unique_lines.append(line)
-                seen_texts.add(line)
-        processed_text = '\n'.join(unique_lines)
         # CSV 형식 검증
         try:
@@ -359,6 +325,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
         print(error_message)
         return error_message
 # CSS 설정
 css = """
 footer {

     except Exception as e:
         return f"청크 처리 중 오류 발생: {str(e)}"
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
     if not input_text.strip():
         return "입력 텍스트가 비어있습니다."
 - 각 행은 새로운 줄로 구분
 - 불필요한 반복 출력 금지"""
     try:
+        # 텍스트를 청크로 분할
+        chunks = chunk_text(input_text)
+        # 병렬 처리로 청크들을 처리
+        with ThreadPoolExecutor(max_workers=3) as executor:
+            processed_chunks = []
+            for chunk in chunks:
+                # 각 청크에 대한 프롬프트 생성
+                chunk_prompt = f"{system_prompt}\n\n입력텍스트:\n{chunk}\n\n출력:"
+                future = executor.submit(
+                    hf_client.text_generation,
+                    prompt=chunk_prompt,
+                    max_new_tokens=2000,
+                    temperature=0.1,
+                    top_p=0.5,
+                    stream=False
+                )
+                processed_chunks.append(future.result())
+        # 결과 병합 및 중복 제거
+        all_lines = []
         seen_texts = set()
+        current_id = 1
+        for chunk_result in processed_chunks:
+            # EOS_TOKEN 처리
+            if "<EOS_TOKEN>" in chunk_result:
+                chunk_result = chunk_result.split("<EOS_TOKEN>")[0]
+            lines = chunk_result.strip().split('\n')
+            for line in lines:
+                line = line.strip()
+                if line and '출력:' not in line and line not in seen_texts:
+                    # ID 재할당
+                    parts = line.split(',', 1)
+                    if len(parts) > 1:
+                        new_line = f"{current_id},{parts[1]}"
+                        if new_line not in seen_texts:  # 추가적인 중복 검사
+                            all_lines.append(new_line)
+                            seen_texts.add(new_line)
+                            current_id += 1
+        processed_text = '\n'.join(all_lines)
         # CSV 형식 검증
         try:
         print(error_message)
         return error_message
 # CSS 설정
 css = """
 footer {