myezrag

Running

App Files Files Community

ginipick committed on Oct 25, 2024

Commit

06ef6db

verified ·

1 Parent(s): 56d3d16

Update app-backup2.py

Browse files

Files changed (1) hide show

app-backup2.py +90 -33

app-backup2.py CHANGED Viewed

@@ -119,29 +119,62 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         from io import StringIO
-        # CSV 데이터를 StringIO를 통해 읽기
-        csv_data = StringIO(text)
         df = pd.read_csv(
-            csv_data,
             sep=',',
-            dtype=str,
-            quoting=csv.QUOTE_ALL,  # 모든 필드를 큰따옴표로 감싸는 것으로 처리
-            escapechar='\\',        # 이스케이프 문자 설정
-            engine='python',        # Python 엔진 사용
-            header=None,            # 첫 번째 행을 열 이름으로 사용하지 않음
-            names=['id', 'text', 'label', 'metadata']  # 열 이름 지정
         )
         # 데이터 유형 최적화
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         # Parquet 파일 내용 미리보기
         parquet_content = load_parquet(parquet_filename)
         return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
     except Exception as e:
-        error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
-        print(error_message)
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
@@ -154,26 +187,36 @@ def preprocess_text_with_llm(input_text: str) -> str:
 1. 출력 형식: id,text,label,metadata
 2. id: 1부터 시작하는 순차적 번호
 3. text: 의미 있는 단위로 분리된 텍스트
-4. label: 텍스트의 주제나 카테고리
-5. metadata: 추가 정보(날짜, 출처 등)
 주의사항:
-- 텍스트에 쉼표가 있으면 큰따옴표로 감싸기
 - 큰따옴표는 백슬래시로 이스케이프 처리
 - 각 행은 새로운 줄로 구분
-- 모든 필드는 쉼표로 구분
-입력 텍스트:
-"""
-    full_prompt = f"{system_prompt}\n\n{input_text}\n\n출력:"
     try:
         response = ""
         stream = hf_client.text_generation(
             prompt=full_prompt,
-            max_new_tokens=4000,  # 토큰 수 증가
-            temperature=0.3,      # 더 결정적인 출력을 위해 낮춤
             top_p=0.9,
             stream=True,
         )
@@ -182,12 +225,27 @@ def preprocess_text_with_llm(input_text: str) -> str:
             if msg:
                 response += msg
-        # 응답 정제
-        processed_text = response.strip()
         # CSV 형식 검증
         try:
-            # StringIO를 사용하여 CSV 형식 검증
             from io import StringIO
             import csv
             csv.reader(StringIO(processed_text))
@@ -196,7 +254,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
             return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
     except Exception as e:
-        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}\n{traceback.format_exc()}"
         print(error_message)
         return error_message
@@ -393,7 +451,7 @@ with gr.Blocks(css=css) as demo:
                     outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
                 )
-    # 네 번째 탭: 텍스트를 데이터셋 형식으로 전처리 (개선된 버전)
     with gr.Tab("Text Preprocessing with LLM"):
         gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
         with gr.Row():
@@ -407,7 +465,7 @@ with gr.Blocks(css=css) as demo:
                 with gr.Row():
                     preprocess_button = gr.Button("전처리 실행", variant="primary")
                     clear_button = gr.Button("초기화")
                 preprocess_status = gr.Textbox(
                     label="전처리 상태",
                     interactive=False,
@@ -421,12 +479,11 @@ with gr.Blocks(css=css) as demo:
                 )
                 # Parquet 변환 및 다운로드 섹션
-                with gr.Row():
-                    convert_to_parquet_button = gr.Button("Parquet으로 변환", visible=True)
-                    download_parquet = gr.File(
-                        label="변환된 Parquet 파일 다운로드",
-                        visible=False
-                    )
                 def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():

 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         from io import StringIO
+        import csv
+        # 입력 텍스트 정제
+        lines = text.strip().split('\n')
+        cleaned_lines = []
+        for line in lines:
+            # 빈 줄 건너뛰기
+            if not line.strip():
+                continue
+            # 쌍따옴표 정규화
+            line = line.replace('""', '"')  # 중복 쌍따옴표 처리
+            # CSV 파싱을 위한 임시 StringIO 객체 생성
+            temp_buffer = StringIO(line)
+            try:
+                # CSV 라인 파싱 시도
+                reader = csv.reader(temp_buffer, quoting=csv.QUOTE_ALL)
+                parsed_line = next(reader)
+                if len(parsed_line) == 4:  # id, text, label, metadata
+                    # 각 필드를 적절히 포맷팅
+                    formatted_line = f'{parsed_line[0]},"{parsed_line[1]}","{parsed_line[2]}","{parsed_line[3]}"'
+                    cleaned_lines.append(formatted_line)
+            except:
+                continue
+            finally:
+                temp_buffer.close()
+        # 정제된 CSV 데이터 생성
+        cleaned_csv = '\n'.join(cleaned_lines)
+        # DataFrame 생성
         df = pd.read_csv(
+            StringIO(cleaned_csv),
             sep=',',
+            quoting=csv.QUOTE_ALL,
+            escapechar='\\',
+            names=['id', 'text', 'label', 'metadata']
         )
         # 데이터 유형 최적화
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'string', 'metadata': 'string'})
         # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         # Parquet 파일 내용 미리보기
         parquet_content = load_parquet(parquet_filename)
         return f"{parquet_filename} 파일이 성공적으로 변환되었습니다.", parquet_content, parquet_filename
     except Exception as e:
+        error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"
+        print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
 def preprocess_text_with_llm(input_text: str) -> str:
 1. 출력 형식: id,text,label,metadata
 2. id: 1부터 시작하는 순차적 번호
 3. text: 의미 있는 단위로 분리된 텍스트
+4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
+   - Historical_Figure (역사적 인물)
+   - Military_History (군사 역사)
+   - Technology (기술)
+   - Politics (정치)
+   - Culture (문화)
+5. metadata: 날짜, 출처 등 추가 정보
+중요:
+- 동일한 텍스트를 반복해서 출력하지 말 것
+- 각 텍스트는 한 번만 처리하여 가장 적합한 label을 선택할 것
+- 입력 텍스트를 의미 단위로 적절히 분리할 것
+예시:
+1,"이순신은 조선 중기의 무신이다.","Historical_Figure","조선시대, 위키백과"
 주의사항:
+- text에 쉼표가 있으면 큰따옴표로 감싸기
 - 큰따옴표는 백슬래시로 이스케이프 처리
 - 각 행은 새로운 줄로 구분
+- 불필요한 반복 출력 금지"""
+    full_prompt = f"{system_prompt}\n\n입력텍스트:\n{input_text}\n\n출력:"
     try:
         response = ""
         stream = hf_client.text_generation(
             prompt=full_prompt,
+            max_new_tokens=4000,
+            temperature=0.1,  # 더 결정적인 출력을 위해 낮춤
             top_p=0.9,
             stream=True,
         )
             if msg:
                 response += msg
+        # <EOS_TOKEN> 이전까지만 추출하고 정제
+        if "<EOS_TOKEN>" in response:
+            processed_text = response.split("<EOS_TOKEN>")[0].strip()
+        else:
+            processed_text = response.strip()
+        # 중복 출력 제거
+        lines = processed_text.split('\n')
+        unique_lines = []
+        seen_texts = set()
+        for line in lines:
+            line = line.strip()
+            if line and '출력:' not in line and line not in seen_texts:
+                unique_lines.append(line)
+                seen_texts.add(line)
+        processed_text = '\n'.join(unique_lines)
         # CSV 형식 검증
         try:
             from io import StringIO
             import csv
             csv.reader(StringIO(processed_text))
             return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
     except Exception as e:
+        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
         print(error_message)
         return error_message
                     outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
                 )
+    # 네번째 탭의 UI 부분 수정
     with gr.Tab("Text Preprocessing with LLM"):
         gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
         with gr.Row():
                 with gr.Row():
                     preprocess_button = gr.Button("전처리 실행", variant="primary")
                     clear_button = gr.Button("초기화")
                 preprocess_status = gr.Textbox(
                     label="전처리 상태",
                     interactive=False,
                 )
                 # Parquet 변환 및 다운로드 섹션
+                convert_to_parquet_button = gr.Button("Parquet으로 변환")
+                download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
                 def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():