Spaces:

dseditor
/

Docfixer

Sleeping

App Files Files Community

dseditor committed on Jul 18

Commit

af86850

verified ·

1 Parent(s): b565e7e

Upload app.py

Browse files

Files changed (1) hide show

app.py +32 -59

app.py CHANGED Viewed

@@ -9,111 +9,84 @@ from docx.oxml import OxmlElement
 import tempfile
 import os
 def format_docx(file, chapter_keywords):
     """
     Reformat an uploaded Word document around its chapter headings.

     Every body paragraph is read, classified as a chapter heading, a
     regular paragraph or an empty paragraph, removed from the document,
     and then re-added with uniform formatting. The result is saved to a
     temporary .docx file.

     Args:
         file: Upload object; only its ``name`` attribute is used, as the
             path handed to ``Document``.
         chapter_keywords: Comma-separated chapter keywords; each one is
             matched as "第" + numeral + keyword.

     Returns:
         A ``(output_path, message)`` tuple. ``output_path`` is ``None``
         when an input is missing or processing raised an exception;
         ``message`` is the user-facing status text.
     """
     if file is None:
         return None, "請上傳一個 Word 文件"
     if not chapter_keywords.strip():
         return None, "請輸入章節分段方式（例如：章,節,話）"
     try:
-        # Read the uploaded file
         doc = Document(file.name)
         # Parse the chapter keywords
         keywords = [keyword.strip() for keyword in chapter_keywords.split(',')]
         # Build the regular-expression patterns
         patterns = []
         for keyword in keywords:
-            # Match several numeral formats: Arabic, Chinese, Roman numerals, etc.
-            # e.g. 第1章, 第一章, 第二十三章, 第I章
-            pattern = rf'第\s*[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+\s*{keyword}'
             patterns.append(pattern)
-        # Combine all patterns into one alternation
         combined_pattern = '|'.join(patterns)
-        # Process the document: first collect every paragraph's content and type
         content_list = []
         for paragraph in doc.paragraphs:
             text = paragraph.text.strip()
-            # Check whether this is a chapter heading
             if text and re.search(combined_pattern, text):
                 content_list.append(('heading', text))
-            elif text:  # regular paragraph with content
                 content_list.append(('paragraph', text))
-            else:  # empty paragraph
-                # Only add an empty paragraph when the previous entry is not already empty
                 if not content_list or content_list[-1][0] != 'empty':
                     content_list.append(('empty', ''))
-        # Clear the whole document
         for paragraph in doc.paragraphs:
             p = paragraph._element
             p.getparent().remove(p)
-        # Rebuild the document content
         for content_type, text in content_list:
             if content_type == 'heading':
-                # Add a plain paragraph, then format it as a heading by hand
-                heading = doc.add_paragraph(text)
-                # Apply the heading formatting manually
-                for run in heading.runs:
-                    run.font.bold = True
-                    run.font.size = Pt(16)
-                    run.font.name = '新細明體'  # set the Chinese font
-                # Set the paragraph formatting
                 heading.paragraph_format.space_before = Cm(0)
-                heading.paragraph_format.space_after = Cm(0.3)  # a little space after the heading
                 heading.paragraph_format.line_spacing = 1.0
-                # Start a new page before the heading
-                heading.paragraph_format.page_break_before = True
-                # Reset the heading's indentation
                 heading.paragraph_format.left_indent = Cm(0)
                 heading.paragraph_format.first_line_indent = Cm(0)
-                # Try to set the outline level (so it shows up in the navigation pane)
-                try:
-                    # Set the outline level directly, without relying on a style
-                    heading.paragraph_format.outline_level = 0  # 0 = level 1
-                except:
-                    pass  # skip it if this fails
             elif content_type == 'paragraph':
-                # Add a regular paragraph
-                para = doc.add_paragraph(text)
-                # Set the style
-                para.style.paragraph_format.space_before = Cm(0)
-                para.style.paragraph_format.space_after = Cm(0)
-                para.style.paragraph_format.line_spacing = 1.0
-                # First-line indent
                 para.paragraph_format.left_indent = Cm(0)
                 para.paragraph_format.first_line_indent = Cm(0.7)
             elif content_type == 'empty':
-                # Add an empty paragraph
                 doc.add_paragraph('')
-        # Save the processed document
         output_path = tempfile.mktemp(suffix='.docx')
         doc.save(output_path)
         return output_path, f"✅ 處理完成！找到章節關鍵字：{', '.join(keywords)}"
     except Exception as e:
         return None, f"❌ 處理失敗：{str(e)}"
 def create_interface():
     """
     建立 Gradio 介面
     """

 import tempfile
 import os
 def _parse_chapter_keywords(chapter_keywords):
     """Split the raw keyword string into a list of non-empty keywords.

     Both the ASCII comma and the full-width comma "，" are accepted as
     separators, since the field is usually filled in with a Chinese IME.
     Blank entries (from "章,,節" or a trailing comma) are dropped so they
     cannot become an empty regex alternative that matches every "第N" line.
     """
     import re

     if not chapter_keywords:
         return []
     pieces = re.split(r'[,，]', chapter_keywords)
     return [piece.strip() for piece in pieces if piece.strip()]


 def _build_chapter_pattern(keywords):
     """Compile one regex matching "第 <numeral> <keyword>" for any keyword."""
     import re

     # Arabic digits, Chinese numerals (plain and financial forms) and
     # Roman numerals, e.g. 第1章 / 第一章 / 第二十三章 / 第I章.
     numerals = r'[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+'
     # Keywords are user input: escape them so "." or "(" stay literal.
     alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
     return re.compile(rf'第\s*{numerals}\s*(?:{alternatives})')


 def _apply_font(paragraph, font_name, size=None):
     """Set the font (and optionally the size) on every run of a paragraph."""
     from docx.oxml.ns import qn

     for run in paragraph.runs:
         run.font.name = font_name
         # font.name only fills the ascii/hAnsi slots; CJK glyphs are taken
         # from the eastAsia slot, so it has to be set explicitly.
         run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
         if size is not None:
             run.font.size = size


 def format_docx(file, chapter_keywords):
     """Reformat an uploaded Word document around its chapter headings.

     Every body paragraph is classified as a chapter heading, a regular
     paragraph or an empty paragraph (runs of empty paragraphs collapse to
     one). The document body is then rebuilt: headings start a new page and
     use the "Heading 1" style when the document defines it, and regular
     paragraphs get single line spacing and a first-line indent.

     Args:
         file: Upload object; only its ``name`` attribute is used, as the
             path handed to ``Document``.
         chapter_keywords: Keywords separated by "," or "，" (e.g. "章,節,話").
             A line containing "第" + numeral + keyword is a heading.

     Returns:
         A ``(output_path, message)`` tuple. ``output_path`` is the path of
         the saved temporary .docx, or ``None`` when an input is missing or
         processing failed; ``message`` is the user-facing status text.
     """
     if file is None:
         return None, "請上傳一個 Word 文件"
     keywords = _parse_chapter_keywords(chapter_keywords)
     if not keywords:
         return None, "請輸入章節分段方式（例如：章,節,話）"
     try:
         from docx import Document
         from docx.shared import Cm, Pt
         import tempfile

         doc = Document(file.name)
         chapter_re = _build_chapter_pattern(keywords)

         # First pass: record the type and text of every paragraph.
         content_list = []
         for paragraph in doc.paragraphs:
             text = paragraph.text.strip()
             if text and chapter_re.search(text):
                 content_list.append(('heading', text))
             elif text:
                 content_list.append(('paragraph', text))
             elif not content_list or content_list[-1][0] != 'empty':
                 # Keep at most one empty paragraph in a row.
                 content_list.append(('empty', ''))

         # Remove the original paragraphs before rebuilding the body.
         for paragraph in doc.paragraphs:
             p = paragraph._element
             p.getparent().remove(p)

         # Asking for a style the document does not define raises KeyError,
         # so fall back to a manually bolded paragraph in that case.
         heading_style = 'Heading 1' if 'Heading 1' in doc.styles else None

         for content_type, text in content_list:
             if content_type == 'heading':
                 heading = doc.add_paragraph(text, style=heading_style)
                 heading.paragraph_format.page_break_before = True
                 heading.paragraph_format.space_before = Cm(0)
                 heading.paragraph_format.space_after = Cm(0.3)
                 heading.paragraph_format.line_spacing = 1.0
                 heading.paragraph_format.left_indent = Cm(0)
                 heading.paragraph_format.first_line_indent = Cm(0)
                 _apply_font(heading, '新細明體', Pt(16))
                 if heading_style is None:
                     for run in heading.runs:
                         run.font.bold = True
             elif content_type == 'paragraph':
                 # Fold soft line breaks inside one paragraph into spaces.
                 lines = [line.strip() for line in text.splitlines() if line.strip()]
                 clean_text = ' '.join(lines)
                 para = doc.add_paragraph(clean_text)
                 para.paragraph_format.space_before = Cm(0)
                 para.paragraph_format.space_after = Cm(0)
                 para.paragraph_format.line_spacing = 1.0
                 para.paragraph_format.left_indent = Cm(0)
                 para.paragraph_format.first_line_indent = Cm(0.7)
                 _apply_font(para, '新細明體')
             elif content_type == 'empty':
                 doc.add_paragraph('')

         # mktemp() only returns a name and is race-prone; create the file
         # atomically and keep it (delete=False) so the caller can serve it.
         with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
             output_path = tmp.name
         doc.save(output_path)
         return output_path, f"✅ 處理完成！找到章節關鍵字：{', '.join(keywords)}"
     except Exception as e:
         # UI boundary: report the failure to the user instead of crashing.
         return None, f"❌ 處理失敗：{str(e)}"
 def create_interface():
     """
     建立 Gradio 介面
     """