Spaces:

dseditor
/

Docfixer

Sleeping

App Files Files Community

dseditor committed on Jul 18

Commit

82e99be

verified ·

1 Parent(s): ca276f2

Upload app.py

Browse files

Files changed (1) hide show

app.py +29 -52

app.py CHANGED Viewed

@@ -16,18 +16,14 @@ def set_outline_level(paragraph, level: int = 0):
     pPr.append(outline)
 def normalize_paragraph(text):
     text = re.sub(r'[\r\n]+', ' ', text)
-    # 壓縮多餘空白
     text = re.sub(r'\s{2,}', ' ', text)
     return text.strip()
 def format_docx(file, chapter_keywords):
-    """
-    處理上傳的 Word 文件
-    """
     if file is None:
         return None, "請上傳一個 Word 文件"
     if not chapter_keywords.strip():
         return None, "請輸入章節分段方式（例如：章,節,話）"
@@ -42,33 +38,34 @@ def format_docx(file, chapter_keywords):
             heading_style.font.bold = True
             heading_style.font.size = Pt(16)
-        # 解析章節關鍵字
-        keywords = [keyword.strip() for keyword in chapter_keywords.split(',')]
-        # 建立正規表示式模式
-        patterns = []
-        for keyword in keywords:
-            pattern = f'第\s*[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+\s*{keyword}'
-            patterns.append(pattern)
         combined_pattern = '|'.join(patterns)
         content_list = []
-        for paragraph in doc.paragraphs:
-            text = paragraph.text.strip()
-            if text and re.search(combined_pattern, text):
                 content_list.append(('heading', text))
-            elif text:
-                content_list.append(('paragraph', text))
             else:
-                if not content_list or content_list[-1][0] != 'empty':
-                    content_list.append(('empty', ''))
-        for paragraph in doc.paragraphs:
-            p = paragraph._element
-            p.getparent().remove(p)
-        for content_type, text in content_list:
-            if content_type == 'heading':
                 heading = doc.add_paragraph(text, style='Heading 1')
                 heading.paragraph_format.page_break_before = True
                 heading.paragraph_format.space_before = Cm(0)
@@ -80,7 +77,7 @@ def format_docx(file, chapter_keywords):
                 for run in heading.runs:
                     run.font.name = '新細明體'
                     run.font.size = Pt(16)
-            elif content_type == 'paragraph':
                 clean_text = normalize_paragraph(text)
                 para = doc.add_paragraph(clean_text)
                 para.paragraph_format.space_before = Cm(0)
@@ -90,7 +87,7 @@ def format_docx(file, chapter_keywords):
                 para.paragraph_format.first_line_indent = Cm(0.7)
                 for run in para.runs:
                     run.font.name = '新細明體'
-            elif content_type == 'empty':
                 doc.add_paragraph('')
         output_path = tempfile.mktemp(suffix='.docx')
@@ -110,34 +107,14 @@ def create_interface():
         with gr.Row():
             with gr.Column(scale=1):
-                file_input = gr.File(
-                    label="上傳 Word 文件 (.docx)",
-                    file_types=[".docx"],
-                    file_count="single"
-                )
-                chapter_input = gr.Textbox(
-                    label="章節分段方式",
-                    placeholder="章,節,話",
-                    info="請輸入章節關鍵字，用逗號分隔（例如：章,節,話）",
-                    value="章,節,話"
-                )
                 process_btn = gr.Button("🔄 開始處理", variant="primary", size="lg")
             with gr.Column(scale=1):
-                status_output = gr.Textbox(
-                    label="處理狀態",
-                    interactive=False,
-                    lines=3
-                )
-                download_output = gr.File(
-                    label="下載處理後的文件",
-                    interactive=False
-                )
-        process_btn.click(
-            fn=format_docx,
-            inputs=[file_input, chapter_input],
-            outputs=[download_output, status_output]
-        )
     return demo

     pPr.append(outline)
 def normalize_paragraph(text):
+    # 處理段落內換行符號與多餘空格
     text = re.sub(r'[\r\n]+', ' ', text)
     text = re.sub(r'\s{2,}', ' ', text)
     return text.strip()
 def format_docx(file, chapter_keywords):
     if file is None:
         return None, "請上傳一個 Word 文件"
     if not chapter_keywords.strip():
         return None, "請輸入章節分段方式（例如：章,節,話）"
             heading_style.font.bold = True
             heading_style.font.size = Pt(16)
+        # 章節關鍵字模式
+        keywords = [k.strip() for k in chapter_keywords.split(',')]
+        patterns = [f'第\s*[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+\s*{k}' for k in keywords]
         combined_pattern = '|'.join(patterns)
+        # 模擬 Word 的 ^p^p -> ^p 處理段落
         content_list = []
+        prev_empty = False
+        for para in doc.paragraphs:
+            text = para.text.strip()
+            if not text:
+                if not prev_empty:
+                    content_list.append(('empty', ''))
+                prev_empty = True
+            elif re.search(combined_pattern, text):
                 content_list.append(('heading', text))
+                prev_empty = False
             else:
+                content_list.append(('paragraph', text))
+                prev_empty = False
+        # 清空原始內容
+        for p in doc.paragraphs:
+            p._element.getparent().remove(p._element)
+        # 重建段落
+        for kind, text in content_list:
+            if kind == 'heading':
                 heading = doc.add_paragraph(text, style='Heading 1')
                 heading.paragraph_format.page_break_before = True
                 heading.paragraph_format.space_before = Cm(0)
                 for run in heading.runs:
                     run.font.name = '新細明體'
                     run.font.size = Pt(16)
+            elif kind == 'paragraph':
                 clean_text = normalize_paragraph(text)
                 para = doc.add_paragraph(clean_text)
                 para.paragraph_format.space_before = Cm(0)
                 para.paragraph_format.first_line_indent = Cm(0.7)
                 for run in para.runs:
                     run.font.name = '新細明體'
+            elif kind == 'empty':
                 doc.add_paragraph('')
         output_path = tempfile.mktemp(suffix='.docx')
         with gr.Row():
             with gr.Column(scale=1):
+                file_input = gr.File(label="上傳 Word 文件 (.docx)", file_types=[".docx"], file_count="single")
+                chapter_input = gr.Textbox(label="章節分段方式", placeholder="章,節,話", value="章,節,話")
                 process_btn = gr.Button("🔄 開始處理", variant="primary", size="lg")
             with gr.Column(scale=1):
+                status_output = gr.Textbox(label="處理狀態", interactive=False, lines=3)
+                download_output = gr.File(label="下載處理後的文件", interactive=False)
+        process_btn.click(fn=format_docx, inputs=[file_input, chapter_input], outputs=[download_output, status_output])
     return demo