dseditor committed on
Commit
82e99be
·
verified ·
1 Parent(s): ca276f2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -52
app.py CHANGED
@@ -16,18 +16,14 @@ def set_outline_level(paragraph, level: int = 0):
16
  pPr.append(outline)
17
 
18
  def normalize_paragraph(text):
 
19
  text = re.sub(r'[\r\n]+', ' ', text)
20
- # 壓縮多餘空白
21
  text = re.sub(r'\s{2,}', ' ', text)
22
  return text.strip()
23
 
24
  def format_docx(file, chapter_keywords):
25
- """
26
- 處理上傳的 Word 文件
27
- """
28
  if file is None:
29
  return None, "請上傳一個 Word 文件"
30
-
31
  if not chapter_keywords.strip():
32
  return None, "請輸入章節分段方式(例如:章,節,話)"
33
 
@@ -42,33 +38,34 @@ def format_docx(file, chapter_keywords):
42
  heading_style.font.bold = True
43
  heading_style.font.size = Pt(16)
44
 
45
- # 解析章節關鍵字
46
- keywords = [keyword.strip() for keyword in chapter_keywords.split(',')]
47
-
48
- # 建立正規表示式模式
49
- patterns = []
50
- for keyword in keywords:
51
- pattern = f'第\s*[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+\s*{keyword}'
52
- patterns.append(pattern)
53
  combined_pattern = '|'.join(patterns)
54
 
 
55
  content_list = []
56
- for paragraph in doc.paragraphs:
57
- text = paragraph.text.strip()
58
- if text and re.search(combined_pattern, text):
 
 
 
 
 
59
  content_list.append(('heading', text))
60
- elif text:
61
- content_list.append(('paragraph', text))
62
  else:
63
- if not content_list or content_list[-1][0] != 'empty':
64
- content_list.append(('empty', ''))
65
 
66
- for paragraph in doc.paragraphs:
67
- p = paragraph._element
68
- p.getparent().remove(p)
69
 
70
- for content_type, text in content_list:
71
- if content_type == 'heading':
 
72
  heading = doc.add_paragraph(text, style='Heading 1')
73
  heading.paragraph_format.page_break_before = True
74
  heading.paragraph_format.space_before = Cm(0)
@@ -80,7 +77,7 @@ def format_docx(file, chapter_keywords):
80
  for run in heading.runs:
81
  run.font.name = '新細明體'
82
  run.font.size = Pt(16)
83
- elif content_type == 'paragraph':
84
  clean_text = normalize_paragraph(text)
85
  para = doc.add_paragraph(clean_text)
86
  para.paragraph_format.space_before = Cm(0)
@@ -90,7 +87,7 @@ def format_docx(file, chapter_keywords):
90
  para.paragraph_format.first_line_indent = Cm(0.7)
91
  for run in para.runs:
92
  run.font.name = '新細明體'
93
- elif content_type == 'empty':
94
  doc.add_paragraph('')
95
 
96
  output_path = tempfile.mktemp(suffix='.docx')
@@ -110,34 +107,14 @@ def create_interface():
110
 
111
  with gr.Row():
112
  with gr.Column(scale=1):
113
- file_input = gr.File(
114
- label="上傳 Word 文件 (.docx)",
115
- file_types=[".docx"],
116
- file_count="single"
117
- )
118
- chapter_input = gr.Textbox(
119
- label="章節分段方式",
120
- placeholder="章,節,話",
121
- info="請輸入章節關鍵字,用逗號分隔(例如:章,節,話)",
122
- value="章,節,話"
123
- )
124
  process_btn = gr.Button("🔄 開始處理", variant="primary", size="lg")
125
  with gr.Column(scale=1):
126
- status_output = gr.Textbox(
127
- label="處理狀態",
128
- interactive=False,
129
- lines=3
130
- )
131
- download_output = gr.File(
132
- label="下載處理後的文件",
133
- interactive=False
134
- )
135
 
136
- process_btn.click(
137
- fn=format_docx,
138
- inputs=[file_input, chapter_input],
139
- outputs=[download_output, status_output]
140
- )
141
 
142
  return demo
143
 
 
16
  pPr.append(outline)
17
 
18
  def normalize_paragraph(text):
19
+ # 處理段落內換行符號與多餘空格
20
  text = re.sub(r'[\r\n]+', ' ', text)
 
21
  text = re.sub(r'\s{2,}', ' ', text)
22
  return text.strip()
23
 
24
  def format_docx(file, chapter_keywords):
 
 
 
25
  if file is None:
26
  return None, "請上傳一個 Word 文件"
 
27
  if not chapter_keywords.strip():
28
  return None, "請輸入章節分段方式(例如:章,節,話)"
29
 
 
38
  heading_style.font.bold = True
39
  heading_style.font.size = Pt(16)
40
 
41
+ # 章節關鍵字模式
42
+ keywords = [k.strip() for k in chapter_keywords.split(',')]
43
+ patterns = [f'第\s*[0-9一二三四五六七八九十百千萬壹貳參肆伍陸柒捌玖拾佰仟萬IVXLCDMivxlcdm]+\s*{k}' for k in keywords]
 
 
 
 
 
44
  combined_pattern = '|'.join(patterns)
45
 
46
+ # 模擬 Word 的 ^p^p -> ^p 處理段落
47
  content_list = []
48
+ prev_empty = False
49
+ for para in doc.paragraphs:
50
+ text = para.text.strip()
51
+ if not text:
52
+ if not prev_empty:
53
+ content_list.append(('empty', ''))
54
+ prev_empty = True
55
+ elif re.search(combined_pattern, text):
56
  content_list.append(('heading', text))
57
+ prev_empty = False
 
58
  else:
59
+ content_list.append(('paragraph', text))
60
+ prev_empty = False
61
 
62
+ # 清空原始內容
63
+ for p in doc.paragraphs:
64
+ p._element.getparent().remove(p._element)
65
 
66
+ # 重建段落
67
+ for kind, text in content_list:
68
+ if kind == 'heading':
69
  heading = doc.add_paragraph(text, style='Heading 1')
70
  heading.paragraph_format.page_break_before = True
71
  heading.paragraph_format.space_before = Cm(0)
 
77
  for run in heading.runs:
78
  run.font.name = '新細明體'
79
  run.font.size = Pt(16)
80
+ elif kind == 'paragraph':
81
  clean_text = normalize_paragraph(text)
82
  para = doc.add_paragraph(clean_text)
83
  para.paragraph_format.space_before = Cm(0)
 
87
  para.paragraph_format.first_line_indent = Cm(0.7)
88
  for run in para.runs:
89
  run.font.name = '新細明體'
90
+ elif kind == 'empty':
91
  doc.add_paragraph('')
92
 
93
  output_path = tempfile.mktemp(suffix='.docx')
 
107
 
108
  with gr.Row():
109
  with gr.Column(scale=1):
110
+ file_input = gr.File(label="上傳 Word 文件 (.docx)", file_types=[".docx"], file_count="single")
111
+ chapter_input = gr.Textbox(label="章節分段方式", placeholder="章,節,話", value="章,節,話")
 
 
 
 
 
 
 
 
 
112
  process_btn = gr.Button("🔄 開始處理", variant="primary", size="lg")
113
  with gr.Column(scale=1):
114
+ status_output = gr.Textbox(label="處理狀態", interactive=False, lines=3)
115
+ download_output = gr.File(label="下載處理後的文件", interactive=False)
 
 
 
 
 
 
 
116
 
117
+ process_btn.click(fn=format_docx, inputs=[file_input, chapter_input], outputs=[download_output, status_output])
 
 
 
 
118
 
119
  return demo
120