XinyueZhou committed on
Commit
6e73024
·
verified ·
1 Parent(s): 3919ce3

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +366 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import json
4
+ import os
5
+ import re
6
+ import tempfile
7
+ import zipfile
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+
11
+ import gradio as gr
12
+ import requests
13
+ from PIL import Image
14
+ import pdf2image
15
+
16
# Layout-parsing endpoint. The API_URL environment variable overrides the
# built-in default, mirroring how the token is configured below, so the app can
# be pointed at another deployment without editing the source.
API_URL = (
    os.getenv("API_URL")
    or "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"
)

# SECURITY(review): the fallback value is a credential committed to source
# control. It is kept only so deployments that do not set API_TOKEN keep
# working; rotate it and supply API_TOKEN through the environment instead.
TOKEN = os.getenv("API_TOKEN", "c9e4aaf9634724e215690ba66a66dbdbdf3222a2")

# Stylesheet handed to gr.Blocks(css=...).
# NOTE(review): --text-color has the same value as --sand-color, so the `body`
# rule sets the text color equal to the body background color. Confirm this is
# intended before relying on it outside the white .gradio-container.
CSS = """
:root {
    --sand-color: #FAF9F6;
    --white: #ffffff;
    --shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    --text-color: #FAF9F6;
    --black:#000000;
}

body {
    display: flex;
    justify-content: center;
    background-color: var(--sand-color);
    color: var(--text-color);
}

.gradio-container {
    max-width: 1200px;
    width: 100%;
    margin: 20px auto;
    padding: 20px;
    background-color: var(--white);
    border-radius: 8px;
    box-shadow: var(--shadow);
}


#component-0,
#tabs,
#settings {
    background-color: var(--white) !important;
    border-radius: 8px;
    padding: 15px;
}

.upload-section {
    width: 100%;
    max-width: 600px;
    margin: 0 auto 30px;
    padding: 20px;
    background-color: var(--sand-color) !important;
    border-radius: 8px;
    box-shadow: var(--shadow);
}

.center-content {
    display: flex;
    flex-direction: column;
    align-items: center;
    text-align: center;
    margin-bottom: 20px;
}

.header {
    margin-bottom: 30px;
}

.result-container,
.pdf-preview,
.markdown-result,
.download-section {
    background-color: var(--white);
    border-radius: 8px;
    box-shadow: var(--shadow);
    padding: 20px;
}

.result-container {
    display: flex;
    gap: 20px;
    margin-bottom: 30px;
}

.pdf-preview, .markdown-result {
    flex: 1;
}

button {
    background-color: var(--text-color) !important;
    color: var(--black) !important;
    border: none !important;
    border-radius: 4px;
    padding: 8px 16px;
}

button:hover {
    opacity: 0.8 !important;
}

.radio-group {
    margin-bottom: 15px !important;
}

.file-download {
    margin-top: 15px !important;
}
.loader {
    border: 5px solid #f3f3f3;
    border-top: 5px solid #3498db;
    border-radius: 50%;
    width: 50px;
    height: 50px;
    animation: spin 1s linear infinite;
    margin: 20px auto;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

.loader-container {
    text-align: center;
    margin: 20px 0;
}
"""
135
+
136
+
137
def clean_markdown_text(text):
    """Return *text* with HTML markup removed and blank-line runs collapsed.

    Args:
        text: Markdown that may contain inline HTML, or a falsy value.

    Returns:
        The cleaned string with leading/trailing whitespace stripped, or ""
        when *text* is empty or None.
    """
    if not text:
        return ""
    # Strip only things shaped like markup: HTML comments, or "<" followed by
    # an optional "/", "!" or "?" and then a tag name. A bare "<[^>]+>" also
    # swallows ordinary prose such as "1 < 2 and 3 > 2", deleting content.
    text = re.sub(
        r'<!--.*?-->|<[!/?]?[A-Za-z][^<>]*>', '', text, flags=re.DOTALL
    )
    # Collapse three or more consecutive newlines into one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
143
+
144
+
145
def pdf_to_images(pdf_path):
    """Convert a PDF into a list of page images for the preview gallery.

    Args:
        pdf_path: Path of the PDF handed to pdf2image.convert_from_path.

    Returns:
        A list of the images produced by pdf2image, or None when the
        conversion fails for any reason.
    """
    try:
        return list(pdf2image.convert_from_path(pdf_path))
    except Exception:
        # Best-effort preview: a failed conversion yields None instead of an
        # error. Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
151
+
152
+
153
def process_file(file_path, file_type):
    """Send a document to the layout-parsing API and collect the results.

    Args:
        file_path: Path of the uploaded file on disk.
        file_type: "pdf" for PDF input (sent as fileType 0); any other value
            is sent as fileType 1 and the file is opened as an image.

    Returns:
        A dict with the keys "original_file", "markdown_contents",
        "clean_markdown_contents", "pdf_images" and "api_response".

    Raises:
        gr.Error: If no file was supplied, or if reading the file, calling the
            API, or building the preview fails.
    """
    if not file_path:
        # Without this guard open(None) surfaces as an opaque TypeError text.
        raise gr.Error("Please upload a file before running the analysis")

    try:
        with open(file_path, "rb") as f:
            file_data = base64.b64encode(f.read()).decode("ascii")

        response = requests.post(
            API_URL,
            json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
            headers={
                "Authorization": f"token {TOKEN}",
                "Content-Type": "application/json",
            },
            timeout=60,
        )
        response.raise_for_status()

        result = response.json()
        # `or` fallbacks also cover keys that are present but null.
        layout_results = (
            (result.get("result") or {}).get("layoutParsingResults") or []
        )

        markdown_contents = []
        for res in layout_results:
            markdown = res.get("markdown", {})
            if isinstance(markdown, str):
                original = markdown
            elif isinstance(markdown, dict):
                original = markdown.get("text") or ""
            else:
                # Previously `original` stayed unbound (first page) or kept
                # the previous page's text when the field had another type.
                original = ""
            markdown_contents.append(original)
        clean_markdown_contents = [
            clean_markdown_text(md) for md in markdown_contents
        ]

        if file_type == "pdf":
            images = pdf_to_images(file_path)
        else:
            # Copy the pixel data so the underlying file handle is closed
            # instead of staying open for the lifetime of the image object.
            with Image.open(file_path) as img:
                images = [img.copy()]

        return {
            "original_file": file_path,
            "markdown_contents": markdown_contents,
            "clean_markdown_contents": clean_markdown_contents,
            "pdf_images": images,
            "api_response": result,
        }

    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e
202
+
203
+
204
def create_zip_file(results):
    """Bundle the analysis results into a ZIP archive on disk.

    The archive contains the original upload (when it still exists) under
    "original/", each non-empty page of markdown under "markdown/original/",
    and the raw API response as "api_response.json".

    Args:
        results: The dict returned by process_file, or None when nothing has
            been analyzed yet.

    Returns:
        Path of the created ZIP file inside a fresh temporary directory. The
        directory is not removed here because the path is returned to the
        caller for download.

    Raises:
        gr.Error: If there are no results or the archive cannot be written.
    """
    if results is None:
        # The state is empty until a document has been analyzed; fail with a
        # readable message instead of an AttributeError on None.
        raise gr.Error("No analysis results available. Please analyze a document first.")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_path = os.path.join(
            tempfile.mkdtemp(), f"analysis_results_{timestamp}.zip"
        )

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            original_path = results.get("original_file", "")
            if original_path and Path(original_path).exists():
                zipf.write(original_path, f"original/{Path(original_path).name}")

            # File names are numbered by page position, so skipped (empty)
            # pages leave gaps rather than renumbering later pages.
            pages = results.get("markdown_contents") or []
            for page_no, md_content in enumerate(pages, start=1):
                if md_content:
                    zipf.writestr(
                        f"markdown/original/markdown_{page_no}.md", md_content
                    )

            api_response = results.get("api_response", {})
            zipf.writestr(
                "api_response.json",
                json.dumps(api_response, indent=2, ensure_ascii=False),
            )

        return zip_path

    except Exception as e:
        raise gr.Error(f"Error creating ZIP file: {str(e)}") from e
229
+
230
+
231
def export_markdown(results):
    """Write all pages of the original markdown to a single .md file.

    Args:
        results: The dict returned by process_file, or None when nothing has
            been analyzed yet.

    Returns:
        Path of the written file inside a fresh temporary directory. Pages are
        joined with a blank line between them.

    Raises:
        gr.Error: If there is no markdown to export or the file cannot be
            written.
    """
    markdowns = (results.get("markdown_contents") or []) if results else []
    if not markdowns:
        # Raised outside the try block so this message reaches the user
        # verbatim instead of being re-wrapped by the generic handler below.
        raise gr.Error("No markdown content to export")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_path = os.path.join(
            tempfile.mkdtemp(), f"original_markdown_{timestamp}.md"
        )

        # Treat a missing page as empty text so one bad page cannot break the join.
        content = "\n\n".join(md or "" for md in markdowns)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return file_path

    except Exception as e:
        raise gr.Error(f"Error exporting markdown: {str(e)}") from e
251
+
252
+
253
def _preview_images(res):
    """Return the preview images stored in *res*, or [] when there are none."""
    return res["pdf_images"] if res and res.get("pdf_images") else []


def _first_page_text(mode, res):
    """Return the first page's text for the selected display mode.

    Args:
        mode: The value of the "Display Mode" radio.
        res: The dict returned by process_file, or None.

    Returns:
        The first entry of "markdown_contents" when *mode* is
        "Original Markdown", otherwise the first entry of
        "clean_markdown_contents"; "" when there is nothing to show.
    """
    if not res:
        return ""
    # The comparison must use the same label as the radio choice; a label in
    # another language never matches and the toggle would always show the
    # cleaned text.
    if mode == "Original Markdown":
        pages = res.get("markdown_contents") or []
    else:
        pages = res.get("clean_markdown_contents") or []
    # NOTE(review): only the first page is displayed, while export_markdown
    # joins every page - confirm this is intended for multi-page PDFs.
    return pages[0] if pages else ""


# NOTE(review): the nesting of components below is reconstructed from the
# statement order; confirm the layout against the running app.
with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
    # Holds the dict returned by process_file between events.
    results_state = gr.State()

    with gr.Column(elem_classes=["center-content", "header"]):
        gr.Markdown("# Document Parsing System")
        gr.Markdown("### Upload PDF or image files for analysis")

    with gr.Column(elem_classes=["center-content", "upload-section"]):
        file_type = gr.Radio(
            ["pdf", "image"],
            label="File type",
            value="pdf",
            interactive=True
        )
        file_input = gr.File(
            label="Upload document",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            type="filepath"
        )
        process_btn = gr.Button("Analyze document", variant="primary")

    # Loading spinner container
    # NOTE(review): placed after the upload column; it may belong inside it.
    loading_spinner = gr.Column(
        visible=False,
        elem_classes=["loader-container"]
    )
    with loading_spinner:
        gr.HTML("""
        <div class="loader"></div>
        <p>Wait...</p>
        """)

    with gr.Row(elem_classes=["result-container"]):
        with gr.Column(elem_classes=["pdf-preview"]):
            gr.Markdown("### Original document preview")
            pdf_display = gr.Gallery(label="PDF page", show_label=False)

        with gr.Column(elem_classes=["markdown-result"]):
            with gr.Row(elem_classes=["radio-group"]):
                display_mode = gr.Radio(
                    ["Original Markdown", "Cleaned Text"],
                    label="Display Mode",
                    value="Original Markdown",
                    interactive=True
                )
            markdown_display = gr.HTML(label="Analysis Results")

    with gr.Column(elem_classes=["download-section"]):
        gr.Markdown("### Result Export")
        with gr.Row():
            download_md_btn = gr.Button("Download Original Markdown", variant="secondary")
            download_all_btn = gr.Button("Download Complete Analysis Results (ZIP)", variant="primary")
        # Hidden until one of the download handlers has produced a file.
        # NOTE(review): placed in the column after the button row; it may belong inside the row.
        download_file = gr.File(visible=False, label="Download file", elem_classes=["file-download"])

    def toggle_spinner():
        """Show the loading spinner (despite the name, it never hides it)."""
        return gr.update(visible=True)

    def hide_spinner():
        """Hide the loading spinner."""
        return gr.update(visible=False)

    # Show spinner -> analyze -> hide spinner -> refresh gallery and text.
    # NOTE(review): hide_spinner is chained with .then() so it also runs when
    # process_file fails; the .success() steps that follow are attached after
    # hide_spinner, so they appear to depend on its outcome rather than on
    # process_file - confirm whether stale results may be re-rendered.
    process_btn.click(
        toggle_spinner,
        outputs=[loading_spinner]
    ).then(
        process_file,
        inputs=[file_input, file_type],
        outputs=[results_state]
    ).then(
        hide_spinner,
        outputs=[loading_spinner]
    ).success(
        _preview_images,
        inputs=[results_state],
        outputs=[pdf_display]
    ).success(
        # Honor the currently selected display mode instead of always
        # rendering the original markdown.
        _first_page_text,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    display_mode.change(
        _first_page_text,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    download_md_btn.click(
        export_markdown,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )

    download_all_btn.click(
        create_zip_file,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=3.0
2
+ requests
3
+ pillow
4
+ pdf2image
5
+ python-dotenv