XinyueZhou committed on
Commit
14c5401
·
verified ·
1 Parent(s): 4898b9f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -51
app.py CHANGED
@@ -5,17 +5,55 @@ import os
5
  import re
6
  import tempfile
7
  import zipfile
 
 
8
  from datetime import datetime
9
  from pathlib import Path
10
 
11
  import gradio as gr
12
  import requests
13
  from PIL import Image
14
- import pdf2image
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"
17
  TOKEN = os.getenv("API_TOKEN")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  CSS = """
21
  :root {
@@ -178,29 +216,106 @@ button:hover {
178
  text-align: center;
179
  margin: 20px 0;
180
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  """
182
 
183
  def clean_markdown_text(text):
 
184
  if not text:
185
  return ""
186
  text = re.sub(r'<[^>]+>', '', text)
187
  text = re.sub(r'\n{3,}', '\n\n', text)
188
  return text.strip()
189
 
190
-
191
- def pdf_to_images(pdf_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  try:
193
- images = pdf2image.convert_from_path(pdf_path)
194
- return [image for image in images]
195
- except:
196
- return None
197
-
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  def process_file(file_path, file_type):
 
200
  try:
 
 
 
 
 
 
 
 
 
 
201
  with open(file_path, "rb") as f:
202
  file_bytes = f.read()
203
 
 
204
  file_data = base64.b64encode(file_bytes).decode("ascii")
205
  headers = {
206
  "Authorization": f"token {TOKEN}",
@@ -215,6 +330,7 @@ def process_file(file_path, file_type):
215
  )
216
  response.raise_for_status()
217
 
 
218
  result = response.json()
219
  layout_results = result.get("result", {}).get("layoutParsingResults", [])
220
 
@@ -222,69 +338,94 @@ def process_file(file_path, file_type):
222
  clean_markdown_contents = []
223
  for res in layout_results:
224
  markdown = res.get("markdown", {})
225
- if isinstance(markdown, str):
226
- original = markdown
227
- elif isinstance(markdown, dict):
228
- original = markdown.get("text", "")
229
-
230
  markdown_contents.append(original)
231
  clean_markdown_contents.append(clean_markdown_text(original))
232
 
 
233
  if file_type == "pdf":
234
  images = pdf_to_images(file_path)
 
235
  else:
236
  images = [Image.open(file_path)]
 
237
 
238
  return {
239
  "original_file": file_path,
 
240
  "markdown_contents": markdown_contents,
241
  "clean_markdown_contents": clean_markdown_contents,
242
  "pdf_images": images,
 
243
  "api_response": result
244
  }
245
 
 
 
246
  except Exception as e:
247
  raise gr.Error(f"Error processing file: {str(e)}")
248
 
249
-
250
  def create_zip_file(results):
 
251
  try:
 
 
 
252
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
253
  zip_filename = f"analysis_results_{timestamp}.zip"
254
 
255
  temp_dir = tempfile.mkdtemp()
 
256
  zip_path = os.path.join(temp_dir, zip_filename)
257
 
258
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
 
259
  original_path = results.get("original_file", "")
260
  if original_path and Path(original_path).exists():
261
  zipf.write(original_path, f"original/{Path(original_path).name}")
262
 
263
- markdowns = results.get("markdown_contents", [])
264
- for i, md_content in enumerate(markdowns):
265
- if md_content:
266
- zipf.writestr(f"markdown/original/markdown_{i + 1}.md", md_content)
267
-
 
 
 
 
 
 
268
  api_response = results.get("api_response", {})
269
  zipf.writestr("api_response.json", json.dumps(api_response, indent=2, ensure_ascii=False))
270
 
 
 
 
 
 
 
 
271
  return zip_path
272
 
273
  except Exception as e:
274
  raise gr.Error(f"Error creating ZIP file: {str(e)}")
275
 
276
-
277
  def export_markdown(results):
 
278
  try:
 
 
 
279
  markdowns = results.get("markdown_contents", [])
280
  if not markdowns:
281
  raise gr.Error("No markdown content to export")
282
 
283
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
284
- filename = f"original_markdown_{timestamp}.md"
285
  content = "\n\n".join(markdowns)
286
 
287
  temp_dir = tempfile.mkdtemp()
 
288
  file_path = os.path.join(temp_dir, filename)
289
 
290
  with open(file_path, 'w', encoding='utf-8') as f:
@@ -298,11 +439,11 @@ def export_markdown(results):
298
  with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
299
  results_state = gr.State()
300
 
301
- # 添加logo图片
302
  with gr.Column(elem_classes=["logo-container"]):
303
- gr.Image("pp-structurev3.png", elem_classes=["logo-img"], show_label=False)
304
 
305
- # 添加导航栏链接
306
  with gr.Row(elem_classes=["nav-bar"]):
307
  gr.HTML("""
308
  <div class="nav-links">
@@ -311,19 +452,20 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
311
  </div>
312
  """)
313
 
 
314
  with gr.Column(elem_classes=["upload-section"]):
315
  file_type = gr.Radio(
316
  ["pdf", "image"],
317
- label="File type",
318
  value="pdf",
319
  interactive=True
320
  )
321
  file_input = gr.File(
322
- label="Upload document",
323
  file_types=[".pdf", ".jpg", ".jpeg", ".png"],
324
  type="filepath"
325
  )
326
- process_btn = gr.Button("Analyze document", variant="primary")
327
 
328
  loading_spinner = gr.Column(
329
  visible=False,
@@ -332,16 +474,21 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
332
  with loading_spinner:
333
  gr.HTML("""
334
  <div class="loader"></div>
335
- <p>Wait...</p>
336
  """)
337
 
 
338
  with gr.Row(elem_classes=["result-container"]):
339
  with gr.Column(elem_classes=["pdf-preview"]):
340
- gr.Markdown("### Original document preview")
341
- pdf_display = gr.Gallery(
342
- label="PDF page",
 
343
  show_label=False,
344
- elem_classes=["gallery-container"]
 
 
 
345
  )
346
 
347
  with gr.Column(elem_classes=["markdown-result"]):
@@ -352,21 +499,49 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
352
  value="Original Markdown",
353
  interactive=True
354
  )
355
- markdown_display = gr.HTML(label="Analysis Results")
 
 
356
  with gr.Column(elem_classes=["download-section"]):
357
  gr.Markdown("### Result Export")
358
  with gr.Row():
359
- download_md_btn = gr.Button("Download Original Markdown", variant="secondary")
360
- download_all_btn = gr.Button("Download Complete Analysis Results (ZIP)", variant="primary")
361
- download_file = gr.File(visible=False, label="Download file", elem_classes=["file-download"])
362
 
 
363
  def toggle_spinner():
364
  return gr.update(visible=True)
365
 
366
-
367
  def hide_spinner():
368
  return gr.update(visible=False)
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
  process_btn.click(
372
  toggle_spinner,
@@ -378,21 +553,17 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
378
  ).then(
379
  hide_spinner,
380
  outputs=[loading_spinner]
381
- ).success(
382
- lambda res: res["pdf_images"] if res and res.get("pdf_images") else [],
383
- inputs=[results_state],
384
- outputs=[pdf_display]
385
- ).success(
386
- lambda res: res["markdown_contents"][0] if res and res.get("markdown_contents") else "",
387
  inputs=[results_state],
388
- outputs=[markdown_display]
389
  )
390
 
391
  display_mode.change(
392
  lambda mode, res: (
393
- res["markdown_contents"][0] if mode == "原始Markdown"
394
  else res["clean_markdown_contents"][0]
395
- ) if res else "",
396
  inputs=[display_mode, results_state],
397
  outputs=[markdown_display]
398
  )
@@ -402,8 +573,7 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
402
  inputs=[results_state],
403
  outputs=[download_file]
404
  ).then(
405
- lambda x: gr.update(visible=True),
406
- inputs=[download_file],
407
  outputs=[download_file]
408
  )
409
 
@@ -412,10 +582,20 @@ with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
412
  inputs=[results_state],
413
  outputs=[download_file]
414
  ).then(
415
- lambda x: gr.update(visible=True),
416
- inputs=[download_file],
417
  outputs=[download_file]
418
  )
419
 
420
  if __name__ == "__main__":
421
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
5
  import re
6
  import tempfile
7
  import zipfile
8
+ import shutil
9
+ import atexit
10
  from datetime import datetime
11
  from pathlib import Path
12
 
13
  import gradio as gr
14
  import requests
15
  from PIL import Image
 
16
 
17
+ try:
18
+ import pdf2image
19
+ PDF2IMAGE_AVAILABLE = True
20
+ except ImportError:
21
+ PDF2IMAGE_AVAILABLE = False
22
+
23
+ try:
24
+ import fitz # PyMuPDF
25
+ PYGMUPDF_AVAILABLE = True
26
+ except ImportError:
27
+ PYGMUPDF_AVAILABLE = False
28
+
29
+ # API Configuration
30
  API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"
31
  TOKEN = os.getenv("API_TOKEN")
32
 
33
# Temporary directory management: every directory created for an export is
# recorded here so it can be removed when the interpreter exits.
temp_dirs = []


def cleanup():
    """Remove every registered temporary directory (best effort).

    Each path is popped from ``temp_dirs`` before removal, so calling this
    function more than once never retries a directory that is already gone.
    Filesystem errors (missing path, permission problems) are ignored, but
    interpreter-level signals such as ``KeyboardInterrupt`` are no longer
    swallowed the way a bare ``except`` would.
    """
    while temp_dirs:
        dir_path = temp_dirs.pop()
        shutil.rmtree(dir_path, ignore_errors=True)


atexit.register(cleanup)
45
+
46
def image_to_base64(image_path):
    """Return the image at ``image_path`` as a ``data:`` URI.

    Args:
        image_path: ``str`` or ``Path`` of the image file.

    Returns:
        ``"data:<mime>;base64,<payload>"``, or an empty string when the path
        is empty or does not point at a regular file. The MIME type is
        guessed from the file extension and falls back to ``image/png`` when
        the extension is unknown or not an image type.
    """
    # Local import keeps this block self-contained.
    import mimetypes

    if not image_path:
        return ""
    path = Path(image_path)
    # is_file() also rejects directories, which exists() would let through
    # and open() would then fail on.
    if not path.is_file():
        return ""

    mime_type, _ = mimetypes.guess_type(str(path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    encoded = base64.b64encode(path.read_bytes()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


# Locate the logo next to this script and inline it once at import time.
current_dir = Path(__file__).parent
logo_path = current_dir / "pp-structurev3.png"
logo_base64 = image_to_base64(logo_path)
57
 
58
  CSS = """
59
  :root {
 
216
  text-align: center;
217
  margin: 20px 0;
218
  }
219
+
220
+ /* PDF Viewer specific styles */
221
+ .pdf-viewer-container {
222
+ width: 100%;
223
+ height: 600px;
224
+ border: 1px solid #ddd;
225
+ margin-top: 15px;
226
+ background-color: #f9f9f9;
227
+ display: flex;
228
+ justify-content: center;
229
+ align-items: center;
230
+ }
231
+
232
+ .pdf-viewer-container embed {
233
+ width: 100%;
234
+ height: 100%;
235
+ }
236
+
237
+ .no-preview-message {
238
+ color: #666;
239
+ font-size: 16px;
240
+ text-align: center;
241
+ padding: 20px;
242
+ }
243
  """
244
 
245
  def clean_markdown_text(text):
246
+ """Clean markdown text from HTML tags and excessive newlines"""
247
  if not text:
248
  return ""
249
  text = re.sub(r'<[^>]+>', '', text)
250
  text = re.sub(r'\n{3,}', '\n\n', text)
251
  return text.strip()
252
 
253
def pdf_to_images(pdf_path, dpi=150):
    """Render each page of a PDF to a PIL image, with a fallback backend.

    pdf2image is tried first when it was importable; if it is unavailable or
    raises, PyMuPDF is tried next.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution passed to whichever backend is used.

    Returns:
        A list of PIL images (one per page) from the first backend that
        succeeds, or ``None`` when no backend is available or all of them
        fail.
    """
    if PDF2IMAGE_AVAILABLE:
        try:
            return pdf2image.convert_from_path(pdf_path, dpi=dpi)
        except Exception as e:
            print(f"pdf2image conversion failed: {str(e)}")

    if PYGMUPDF_AVAILABLE:
        try:
            # The context manager closes the document even if a page fails
            # to render; the original left the handle open.
            with fitz.open(pdf_path) as doc:
                pages = []
                for page in doc:
                    # alpha=False guarantees 3 samples per pixel, matching
                    # the "RGB" mode handed to PIL below.
                    pix = page.get_pixmap(dpi=dpi, alpha=False)
                    pages.append(
                        Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    )
            return pages
        except Exception as e:
            print(f"PyMuPDF conversion failed: {str(e)}")

    return None
276
+
277
def create_pdf_preview(pdf_path):
    """Return an HTML snippet that shows the PDF inline via an ``<embed>`` tag.

    The file is read fully and inlined as a base64 ``data:`` URI. A
    placeholder ``<div>`` is returned instead when the path is empty or
    missing, or when building the preview raises.
    """
    path_is_usable = bool(pdf_path) and Path(pdf_path).exists()
    if not path_is_usable:
        return '<div class="no-preview-message">No PDF file available</div>'

    try:
        # Inline the whole document so the browser needs no extra request.
        with open(pdf_path, "rb") as handle:
            raw = handle.read()
        pdf_base64 = base64.b64encode(raw).decode("ascii")

        return f"""
        <div class="pdf-viewer-container">
            <embed
                src="data:application/pdf;base64,{pdf_base64}"
                type="application/pdf"
                width="100%"
                height="100%"
            >
        </div>
        """
    except Exception as e:
        print(f"Failed to create PDF preview: {str(e)}")
        return '<div class="no-preview-message">PDF preview generation failed</div>'
301
 
302
  def process_file(file_path, file_type):
303
+ """Process uploaded file with API"""
304
  try:
305
+ if not file_path:
306
+ raise ValueError("Please upload a file first")
307
+
308
+ if file_type == "pdf" and not str(file_path).lower().endswith('.pdf'):
309
+ raise ValueError("Please upload a valid PDF file")
310
+
311
+ if file_type == "image" and not str(file_path).lower().endswith(('.jpg', '.jpeg', '.png')):
312
+ raise ValueError("Please upload a valid image file (JPG/JPEG/PNG)")
313
+
314
+ # Read file content
315
  with open(file_path, "rb") as f:
316
  file_bytes = f.read()
317
 
318
+ # Call API for processing
319
  file_data = base64.b64encode(file_bytes).decode("ascii")
320
  headers = {
321
  "Authorization": f"token {TOKEN}",
 
330
  )
331
  response.raise_for_status()
332
 
333
+ # Parse API response
334
  result = response.json()
335
  layout_results = result.get("result", {}).get("layoutParsingResults", [])
336
 
 
338
  clean_markdown_contents = []
339
  for res in layout_results:
340
  markdown = res.get("markdown", {})
341
+ original = markdown if isinstance(markdown, str) else markdown.get("text", "")
 
 
 
 
342
  markdown_contents.append(original)
343
  clean_markdown_contents.append(clean_markdown_text(original))
344
 
345
+ # Generate preview content
346
  if file_type == "pdf":
347
  images = pdf_to_images(file_path)
348
+ pdf_preview = create_pdf_preview(file_path)
349
  else:
350
  images = [Image.open(file_path)]
351
+ pdf_preview = '<div class="no-preview-message">Image file preview</div>'
352
 
353
  return {
354
  "original_file": file_path,
355
+ "file_type": file_type,
356
  "markdown_contents": markdown_contents,
357
  "clean_markdown_contents": clean_markdown_contents,
358
  "pdf_images": images,
359
+ "pdf_preview": pdf_preview,
360
  "api_response": result
361
  }
362
 
363
+ except requests.exceptions.RequestException as e:
364
+ raise gr.Error(f"API request failed: {str(e)}")
365
  except Exception as e:
366
  raise gr.Error(f"Error processing file: {str(e)}")
367
 
 
368
  def create_zip_file(results):
369
+ """Create ZIP file with all analysis results"""
370
  try:
371
+ if not results:
372
+ raise ValueError("No results to export")
373
+
374
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
375
  zip_filename = f"analysis_results_{timestamp}.zip"
376
 
377
  temp_dir = tempfile.mkdtemp()
378
+ temp_dirs.append(temp_dir)
379
  zip_path = os.path.join(temp_dir, zip_filename)
380
 
381
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
382
+ # Add original file
383
  original_path = results.get("original_file", "")
384
  if original_path and Path(original_path).exists():
385
  zipf.write(original_path, f"original/{Path(original_path).name}")
386
 
387
+ # Add markdown content
388
+ for i, (orig_md, clean_md) in enumerate(zip(
389
+ results.get("markdown_contents", []),
390
+ results.get("clean_markdown_contents", [])
391
+ )):
392
+ if orig_md:
393
+ zipf.writestr(f"markdown/original/page_{i+1}.md", orig_md)
394
+ if clean_md:
395
+ zipf.writestr(f"markdown/clean/page_{i+1}.md", clean_md)
396
+
397
+ # Add API response
398
  api_response = results.get("api_response", {})
399
  zipf.writestr("api_response.json", json.dumps(api_response, indent=2, ensure_ascii=False))
400
 
401
+ # Add PDF images if available
402
+ if results.get("file_type") == "pdf" and results.get("pdf_images"):
403
+ for i, img in enumerate(results["pdf_images"]):
404
+ img_path = os.path.join(temp_dir, f"page_{i+1}.jpg")
405
+ img.save(img_path, "JPEG", quality=85)
406
+ zipf.write(img_path, f"images/page_{i+1}.jpg")
407
+
408
  return zip_path
409
 
410
  except Exception as e:
411
  raise gr.Error(f"Error creating ZIP file: {str(e)}")
412
 
 
413
  def export_markdown(results):
414
+ """Export markdown content to file"""
415
  try:
416
+ if not results:
417
+ raise ValueError("No results to export")
418
+
419
  markdowns = results.get("markdown_contents", [])
420
  if not markdowns:
421
  raise gr.Error("No markdown content to export")
422
 
423
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
424
+ filename = f"markdown_export_{timestamp}.md"
425
  content = "\n\n".join(markdowns)
426
 
427
  temp_dir = tempfile.mkdtemp()
428
+ temp_dirs.append(temp_dir)
429
  file_path = os.path.join(temp_dir, filename)
430
 
431
  with open(file_path, 'w', encoding='utf-8') as f:
 
439
  with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
440
  results_state = gr.State()
441
 
442
+ # Header with logo
443
  with gr.Column(elem_classes=["logo-container"]):
444
+ gr.HTML(f'<img src="{logo_base64}" class="logo-img">')
445
 
446
+ # Navigation bar
447
  with gr.Row(elem_classes=["nav-bar"]):
448
  gr.HTML("""
449
  <div class="nav-links">
 
452
  </div>
453
  """)
454
 
455
+ # Upload section
456
  with gr.Column(elem_classes=["upload-section"]):
457
  file_type = gr.Radio(
458
  ["pdf", "image"],
459
+ label="File Type",
460
  value="pdf",
461
  interactive=True
462
  )
463
  file_input = gr.File(
464
+ label="Upload Document",
465
  file_types=[".pdf", ".jpg", ".jpeg", ".png"],
466
  type="filepath"
467
  )
468
+ process_btn = gr.Button("Analyze Document", variant="primary")
469
 
470
  loading_spinner = gr.Column(
471
  visible=False,
 
474
  with loading_spinner:
475
  gr.HTML("""
476
  <div class="loader"></div>
477
+ <p>Processing, please wait...</p>
478
  """)
479
 
480
+ # Results display section
481
  with gr.Row(elem_classes=["result-container"]):
482
  with gr.Column(elem_classes=["pdf-preview"]):
483
+ gr.Markdown("### Original Document Preview")
484
+ pdf_preview = gr.HTML(label="PDF Preview")
485
+ pdf_gallery = gr.Gallery(
486
+ label="PDF Pages",
487
  show_label=False,
488
+ elem_classes=["gallery-container"],
489
+ columns=[1],
490
+ object_fit="contain",
491
+ visible=False
492
  )
493
 
494
  with gr.Column(elem_classes=["markdown-result"]):
 
499
  value="Original Markdown",
500
  interactive=True
501
  )
502
+ markdown_display = gr.Markdown(label="Analysis Results")
503
+
504
+ # Download section
505
  with gr.Column(elem_classes=["download-section"]):
506
  gr.Markdown("### Result Export")
507
  with gr.Row():
508
+ download_md_btn = gr.Button("Download Markdown", variant="secondary")
509
+ download_all_btn = gr.Button("Download Full Results (ZIP)", variant="primary")
510
+ download_file = gr.File(visible=False, label="Download File")
511
 
512
+ # Interaction logic
513
  def toggle_spinner():
514
  return gr.update(visible=True)
515
 
 
516
  def hide_spinner():
517
  return gr.update(visible=False)
518
 
519
+ def update_display(results):
520
+ """Update all display components with processed results"""
521
+ if not results:
522
+ return [
523
+ gr.update(value='<div class="no-preview-message">No file to display</div>'),
524
+ gr.update(visible=False),
525
+ gr.update(value="No content"),
526
+ gr.update(value=[])
527
+ ]
528
+
529
+ # PDF preview
530
+ pdf_preview = results.get("pdf_preview", '<div class="no-preview-message">Preview generation failed</div>')
531
+
532
+ # Image gallery
533
+ images = results.get("pdf_images", [])
534
+ show_gallery = bool(images) and results.get("file_type") == "pdf"
535
+
536
+ # Markdown content
537
+ display_content = results["markdown_contents"][0] if results.get("markdown_contents") else "No content"
538
+
539
+ return [
540
+ gr.update(value=pdf_preview),
541
+ gr.update(visible=show_gallery),
542
+ gr.update(value=display_content),
543
+ gr.update(value=images if show_gallery else [])
544
+ ]
545
 
546
  process_btn.click(
547
  toggle_spinner,
 
553
  ).then(
554
  hide_spinner,
555
  outputs=[loading_spinner]
556
+ ).then(
557
+ update_display,
 
 
 
 
558
  inputs=[results_state],
559
+ outputs=[pdf_preview, pdf_gallery, markdown_display, pdf_gallery]
560
  )
561
 
562
  display_mode.change(
563
  lambda mode, res: (
564
+ res["markdown_contents"][0] if mode == "Original Markdown"
565
  else res["clean_markdown_contents"][0]
566
+ ) if res and res.get("markdown_contents") else "No content",
567
  inputs=[display_mode, results_state],
568
  outputs=[markdown_display]
569
  )
 
573
  inputs=[results_state],
574
  outputs=[download_file]
575
  ).then(
576
+ lambda: gr.update(visible=True),
 
577
  outputs=[download_file]
578
  )
579
 
 
582
  inputs=[results_state],
583
  outputs=[download_file]
584
  ).then(
585
+ lambda: gr.update(visible=True),
 
586
  outputs=[download_file]
587
  )
588
 
589
  if __name__ == "__main__":
590
+ # Check dependencies
591
+ if not PDF2IMAGE_AVAILABLE:
592
+ print("Warning: pdf2image not available, PDF to image conversion limited")
593
+ if not PYGMUPDF_AVAILABLE:
594
+ print("Warning: PyMuPDF not available, PDF fallback conversion disabled")
595
+
596
+ demo.launch(
597
+ server_name="0.0.0.0",
598
+ server_port=7860,
599
+ share=True,
600
+ favicon_path=str(logo_path) if logo_path.exists() else None
601
+ )