xfey committed on
Commit
19d9428
·
1 Parent(s): cc1d913

[feat] support multi-page

Browse files
Files changed (1) hide show
  1. app.py +179 -56
app.py CHANGED
@@ -173,7 +173,7 @@ def cleanup_temp_file(file_path):
173
  except Exception as e:
174
  logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
175
 
176
- def convert_to_image(file_path, target_size=896):
177
  """将输入文件转换为图像格式,长边调整到指定尺寸"""
178
  if file_path is None:
179
  return None
@@ -183,12 +183,15 @@ def convert_to_image(file_path, target_size=896):
183
  file_ext = os.path.splitext(file_path)[1].lower()
184
 
185
  if file_ext == '.pdf':
186
- # PDF文件:转换为图像
187
- logger.info(f"Converting PDF to image: {file_path}")
188
  doc = pymupdf.open(file_path)
189
 
190
- # 只处理第一页
191
- page = doc[0]
 
 
 
192
 
193
  # 计算缩放比例,使长边为target_size
194
  rect = page.rect
@@ -209,7 +212,7 @@ def convert_to_image(file_path, target_size=896):
209
  return tmp_file.name
210
 
211
  else:
212
- # 图像文件:调整尺寸
213
  logger.info(f"Resizing image: {file_path}")
214
  pil_image = Image.open(file_path).convert("RGB")
215
 
@@ -236,15 +239,72 @@ def convert_to_image(file_path, target_size=896):
236
  logger.error(f"Error converting file to image: {e}")
237
  return file_path # 如果转换失败,返回原文件
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  def to_pdf(file_path):
240
  """为了兼容性保留的函数,现在调用convert_to_image"""
241
  return convert_to_image(file_path)
242
 
243
  @spaces.GPU(duration=120)
244
  def process_document(file_path):
245
- """处理文档的主要函数 - 集成完整的推理逻辑"""
246
  if file_path is None:
247
- return "", "", {}, {}
248
 
249
  start_time = time.time()
250
  original_file_path = file_path
@@ -253,57 +313,85 @@ def process_document(file_path):
253
  if model is None:
254
  initialize_model()
255
 
256
- # 转换为图像(长边896像素)
257
- converted_file_path = convert_to_image(file_path, target_size=896)
258
- temp_file_created = converted_file_path != original_file_path
259
-
260
  try:
261
- logger.info(f"Processing document: {file_path}")
262
- logger.info(f"Converted to image: {converted_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
- # 处理图像
265
- recognition_results = process_page(converted_file_path)
266
 
267
- # 生成Markdown内容
268
- md_content = generate_markdown(recognition_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  # 计算处理时间
271
  processing_time = time.time() - start_time
272
 
273
- debug_info = {
274
- "original_file": original_file_path,
275
- "converted_file": converted_file_path,
276
- "temp_file_created": temp_file_created,
277
- "file_type": "PDF" if original_file_path.lower().endswith('.pdf') else "Image",
278
- "status": "success",
279
- "processing_time": f"{processing_time:.2f}s",
280
- "total_elements": len(recognition_results)
281
- }
282
 
283
- processing_data = {
284
- "pages": [{"elements": recognition_results}],
285
- "total_elements": len(recognition_results),
286
- "processing_time": f"{processing_time:.2f}s"
 
 
 
287
  }
 
288
 
289
- logger.info(f"Document processed successfully in {processing_time:.2f}s")
290
- return md_content, md_content, processing_data, debug_info
291
 
292
  except Exception as e:
293
  logger.error(f"Error processing document: {str(e)}")
294
- error_info = {
 
 
295
  "original_file": original_file_path,
296
- "converted_file": converted_file_path,
297
- "temp_file_created": temp_file_created,
298
- "status": "error",
299
- "error": str(e)
300
- }
301
- return f"# 处理错误\n\n处理文档时发生错误: {str(e)}", "", {}, error_info
302
 
303
  finally:
304
  # 清理临时文件
305
- if temp_file_created:
306
- cleanup_temp_file(converted_file_path)
 
 
307
 
308
  def process_page(image_path):
309
  """处理单页文档"""
@@ -464,7 +552,6 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
464
  with gr.Column(scale=1, elem_classes="preview-panel"):
465
  gr.HTML("文件预览/Preview")
466
  pdf_show = PDF(label="", interactive=False, visible=True, height=600)
467
- debug_output = gr.JSON(label="Debug Info", height=100)
468
 
469
  # 输出面板
470
  with gr.Column(scale=1, elem_classes="output-panel"):
@@ -479,24 +566,60 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
479
  )
480
  with gr.Tab("Markdown [Content]"):
481
  md_content = gr.TextArea(lines=30, show_copy_button=True)
482
- with gr.Tab("Processing Data"):
483
  json_output = gr.JSON(label="", height=700)
484
 
485
  # 事件处理 - 预览文件
486
  def preview_file(file_path):
487
- """预览上传的文件,转换为PDF格式用于预览组件"""
488
  if file_path is None:
489
  return None
490
 
491
- with pymupdf.open(file_path) as f:
492
- if f.is_pdf:
 
 
 
493
  return file_path
494
  else:
495
- pdf_bytes = f.convert_to_pdf()
496
- # 使用临时文件保存PDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
498
- tmp_file.write(pdf_bytes)
499
  return tmp_file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
  file.change(fn=preview_file, inputs=file, outputs=pdf_show)
502
 
@@ -504,27 +627,27 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
504
  def process_with_status(file_path):
505
  """处理文档并更新状态"""
506
  if file_path is None:
507
- return "", "", {}, {}, "Please select a file first"
508
 
509
  # 执行文档处理
510
- md_render_result, md_content_result, json_result, debug_result = process_document(file_path)
511
 
512
- return md_render_result, md_content_result, json_result, debug_result
513
 
514
  submit_btn.click(
515
  fn=process_with_status,
516
  inputs=[file],
517
- outputs=[md_render, md_content, json_output, debug_output],
518
  )
519
 
520
  # 清空所有内容
521
  def reset_all():
522
- return None, None, "", "", {}, {}
523
 
524
  clear_btn.click(
525
  fn=reset_all,
526
  inputs=[],
527
- outputs=[file, pdf_show, md_render, md_content, json_output, debug_output]
528
  )
529
 
530
  # 启动应用
 
173
  except Exception as e:
174
  logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
175
 
176
+ def convert_to_image(file_path, target_size=896, page_num=0):
177
  """将输入文件转换为图像格式,长边调整到指定尺寸"""
178
  if file_path is None:
179
  return None
 
183
  file_ext = os.path.splitext(file_path)[1].lower()
184
 
185
  if file_ext == '.pdf':
186
+ # PDF文件:转换指定页面为图像
187
+ logger.info(f"Converting PDF page {page_num} to image: {file_path}")
188
  doc = pymupdf.open(file_path)
189
 
190
+ # 检查页面数量
191
+ if page_num >= len(doc):
192
+ page_num = 0 # 如果页面超出范围,使用第一页
193
+
194
+ page = doc[page_num]
195
 
196
  # 计算缩放比例,使长边为target_size
197
  rect = page.rect
 
212
  return tmp_file.name
213
 
214
  else:
215
+ # 图像文件:调整尺寸(忽略page_num参数)
216
  logger.info(f"Resizing image: {file_path}")
217
  pil_image = Image.open(file_path).convert("RGB")
218
 
 
239
  logger.error(f"Error converting file to image: {e}")
240
  return file_path # 如果转换失败,返回原文件
241
 
242
def get_pdf_page_count(file_path):
    """Return the number of pages in a document.

    Args:
        file_path: Path to the input file; may be None or empty.

    Returns:
        The page count reported by PyMuPDF when the path ends in ``.pdf``
        (case-insensitive). Any other input, including None, is treated as a
        single page. 1 is also returned if the PDF cannot be read.
    """
    try:
        if not (file_path and file_path.lower().endswith('.pdf')):
            return 1  # non-PDF inputs are treated as a single page

        # The context manager closes the document even if reading it raises,
        # so the file handle is never leaked.
        with pymupdf.open(file_path) as doc:
            return len(doc)
    except Exception as e:
        logger.error(f"Error getting PDF page count: {e}")
        return 1
255
+
256
def convert_all_pdf_pages_to_images(file_path, target_size=896):
    """Render every page of a document to a temporary PNG image.

    Args:
        file_path: Path to the input file, or None.
        target_size: Length in pixels that the longer side of each rendered
            PDF page is scaled to.

    Returns:
        A list of image paths, one per PDF page in page order. For a non-PDF
        input the list holds the single path returned by ``convert_to_image``
        (or is empty if that returned nothing). An empty list is returned for
        None and whenever the conversion fails.
    """
    if file_path is None:
        return []

    # Tracked outside the try so a failure part-way through can remove the
    # pages that were already written; the caller only ever sees [] then.
    image_paths = []
    try:
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext != '.pdf':
            # Non-PDF input: a single resized image.
            converted_path = convert_to_image(file_path, target_size)
            return [converted_path] if converted_path else []

        # The context manager closes the document on every exit path.
        with pymupdf.open(file_path) as doc:
            for page_num, page in enumerate(doc):
                # Scale so that the longer page side becomes target_size.
                rect = page.rect
                scale = target_size / max(rect.width, rect.height)
                pix = page.get_pixmap(matrix=pymupdf.Matrix(scale, scale))

                with tempfile.NamedTemporaryFile(
                    suffix=f"_page_{page_num}.png", delete=False
                ) as tmp_file:
                    # Record the path first so a failed write is cleaned up too.
                    image_paths.append(tmp_file.name)
                    # Write the PNG bytes through the open handle instead of
                    # re-opening the file by name and re-encoding via PIL.
                    tmp_file.write(pix.tobytes("png"))

        return image_paths

    except Exception as e:
        logger.error(f"Error converting PDF pages to images: {e}")
        for leftover in image_paths:
            try:
                os.remove(leftover)
            except OSError as cleanup_error:
                logger.warning(f"Failed to cleanup temp file {leftover}: {cleanup_error}")
        return []
298
+
299
  def to_pdf(file_path):
300
  """为了兼容性保留的函数,现在调用convert_to_image"""
301
  return convert_to_image(file_path)
302
 
303
  @spaces.GPU(duration=120)
304
  def process_document(file_path):
305
+ """处理文档的主要函数 - 支持多页PDF处理"""
306
  if file_path is None:
307
+ return "", "", []
308
 
309
  start_time = time.time()
310
  original_file_path = file_path
 
313
  if model is None:
314
  initialize_model()
315
 
 
 
 
 
316
  try:
317
+ # 获取页数
318
+ page_count = get_pdf_page_count(file_path)
319
+ logger.info(f"Document has {page_count} page(s)")
320
+
321
+ # 将所有页面转换为图像
322
+ image_paths = convert_all_pdf_pages_to_images(file_path)
323
+ if not image_paths:
324
+ raise Exception("Failed to convert document to images")
325
+
326
+ # 记录需要清理的临时文件
327
+ temp_files_created = []
328
+ file_ext = os.path.splitext(file_path)[1].lower()
329
+ if file_ext == '.pdf':
330
+ temp_files_created.extend(image_paths)
331
+ elif len(image_paths) == 1 and image_paths[0] != original_file_path:
332
+ temp_files_created.append(image_paths[0])
333
 
334
+ all_results = []
335
+ md_contents = []
336
 
337
+ # 逐页处理
338
+ for page_idx, image_path in enumerate(image_paths):
339
+ logger.info(f"Processing page {page_idx + 1}/{len(image_paths)}")
340
+
341
+ # 处理当前页面
342
+ recognition_results = process_page(image_path)
343
+
344
+ # 生成当前页的markdown内容
345
+ page_md_content = generate_markdown(recognition_results)
346
+
347
+ md_contents.append(page_md_content)
348
+
349
+ # 保存当前页的处理数据
350
+ page_data = {
351
+ "page": page_idx + 1,
352
+ "elements": recognition_results,
353
+ "total_elements": len(recognition_results)
354
+ }
355
+ all_results.append(page_data)
356
 
357
  # 计算处理时间
358
  processing_time = time.time() - start_time
359
 
360
+ # 合并所有页面的markdown内容
361
+ if len(md_contents) > 1:
362
+ final_md_content = "\n\n---\n\n".join(md_contents)
363
+ else:
364
+ final_md_content = md_contents[0] if md_contents else ""
 
 
 
 
365
 
366
+ # 在结果数组最后添加总体信息
367
+ summary_data = {
368
+ "summary": True,
369
+ "total_pages": len(image_paths),
370
+ "total_elements": sum(len(page["elements"]) for page in all_results),
371
+ "processing_time": f"{processing_time:.2f}s",
372
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
373
  }
374
+ all_results.append(summary_data)
375
 
376
+ logger.info(f"Document processed successfully in {processing_time:.2f}s - {len(image_paths)} page(s)")
377
+ return final_md_content, final_md_content, all_results
378
 
379
  except Exception as e:
380
  logger.error(f"Error processing document: {str(e)}")
381
+ error_data = [{
382
+ "error": True,
383
+ "message": str(e),
384
  "original_file": original_file_path,
385
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
386
+ }]
387
+ return f"# 处理错误\n\n处理文档时发生错误: {str(e)}", "", error_data
 
 
 
388
 
389
  finally:
390
  # 清理临时文件
391
+ if 'temp_files_created' in locals():
392
+ for temp_file in temp_files_created:
393
+ if temp_file and os.path.exists(temp_file):
394
+ cleanup_temp_file(temp_file)
395
 
396
  def process_page(image_path):
397
  """处理单页文档"""
 
552
  with gr.Column(scale=1, elem_classes="preview-panel"):
553
  gr.HTML("文件预览/Preview")
554
  pdf_show = PDF(label="", interactive=False, visible=True, height=600)
 
555
 
556
  # 输出面板
557
  with gr.Column(scale=1, elem_classes="output-panel"):
 
566
  )
567
  with gr.Tab("Markdown [Content]"):
568
  md_content = gr.TextArea(lines=30, show_copy_button=True)
569
+ with gr.Tab("Json [Content]"):
570
  json_output = gr.JSON(label="", height=700)
571
 
572
  # 事件处理 - 预览文件
573
  def preview_file(file_path):
574
+ """预览上传的文件,对图像先调整尺寸再转换为PDF格式"""
575
  if file_path is None:
576
  return None
577
 
578
+ try:
579
+ file_ext = os.path.splitext(file_path)[1].lower()
580
+
581
+ if file_ext == '.pdf':
582
+ # PDF文件直接返回
583
  return file_path
584
  else:
585
+ # 图像文件:先调整尺寸再转换为PDF
586
+ logger.info(f"Resizing image for preview: {file_path}")
587
+
588
+ # 使用PIL打开图像并调整尺寸
589
+ pil_image = Image.open(file_path).convert("RGB")
590
+ w, h = pil_image.size
591
+
592
+ # 如果图像很大,调整到合适预览尺寸(长边最大896像素)
593
+ max_preview_size = 896
594
+ if max(w, h) > max_preview_size:
595
+ if w > h:
596
+ new_w, new_h = max_preview_size, int(h * max_preview_size / w)
597
+ else:
598
+ new_w, new_h = int(w * max_preview_size / h), max_preview_size
599
+
600
+ pil_image = pil_image.resize((new_w, new_h), Image.Resampling.LANCZOS)
601
+ logger.info(f"Resized from {w}x{h} to {new_w}x{new_h} for preview")
602
+
603
+ # 将调整后的图像转换为PDF
604
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
605
+ pil_image.save(tmp_file.name, "PDF")
606
  return tmp_file.name
607
+
608
+ except Exception as e:
609
+ logger.error(f"Error creating preview: {e}")
610
+ # 出错时使用原来的方法
611
+ try:
612
+ with pymupdf.open(file_path) as f:
613
+ if f.is_pdf:
614
+ return file_path
615
+ else:
616
+ pdf_bytes = f.convert_to_pdf()
617
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
618
+ tmp_file.write(pdf_bytes)
619
+ return tmp_file.name
620
+ except Exception as e2:
621
+ logger.error(f"Fallback preview method also failed: {e2}")
622
+ return None
623
 
624
  file.change(fn=preview_file, inputs=file, outputs=pdf_show)
625
 
 
627
  def process_with_status(file_path):
628
  """处理文档并更新状态"""
629
  if file_path is None:
630
+ return "", "", []
631
 
632
  # 执行文档处理
633
+ md_render_result, md_content_result, json_result = process_document(file_path)
634
 
635
+ return md_render_result, md_content_result, json_result
636
 
637
  submit_btn.click(
638
  fn=process_with_status,
639
  inputs=[file],
640
+ outputs=[md_render, md_content, json_output],
641
  )
642
 
643
  # 清空所有内容
644
  def reset_all():
645
+ return None, None, "", "", []
646
 
647
  clear_btn.click(
648
  fn=reset_all,
649
  inputs=[],
650
+ outputs=[file, pdf_show, md_render, md_content, json_output]
651
  )
652
 
653
  # 启动应用