xfey committed on
Commit
d26b33a
·
1 Parent(s): 74dbd2d

Update processing logic: Convert PDF to images (896px) instead of PDF processing

Browse files
Files changed (1) hide show
  1. app.py +92 -36
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import tempfile
3
  import time
@@ -172,20 +173,72 @@ def cleanup_temp_file(file_path):
172
  except Exception as e:
173
  logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
174
 
175
def to_pdf(file_path):
    """Return the path of a PDF version of ``file_path``.

    Args:
        file_path: Path of the uploaded document, or ``None``.

    Returns:
        ``None`` when ``file_path`` is ``None``; ``file_path`` itself when
        the document is already a PDF; otherwise the path of a newly
        created temporary ``.pdf`` file holding the converted document.
        The temporary file is created with ``delete=False``, so the caller
        is responsible for removing it.
    """
    if file_path is None:
        return None

    with pymupdf.open(file_path) as source_doc:
        if source_doc.is_pdf:
            return file_path

        converted_bytes = source_doc.convert_to_pdf()

    # Write to a temporary file rather than a fixed location on disk.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(converted_bytes)
    return tmp_file.name
 
 
 
 
 
 
 
 
189
 
190
  @spaces.GPU(duration=120)
191
  def process_document(file_path):
@@ -200,14 +253,15 @@ def process_document(file_path):
200
  if model is None:
201
  initialize_model()
202
 
203
- # 转换为PDF(如果需要)
204
- converted_file_path = to_pdf(file_path)
205
  temp_file_created = converted_file_path != original_file_path
206
 
207
  try:
208
  logger.info(f"Processing document: {file_path}")
 
209
 
210
- # 处理页面
211
  recognition_results = process_page(converted_file_path)
212
 
213
  # 生成Markdown内容
@@ -220,6 +274,7 @@ def process_document(file_path):
220
  "original_file": original_file_path,
221
  "converted_file": converted_file_path,
222
  "temp_file_created": temp_file_created,
 
223
  "status": "success",
224
  "processing_time": f"{processing_time:.2f}s",
225
  "total_elements": len(recognition_results)
@@ -381,19 +436,11 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
381
  elem_id="file-upload"
382
  )
383
 
384
- gr.HTML("选择文件后,点击处理按钮开始解析<br>After selecting the file, click the Process button to start parsing")
385
 
386
  with gr.Row(elem_classes="action-buttons"):
387
  submit_btn = gr.Button("处理文档/Process Document", variant="primary")
388
  clear_btn = gr.ClearButton(value="清空/Clear")
389
-
390
- # 处理状态显示
391
- status_display = gr.Textbox(
392
- label="Processing Status",
393
- value="Ready to process documents",
394
- interactive=False,
395
- max_lines=2
396
- )
397
 
398
  # 示例文件
399
  example_root = os.path.join(os.path.dirname(__file__), "examples")
@@ -437,8 +484,26 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
437
  with gr.Tab("Processing Data"):
438
  json_output = gr.JSON(label="", height=700)
439
 
440
- # 事件处理
441
- file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
 
443
  # 文档处理
444
  def process_with_status(file_path):
@@ -446,34 +511,25 @@ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
446
  if file_path is None:
447
  return "", "", {}, {}, "Please select a file first"
448
 
449
- # 更新状态为处理中
450
- status = "Processing document..."
451
-
452
  # 执行文档处理
453
  md_render_result, md_content_result, json_result, debug_result = process_document(file_path)
454
 
455
- # 更新完成状态
456
- if "错误" in md_render_result:
457
- status = "Processing failed - see debug info"
458
- else:
459
- status = "Processing completed successfully"
460
-
461
- return md_render_result, md_content_result, json_result, debug_result, status
462
 
463
  submit_btn.click(
464
  fn=process_with_status,
465
  inputs=[file],
466
- outputs=[md_render, md_content, json_output, debug_output, status_display],
467
  )
468
 
469
  # 清空所有内容
470
  def reset_all():
471
- return None, None, "", "", {}, {}, "Ready to process documents"
472
 
473
  clear_btn.click(
474
  fn=reset_all,
475
  inputs=[],
476
- outputs=[file, pdf_show, md_render, md_content, json_output, debug_output, status_display]
477
  )
478
 
479
  # 启动应用
 
1
+ import io
2
  import os
3
  import tempfile
4
  import time
 
173
  except Exception as e:
174
  logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
175
 
176
def convert_to_image(file_path, target_size=896):
    """Convert an input file to an image whose long edge is ``target_size`` px.

    A ``.pdf`` input (detected by file extension) has its first page rendered
    so that the long edge equals ``target_size``. Any other input is opened
    as an image and downscaled, keeping the aspect ratio, only when its long
    edge exceeds ``target_size``.

    Args:
        file_path: Path of the input file, or ``None``.
        target_size: Desired long-edge length in pixels.

    Returns:
        ``None`` when ``file_path`` is ``None``. Otherwise the path of a new
        temporary ``.png`` file, or ``file_path`` itself when the image
        already fits within ``target_size`` or when conversion fails. The
        temporary file is created with ``delete=False``, so the caller is
        responsible for removing it.
    """
    if file_path is None:
        return None

    try:
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.pdf':
            logger.info(f"Converting PDF to image: {file_path}")

            # The context manager closes the document even if rendering fails.
            with pymupdf.open(file_path) as doc:
                # Only the first page is processed.
                page = doc[0]

                # Scale so that the page's long edge becomes target_size.
                rect = page.rect
                scale = target_size / max(rect.width, rect.height)
                pix = page.get_pixmap(matrix=pymupdf.Matrix(scale, scale))
                png_bytes = pix.tobytes("png")

            # Write the PNG bytes directly; decoding and re-encoding them
            # through PIL would produce the same pixels at extra cost.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
                tmp_file.write(png_bytes)
            return tmp_file.name

        logger.info(f"Resizing image: {file_path}")
        with Image.open(file_path) as source_image:
            w, h = source_image.size

            # Already small enough: hand back the original file untouched.
            if max(w, h) <= target_size:
                return file_path

            # Keep the aspect ratio; clamp to 1 px so an extreme aspect
            # ratio cannot produce a zero-sized edge.
            if w > h:
                new_w, new_h = target_size, max(1, int(h * target_size / w))
            else:
                new_w, new_h = max(1, int(w * target_size / h)), target_size

            resized_image = source_image.convert("RGB").resize(
                (new_w, new_h), Image.Resampling.LANCZOS
            )

        # Save through the open handle instead of re-opening the file by
        # name, which is not possible on every platform while it is open.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
            resized_image.save(tmp_file, "PNG")
        return tmp_file.name

    except Exception as e:
        # Best effort: fall back to the original file if conversion fails.
        logger.error(f"Error converting file to image: {e}")
        return file_path
238
+
239
def to_pdf(file_path):
    """Compatibility shim kept for existing callers.

    Delegates to ``convert_to_image`` with its default target size and
    returns that function's result unchanged.
    """
    converted_path = convert_to_image(file_path)
    return converted_path
242
 
243
  @spaces.GPU(duration=120)
244
  def process_document(file_path):
 
253
  if model is None:
254
  initialize_model()
255
 
256
+ # 转换为图像(长边896像素)
257
+ converted_file_path = convert_to_image(file_path, target_size=896)
258
  temp_file_created = converted_file_path != original_file_path
259
 
260
  try:
261
  logger.info(f"Processing document: {file_path}")
262
+ logger.info(f"Converted to image: {converted_file_path}")
263
 
264
+ # 处理图像
265
  recognition_results = process_page(converted_file_path)
266
 
267
  # 生成Markdown内容
 
274
  "original_file": original_file_path,
275
  "converted_file": converted_file_path,
276
  "temp_file_created": temp_file_created,
277
+ "file_type": "PDF" if original_file_path.lower().endswith('.pdf') else "Image",
278
  "status": "success",
279
  "processing_time": f"{processing_time:.2f}s",
280
  "total_elements": len(recognition_results)
 
436
  elem_id="file-upload"
437
  )
438
 
439
+ gr.HTML("支持PDF和图像文件,PDF将转换为图像处理(长边896px)<br>Support PDF and image files, PDF will be converted to images (896px max)")
440
 
441
  with gr.Row(elem_classes="action-buttons"):
442
  submit_btn = gr.Button("处理文档/Process Document", variant="primary")
443
  clear_btn = gr.ClearButton(value="清空/Clear")
 
 
 
 
 
 
 
 
444
 
445
  # 示例文件
446
  example_root = os.path.join(os.path.dirname(__file__), "examples")
 
484
  with gr.Tab("Processing Data"):
485
  json_output = gr.JSON(label="", height=700)
486
 
487
+ # 事件处理 - 预览文件
488
def preview_file(file_path):
    """Return the path to display in the preview for an uploaded file.

    Args:
        file_path: Path of the uploaded file, or ``None``.

    Returns:
        ``None`` when no file is given; the result of ``convert_to_image``
        (first page, 896 px target) for paths ending in ``.pdf``; otherwise
        ``file_path`` unchanged. If the conversion call raises, the error is
        logged and ``file_path`` is returned.
    """
    if file_path is None:
        return None

    # Anything that is not a PDF is previewed as-is.
    if not file_path.lower().endswith('.pdf'):
        return file_path

    # PDFs are previewed through an image of their first page.
    try:
        return convert_to_image(file_path, target_size=896)
    except Exception as e:
        logger.error(f"Error converting PDF for preview: {e}")
        return file_path
505
+
506
+ file.change(fn=preview_file, inputs=file, outputs=pdf_show)
507
 
508
  # 文档处理
509
  def process_with_status(file_path):
 
511
  if file_path is None:
512
  return "", "", {}, {}, "Please select a file first"
513
 
 
 
 
514
  # 执行文档处理
515
  md_render_result, md_content_result, json_result, debug_result = process_document(file_path)
516
 
517
+ return md_render_result, md_content_result, json_result, debug_result
 
 
 
 
 
 
518
 
519
  submit_btn.click(
520
  fn=process_with_status,
521
  inputs=[file],
522
+ outputs=[md_render, md_content, json_output, debug_output],
523
  )
524
 
525
  # 清空所有内容
526
  def reset_all():
527
+ return None, None, "", "", {}, {}
528
 
529
  clear_btn.click(
530
  fn=reset_all,
531
  inputs=[],
532
+ outputs=[file, pdf_show, md_render, md_content, json_output, debug_output]
533
  )
534
 
535
  # 启动应用