xfey committed on
Commit dfb1341 · 1 Parent(s): a75eb7f

[init] update application file

.gitignore ADDED
@@ -0,0 +1,72 @@
+ # Python-related
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Environment files
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Editor files
+ .vscode/
+ .idea/
+ *.suo
+ *.ntvs*
+ *.njsproj
+ *.sln
+ *.sw?
+
+ # Logs and databases
+ *.log
+ *.sqlite
+ *.db
+
+ # System files
+ .DS_Store
+ Thumbs.db
+
+ # Testing
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Output files
+ *.csv
+ *.json
+ *.xlsx
+ # *.pdf
+ out/
+ output/
+
+ # Jupyter notebooks
+ .ipynb_checkpoints
app.py ADDED
@@ -0,0 +1,481 @@
+ import os
+ import tempfile
+ import time
+ import uuid
+
+ import cv2
+ import gradio as gr
+ import pymupdf
+ import spaces
+ import torch
+ from gradio_pdf import PDF
+ from loguru import logger
+ from PIL import Image
+ from transformers import AutoProcessor, VisionEncoderDecoderModel
+
+ from utils.utils import prepare_image, parse_layout_string, process_coordinates, ImageDimensions
+
+ # Load the external CSS file
+ def load_css():
+     css_path = os.path.join(os.path.dirname(__file__), "static", "styles.css")
+     if os.path.exists(css_path):
+         with open(css_path, "r", encoding="utf-8") as f:
+             return f.read()
+     return ""
+
+ # Global variables holding the model
+ model = None
+ processor = None
+ tokenizer = None
+
+ # Initialize the model automatically
+ @spaces.GPU
+ def initialize_model():
+     """Initialize the Hugging Face model"""
+     global model, processor, tokenizer
+
+     if model is None:
+         logger.info("Loading DOLPHIN model...")
+         model_id = "ByteDance/Dolphin"
+
+         # Load the processor and model
+         processor = AutoProcessor.from_pretrained(model_id)
+         model = VisionEncoderDecoderModel.from_pretrained(model_id)
+         model.eval()
+
+         # Set device and precision
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model.to(device)
+         model = model.half()  # Use half precision
+
+         # Set the tokenizer
+         tokenizer = processor.tokenizer
+
+         logger.info(f"Model loaded successfully on {device}")
+
+     return "Model ready"
+
+ # Initialize the model automatically at startup
+ logger.info("Initializing model at startup...")
+ try:
+     initialize_model()
+     logger.info("Model initialization completed")
+ except Exception as e:
+     logger.error(f"Model initialization failed: {e}")
+     # The model will be re-initialized on first use
+
+ # Model inference function
+ @spaces.GPU
+ def model_chat(prompt, image):
+     """Run inference with the model"""
+     global model, processor, tokenizer
+
+     # Make sure the model is initialized
+     if model is None:
+         initialize_model()
+
+     # Check whether this is a batched call
+     is_batch = isinstance(image, list)
+
+     if not is_batch:
+         images = [image]
+         prompts = [prompt]
+     else:
+         images = image
+         prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
+
+     # Prepare the images
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     batch_inputs = processor(images, return_tensors="pt", padding=True)
+     batch_pixel_values = batch_inputs.pixel_values.half().to(device)
+
+     # Prepare the prompts
+     prompts = [f"<s>{p} <Answer/>" for p in prompts]
+     batch_prompt_inputs = tokenizer(
+         prompts,
+         add_special_tokens=False,
+         return_tensors="pt"
+     )
+
+     batch_prompt_ids = batch_prompt_inputs.input_ids.to(device)
+     batch_attention_mask = batch_prompt_inputs.attention_mask.to(device)
+
+     # Generate text
+     outputs = model.generate(
+         pixel_values=batch_pixel_values,
+         decoder_input_ids=batch_prompt_ids,
+         decoder_attention_mask=batch_attention_mask,
+         min_length=1,
+         max_length=4096,
+         pad_token_id=tokenizer.pad_token_id,
+         eos_token_id=tokenizer.eos_token_id,
+         use_cache=True,
+         bad_words_ids=[[tokenizer.unk_token_id]],
+         return_dict_in_generate=True,
+         do_sample=False,
+         num_beams=1,
+         repetition_penalty=1.1
+     )
+
+     # Process the output
+     sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)
+
+     # Strip the prompt text from the output
+     results = []
+     for i, sequence in enumerate(sequences):
+         cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
+         results.append(cleaned)
+
+     # Return a single result or the batch of results
+     if not is_batch:
+         return results[0]
+     return results
+
+ # Process a batch of elements
+ @spaces.GPU
+ def process_element_batch(elements, prompt, max_batch_size=16):
+     """Process a batch of elements of the same type"""
+     results = []
+
+     # Determine the batch size
+     batch_size = min(len(elements), max_batch_size)
+
+     # Process in batches
+     for i in range(0, len(elements), batch_size):
+         batch_elements = elements[i:i+batch_size]
+         crops_list = [elem["crop"] for elem in batch_elements]
+
+         # Use the same prompt for every element
+         prompts_list = [prompt] * len(crops_list)
+
+         # Batch inference
+         batch_results = model_chat(prompts_list, crops_list)
+
+         # Collect the results
+         for j, result in enumerate(batch_results):
+             elem = batch_elements[j]
+             results.append({
+                 "label": elem["label"],
+                 "bbox": elem["bbox"],
+                 "text": result.strip(),
+                 "reading_order": elem["reading_order"],
+             })
+
+     return results
+
+ # Clean up temporary files
+ def cleanup_temp_file(file_path):
+     """Safely delete a temporary file"""
+     try:
+         if file_path and os.path.exists(file_path):
+             os.unlink(file_path)
+     except Exception as e:
+         logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
+
+ def to_pdf(file_path):
+     """Convert the input file to PDF format"""
+     if file_path is None:
+         return None
+
+     with pymupdf.open(file_path) as f:
+         if f.is_pdf:
+             return file_path
+         else:
+             pdf_bytes = f.convert_to_pdf()
+             # Use a temporary file instead of saving to disk
+             with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
+                 tmp_file.write(pdf_bytes)
+                 return tmp_file.name
+
+ @spaces.GPU(duration=120)
+ def process_document(file_path):
+     """Main document-processing function - integrates the full inference logic"""
+     if file_path is None:
+         return "", "", {}, {}
+
+     start_time = time.time()
+     original_file_path = file_path
+
+     # Make sure the model is initialized
+     if model is None:
+         initialize_model()
+
+     # Convert to PDF (if needed)
+     converted_file_path = to_pdf(file_path)
+     temp_file_created = converted_file_path != original_file_path
+
+     try:
+         logger.info(f"Processing document: {file_path}")
+
+         # Process the page
+         recognition_results = process_page(converted_file_path)
+
+         # Generate the Markdown content
+         md_content = generate_markdown(recognition_results)
+
+         # Compute the processing time
+         processing_time = time.time() - start_time
+
+         debug_info = {
+             "original_file": original_file_path,
+             "converted_file": converted_file_path,
+             "temp_file_created": temp_file_created,
+             "status": "success",
+             "processing_time": f"{processing_time:.2f}s",
+             "total_elements": len(recognition_results)
+         }
+
+         processing_data = {
+             "pages": [{"elements": recognition_results}],
+             "total_elements": len(recognition_results),
+             "processing_time": f"{processing_time:.2f}s"
+         }
+
+         logger.info(f"Document processed successfully in {processing_time:.2f}s")
+         return md_content, md_content, processing_data, debug_info
+
+     except Exception as e:
+         logger.error(f"Error processing document: {str(e)}")
+         error_info = {
+             "original_file": original_file_path,
+             "converted_file": converted_file_path,
+             "temp_file_created": temp_file_created,
+             "status": "error",
+             "error": str(e)
+         }
+         return f"# Processing Error\n\nAn error occurred while processing the document: {str(e)}", "", {}, error_info
+
+     finally:
+         # Clean up the temporary file
+         if temp_file_created:
+             cleanup_temp_file(converted_file_path)
+
+ def process_page(image_path):
+     """Process a single document page"""
+     # Stage 1: page-level layout parsing
+     # process_document always passes a PDF path here, and PIL cannot open PDFs,
+     # so render the first page with pymupdf before handing it to the model
+     with pymupdf.open(image_path) as doc:
+         pix = doc[0].get_pixmap(dpi=200)
+         pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+     layout_output = model_chat("Parse the reading order of this document.", pil_image)
+
+     # Stage 2: element-level content parsing
+     padded_image, dims = prepare_image(pil_image)
+     recognition_results = process_elements(layout_output, padded_image, dims)
+
+     return recognition_results
+
+ def process_elements(layout_results, padded_image, dims, max_batch_size=16):
+     """Parse all document elements"""
+     layout_results = parse_layout_string(layout_results)
+
+     # Store the different element types separately
+     text_elements = []    # Text elements
+     table_elements = []   # Table elements
+     figure_results = []   # Figure elements (no processing needed)
+     previous_box = None
+     reading_order = 0
+
+     # Collect elements to process and group them by type
+     for bbox, label in layout_results:
+         try:
+             # Adjust the coordinates
+             x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates(
+                 bbox, padded_image, dims, previous_box
+             )
+
+             # Crop and parse the element
+             cropped = padded_image[y1:y2, x1:x2]
+             if cropped.size > 0:
+                 if label == "fig":
+                     # For figure regions, add an empty text result directly
+                     figure_results.append(
+                         {
+                             "label": label,
+                             "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                             "text": "",
+                             "reading_order": reading_order,
+                         }
+                     )
+                 else:
+                     # Prepare the element for parsing
+                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+                     element_info = {
+                         "crop": pil_crop,
+                         "label": label,
+                         "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                         "reading_order": reading_order,
+                     }
+
+                     # Group by type
+                     if label == "tab":
+                         table_elements.append(element_info)
+                     else:  # Text elements
+                         text_elements.append(element_info)
+
+             reading_order += 1
+
+         except Exception as e:
+             logger.error(f"Error processing bbox with label {label}: {str(e)}")
+             continue
+
+     # Initialize the results list
+     recognition_results = figure_results.copy()
+
+     # Process text elements (in batches)
+     if text_elements:
+         text_results = process_element_batch(text_elements, "Read text in the image.", max_batch_size)
+         recognition_results.extend(text_results)
+
+     # Process table elements (in batches)
+     if table_elements:
+         table_results = process_element_batch(table_elements, "Parse the table in the image.", max_batch_size)
+         recognition_results.extend(table_results)
+
+     # Sort by reading order
+     recognition_results.sort(key=lambda x: x.get("reading_order", 0))
+
+     return recognition_results
+
+ def generate_markdown(recognition_results):
+     """Generate Markdown content from the recognition results"""
+     markdown_parts = []
+
+     for result in recognition_results:
+         text = result.get("text", "").strip()
+         label = result.get("label", "")
+
+         if text:
+             if label == "tab":
+                 # Table content
+                 markdown_parts.append(f"\n{text}\n")
+             else:
+                 # Regular text content
+                 markdown_parts.append(text)
+
+     return "\n\n".join(markdown_parts)
+
+ # LaTeX rendering configuration
+ latex_delimiters = [
+     {"left": "$$", "right": "$$", "display": True},
+     {"left": "$", "right": "$", "display": False},
+     {"left": "\\[", "right": "\\]", "display": True},
+     {"left": "\\(", "right": "\\)", "display": False},
+ ]
+
+ # Load the custom CSS
+ custom_css = load_css()
+
+ # Read the page header
+ with open("header.html", "r", encoding="utf-8") as file:
+     header = file.read()
+
+ # Build the Gradio interface
+ with gr.Blocks(css=custom_css, title="Dolphin Document Parser") as demo:
+     gr.HTML(header)
+
+     with gr.Row():
+         # Sidebar - file upload and controls
+         with gr.Column(scale=1, elem_classes="sidebar"):
+             # File upload component
+             file = gr.File(
+                 label="Choose PDF or image file",
+                 file_types=[".pdf", ".png", ".jpeg", ".jpg"],
+                 elem_id="file-upload"
+             )
+
+             gr.HTML("选择文件后,点击处理按钮开始解析<br>After selecting the file, click the Process button to start parsing")
+
+             with gr.Row(elem_classes="action-buttons"):
+                 submit_btn = gr.Button("处理文档/Process Document", variant="primary")
+                 clear_btn = gr.ClearButton(value="清空/Clear")
+
+             # Processing status display
+             status_display = gr.Textbox(
+                 label="Processing Status",
+                 value="Ready to process documents",
+                 interactive=False,
+                 max_lines=2
+             )
+
+             # Example files
+             example_root = os.path.join(os.path.dirname(__file__), "examples")
+             if os.path.exists(example_root):
+                 gr.HTML("示例文件/Example Files")
+                 example_files = [
+                     os.path.join(example_root, f)
+                     for f in os.listdir(example_root)
+                     if not f.endswith(".py")
+                 ]
+
+                 examples = gr.Examples(
+                     examples=example_files,
+                     inputs=file,
+                     examples_per_page=10,
+                     elem_id="example-files"
+                 )
+
+         # Main content area
+         with gr.Column(scale=7):
+             with gr.Row(elem_classes="main-content"):
+                 # Preview panel
+                 with gr.Column(scale=1, elem_classes="preview-panel"):
+                     gr.HTML("文件预览/Preview")
+                     pdf_show = PDF(label="", interactive=False, visible=True, height=600)
+                     debug_output = gr.JSON(label="Debug Info", height=100)
+
+                 # Output panel
+                 with gr.Column(scale=1, elem_classes="output-panel"):
+                     with gr.Tabs():
+                         with gr.Tab("Markdown [Render]"):
+                             md_render = gr.Markdown(
+                                 label="",
+                                 height=700,
+                                 show_copy_button=True,
+                                 latex_delimiters=latex_delimiters,
+                                 line_breaks=True,
+                             )
+                         with gr.Tab("Markdown [Content]"):
+                             md_content = gr.TextArea(lines=30, show_copy_button=True)
+                         with gr.Tab("Processing Data"):
+                             json_output = gr.JSON(label="", height=700)
+
+     # Event handling
+     file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
+
+     # Document processing
+     def process_with_status(file_path):
+         """Process the document and update the status"""
+         if file_path is None:
+             return "", "", {}, {}, "Please select a file first"
+
+         # Set the status to processing
+         status = "Processing document..."
+
+         # Run the document processing
+         md_render_result, md_content_result, json_result, debug_result = process_document(file_path)
+
+         # Update the completion status
+         if "Processing Error" in md_render_result:
+             status = "Processing failed - see debug info"
+         else:
+             status = "Processing completed successfully"
+
+         return md_render_result, md_content_result, json_result, debug_result, status
+
+     submit_btn.click(
+         fn=process_with_status,
+         inputs=[file],
+         outputs=[md_render, md_content, json_output, debug_output, status_display],
+     )
+
+     # Clear everything
+     def reset_all():
+         return None, None, "", "", {}, {}, "Ready to process documents"
+
+     clear_btn.click(
+         fn=reset_all,
+         inputs=[],
+         outputs=[file, pdf_show, md_render, md_content, json_output, debug_output, status_display]
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
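
For reference, the pipeline in app.py can also be driven without the Gradio UI. A minimal headless sketch, assuming the pinned requirements are installed, a GPU is available, and that importing `app` is acceptable (the import triggers the startup model initialization; the example path comes from the `examples/` files added in this commit):

```python
# Sketch only - not part of the commit. Importing app loads ByteDance/Dolphin
# at startup, so the first import is slow.
from app import process_document

md_render, md_text, data, debug = process_document("examples/page_3.jpeg")
print(debug["status"], debug["processing_time"])  # e.g. "success" "12.34s"
print(md_text[:300])                              # start of the generated Markdown
```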
examples/page_1.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8984e6b0bffa46e13809b4969e2be559df89e2cf9d6b3d7fb1a78f25aed8e570
+ size 1523572
examples/page_2.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4f4785470676739e2998f04bfc8daaf2e7ae227bf374614f07821ec5a315143
+ size 1478409
examples/page_3.jpeg ADDED

Git LFS Details

  • SHA256: fe6e35a3c888c77ec36cf48cb762556e489e288d30a457a353ac6bba6fab9251
  • Pointer size: 131 Bytes
  • Size of remote file: 449 kB
examples/page_4.png ADDED

Git LFS Details

  • SHA256: 497cdabe38a4db8318284c0f8963304a876ceceebb796059903703834e4713ed
  • Pointer size: 131 Bytes
  • Size of remote file: 372 kB
examples/page_5.jpg ADDED

Git LFS Details

  • SHA256: 17cdc261fcd7eb8db4a0bdfb56dc2b1f77c8890956f8451f810695e115f6f894
  • Pointer size: 131 Bytes
  • Size of remote file: 641 kB
examples/page_6.jpg ADDED

Git LFS Details

  • SHA256: 0e4dfe55790db38d64ff0d4cf2707859e2d17d4c6e254e398fa21ab4239fd6ec
  • Pointer size: 131 Bytes
  • Size of remote file: 975 kB
header.html ADDED
@@ -0,0 +1,447 @@
+ <!DOCTYPE html>
+ <html lang="en" style="color-scheme: light;">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <meta name="color-scheme" content="light">
+     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
+     <style>
+     :root {
+         /* Primary colors */
+         --primary-color: #dceaf6;
+         --primary-light: #f8f9fa;
+         --primary-dark: #9ec9e3;
+
+         /* Accent colors */
+         --accent-color: #bfe2f8;
+         --accent-light: #dceaf6;
+
+         /* Background colors */
+         --bg-color: #e8eff5;
+         --card-bg: #ffffff;
+
+         /* Text colors */
+         --dark-text: #2b2d42;
+         --light-text: #f8f9fa;
+         --muted-text: rgba(43, 45, 66, 0.7);
+
+         /* Borders and shadows */
+         --border-color: rgba(168, 168, 168, 0.432);
+         --card-shadow: 0 4px 20px rgba(104, 104, 104, 0.1);
+
+         /* Interactive states */
+         --hover-bg: rgba(255, 255, 255, 0.5);
+         --active-color: #bfe2f8;
+     }
+
+     .header-container {
+         display: flex;
+         flex-direction: row;
+         justify-content: space-between;
+         align-items: flex-start;
+         background: linear-gradient(135deg,
+             #e4deff 0%,
+             #d8f7ff 100%
+         );
+         padding: 1.8rem;
+         border-radius: 12px;
+         margin-bottom: 1.5rem;
+         box-shadow: var(--card-shadow);
+         position: relative;
+         overflow: hidden;
+     }
+
+     .header-container::before {
+         content: '';
+         position: absolute;
+         top: 0;
+         left: 0;
+         right: 0;
+         bottom: 0;
+         background: linear-gradient(135deg,
+             rgba(255, 255, 255, 0.2) 0%,
+             rgba(255, 255, 255, 0) 100%
+         );
+         pointer-events: none;
+     }
+
+     .header-content {
+         display: flex;
+         flex-direction: column;
+         align-items: center;
+         text-align: center;
+         max-width: 100%;
+         width: 100%;
+     }
+
+     .header-buttons {
+         display: none;
+     }
+
+     .logo-title-container {
+         display: flex;
+         flex-direction: column;
+         align-items: center;
+         margin-bottom: 1.5rem;
+         max-width: 100%;
+         text-align: center;
+     }
+
+     .logo {
+         width: 350px;
+         height: auto;
+         margin-bottom: 1rem;
+         margin-right: 0;
+     }
+
+     .header-title {
+         font-size: 2.2rem;
+         font-weight: 700;
+         color: var(--dark-text);
+         margin: 0;
+         font-family: 'Poppins', 'Segoe UI', sans-serif;
+         line-height: 1.2;
+         text-align: center;
+         max-width: 100%;
+     }
+
+     .header-subtitle {
+         font-size: 1.1rem;
+         color: var(--muted-text);
+         margin: 0 0 1.5rem 0;
+         line-height: 1.6;
+         max-width: 100%;
+         text-align: center;
+         margin-left: auto;
+         margin-right: auto;
+     }
+
+     .link-button {
+         display: flex;
+         align-items: center;
+         padding: 0.7rem 1.2rem;
+         background-color: var(--hover-bg);
+         border-radius: 8px;
+         color: var(--dark-text) !important;
+         text-decoration: none !important;
+         font-weight: 700;
+         font-size: 1.1rem;
+         transition: all 0.3s ease;
+         backdrop-filter: blur(5px);
+         border: 1px solid var(--border-color);
+         width: 100%;
+         margin-bottom: 0.5rem;
+     }
+
+     .link-button:hover {
+         background-color: var(--hover-bg);
+         transform: translateY(-2px);
+         box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
+         text-decoration: none !important;
+         color: var(--dark-text) !important;
+     }
+
+     .link-button i {
+         margin-right: 0.8rem;
+         font-size: 1.2rem;
+         color: var(--primary-dark);
+         min-width: 20px;
+         text-align: center;
+     }
+
+     .link-button * {
+         text-decoration: none !important;
+         color: inherit !important;
+     }
+
+     .feature-grid {
+         display: flex;
+         flex-direction: row;
+         align-items: flex-start;
+         justify-content: center;
+         margin-top: 1.5rem;
+         width: 100%;
+         margin-left: auto;
+         margin-right: auto;
+     }
+
+     .feature-card {
+         flex: 1;
+         padding: 1rem 1rem;
+         background-color: transparent;
+         border: none;
+         box-shadow: none;
+         transition: none;
+         text-align: center;
+         position: relative;
+     }
+
+     .feature-card:hover {
+         transform: none;
+         box-shadow: none;
+     }
+
+     .feature-separator {
+         width: 1px;
+         align-self: stretch;
+         background-color: var(--border-color);
+         margin: 0 1rem;
+     }
+
+     .feature-icon {
+         font-size: 2rem;
+         color: var(--primary-dark);
+         margin-bottom: 1rem;
+     }
+
+     .feature-title {
+         font-weight: 600;
+         color: var(--dark-text);
+         margin-bottom: 0.8rem;
+         font-size: 1.2rem;
+     }
+
+     .feature-desc {
+         font-size: 0.85rem;
+         color: var(--muted-text);
+         line-height: 1.5;
+     }
+
+     /* New navigation button styles */
+     .nav-buttons {
+         display: flex;
+         flex-direction: row;
+         align-items: center;
+         justify-content: center;
+         margin-top: 1rem;
+         margin-bottom: 2rem;
+         background-color: rgba(255, 255, 255, 0.7);
+         border-radius: 12px;
+         border: 1px solid var(--border-color);
+         padding: 0.5rem 1rem;
+         max-width: none;
+         width: auto;
+         align-self: center;
+         margin-left: auto;
+         margin-right: auto;
+     }
+
+     .nav-link {
+         display: flex;
+         align-items: center;
+         padding: 0.5rem 1rem;
+         color: var(--dark-text) !important;
+         text-decoration: none !important;
+         font-weight: 600;
+         font-size: 1rem;
+         transition: all 0.3s ease;
+     }
+
+     .nav-link:hover {
+         transform: translateY(-3px);
+         color: var(--primary-dark) !important;
+         background-color: rgba(255, 255, 255, 0.8);
+     }
+
+     .nav-link i {
+         margin-right: 0.5rem;
+         font-size: 1.1rem;
+         color: var(--primary-dark);
+     }
+
+     .nav-separator {
+         height: 20px;
+         width: 1px;
+         background-color: var(--border-color);
+         margin: 0 0.5rem;
+     }
+
+     @media (max-width: 960px) {
+         .header-container {
+             flex-direction: column;
+             padding: 1.5rem;
+         }
+
+         .header-content {
+             max-width: 100%;
+             margin-bottom: 2rem;
+         }
+
+         .header-buttons {
+             width: 100%;
+             margin-left: 0;
+         }
+
+         .logo-title-container {
+             flex-direction: column;
+             align-items: center;
+         }
+
+         .logo {
+             width: 250px;
+             margin-bottom: 1rem;
+             margin-right: 0;
+         }
+
+         .header-title {
+             font-size: 1.8rem;
+         }
+
+         .feature-grid {
+             flex-direction: column;
+         }
+
+         .feature-card {
+             width: 100%;
+             padding: 1rem 0;
+         }
+
+         .feature-separator {
+             width: 100%;
+             height: 1px;
+             margin: 0.5rem 0;
+         }
+
+         .nav-buttons {
+             flex-wrap: wrap;
+             justify-content: center;
+         }
+
+         .nav-link {
+             padding: 0.5rem;
+             font-size: 0.9rem;
+         }
+
+         .nav-separator {
+             display: none;
+         }
+
+         .feature-desc {
+             font-size: 0.9rem;
+         }
+     }
+
+     /* Styles that disable dark mode */
+     @media (prefers-color-scheme: dark) {
+         /* Force light-mode colors */
+         .header-container,
+         .header-content,
+         .logo-title-container,
+         .header-title,
+         .header-subtitle,
+         .nav-buttons,
+         .nav-link,
+         .feature-grid,
+         .feature-card,
+         .feature-title,
+         .feature-desc,
+         body,
+         * {
+             color-scheme: light !important; /* Force the light color scheme */
+             color: var(--dark-text) !important; /* Force dark text */
+             background-color: initial; /* Keep the original background color */
+         }
+     }
+
+     /* Global style override to make sure all text uses the colors we specify */
+     body, p, h1, h2, h3, h4, h5, h6, span, div, a {
+         color: var(--dark-text) !important;
+     }
+
+     .feature-desc {
+         color: var(--muted-text) !important;
+     }
+
+     /* Make sure icon colors are not affected by dark mode either */
+     .feature-icon i, .nav-link i {
+         color: var(--primary-dark) !important;
+     }
+
+     /* Navigation link hover effect */
+     .nav-link:hover {
+         color: var(--primary-dark) !important;
+     }
+     </style>
+ </head>
+ <body>
+     <div class="header-container">
+         <div class="header-content">
+             <div class="logo-title-container">
+                 <img src="https://raw.githubusercontent.com/bytedance/Dolphin/master/assets/dolphin.png" alt="Dolphin Logo" class="logo">
+                 <h1 class="header-title">Document Image Parsing via Heterogeneous Anchor Prompting</h1>
+             </div>
+
+             <p class="header-subtitle">
+                 A novel multimodal document image parsing model, following an analyze-then-parse paradigm for parallel decoding
+                 <!-- <br>
+                 Stage 1: Comprehensive page-level layout analysis by generating element sequence in natural reading order.
+                 <br>
+                 Stage 2: Efficient parallel parsing of document elements using heterogeneous anchors and task-specific prompts. -->
+             </p>
+
+             <!-- New navigation buttons -->
+             <div class="nav-buttons">
+                 <!-- <a href="https://mineru.org.cn/home?source=huggingface" class="nav-link">
+                     <i class="fas fa-home"></i> 主页/Homepage
+                 </a> -->
+                 <!-- <div class="nav-separator"></div> -->
+                 <a href="https://arxiv.org/abs/2505.14059" class="nav-link">
+                     <i class="fas fa-file-alt"></i> 论文/Paper
+                 </a>
+                 <div class="nav-separator"></div>
+                 <a href="https://huggingface.co/ByteDance/Dolphin" class="nav-link">
+                     <i class="fas fa-cube"></i> 模型/Model
+                 </a>
+                 <div class="nav-separator"></div>
+                 <a href="https://github.com/bytedance/Dolphin" class="nav-link">
+                     <i class="fas fa-code"></i> 代码/Code
+                 </a>
+                 <div class="nav-separator"></div>
+                 <a href="https://opensource.org/licenses/MIT" class="nav-link">
+                     <i class="fas fa-balance-scale"></i> 许可证/License
+                 </a>
+             </div>
+
+             <div class="feature-grid">
+                 <div class="feature-card">
+                     <div class="feature-icon"><i class="fas fa-file-import"></i></div>
+                     <div class="feature-title">支持格式/Support Format</div>
+                     <div class="feature-desc">支持多页PDF、单页图像<br>Multi-page PDF, single document image (JPEG/PNG)</div>
+                 </div>
+
+                 <div class="feature-separator"></div>
+
+                 <div class="feature-card">
+                     <div class="feature-icon"><i class="fas fa-feather-alt"></i></div>
+                     <div class="feature-title">轻量级模型/Lightweight Model</div>
+                     <div class="feature-desc">Dolphin模型参数量322M,高效易部署<br>Lightweight (322M) and efficient, easy to deploy</div>
+                 </div>
+
+                 <div class="feature-separator"></div>
+
+                 <div class="feature-card">
+                     <div class="feature-icon"><i class="fas fa-tasks"></i></div>
+                     <div class="feature-title">并行解析/Parallel Parsing</div>
+                     <div class="feature-desc">Dolphin并行解析多个文本块<br>Parsing several text blocks in a batch for speed up</div>
+                 </div>
+
+                 <div class="feature-separator"></div>
+
+                 <div class="feature-card">
+                     <div class="feature-icon"><i class="fas fa-superscript"></i></div>
+                     <div class="feature-title">公式和表格/Formula and Table</div>
+                     <div class="feature-desc">支持公式(LaTeX格式)、表格(HTML格式)输出<br>Support formulas (LaTeX format) and tables (HTML format)</div>
+                 </div>
+             </div>
+
+             <!-- Disclaimer -->
+             <p style="
+                 font-size: 0.8rem;
+                 color: var(--muted-text) !important;
+                 margin-top: 1.5rem;
+                 text-align: center;
+             ">内容由 AI 生成,请仔细甄别</p>
+         </div>
+     </div>
+ </body>
+ </html>
inference_hugg.py ADDED
@@ -0,0 +1,287 @@
+ """
+ Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+ SPDX-License-Identifier: MIT
+ """
+
+ import argparse
+ import glob
+ import os
+
+ import cv2
+ import torch
+ from PIL import Image
+ from transformers import AutoProcessor, VisionEncoderDecoderModel
+
+ from utils.utils import *
+
+
+ class DOLPHIN:
+     def __init__(self, model_id_or_path):
+         """Initialize the Hugging Face model
+
+         Args:
+             model_id_or_path: Path to local model or Hugging Face model ID
+         """
+         # Load model from local path or Hugging Face hub
+         self.processor = AutoProcessor.from_pretrained(model_id_or_path)
+         self.model = VisionEncoderDecoderModel.from_pretrained(model_id_or_path)
+         self.model.eval()
+
+         # Set device and precision
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model.to(self.device)
+         self.model = self.model.half()  # Always use half precision by default
+
+         # Set tokenizer
+         self.tokenizer = self.processor.tokenizer
+
+     def chat(self, prompt, image):
+         """Process an image or batch of images with the given prompt(s)
+
+         Args:
+             prompt: Text prompt or list of prompts to guide the model
+             image: PIL Image or list of PIL Images to process
+
+         Returns:
+             Generated text or list of texts from the model
+         """
+         # Check if we're dealing with a batch
+         is_batch = isinstance(image, list)
+
+         if not is_batch:
+             # Single image, wrap it in a list for consistent processing
+             images = [image]
+             prompts = [prompt]
+         else:
+             # Batch of images
+             images = image
+             prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
+
+         # Prepare image
+         batch_inputs = self.processor(images, return_tensors="pt", padding=True)
+         batch_pixel_values = batch_inputs.pixel_values.half().to(self.device)
+
+         # Prepare prompt
+         prompts = [f"<s>{p} <Answer/>" for p in prompts]
+         batch_prompt_inputs = self.tokenizer(
+             prompts,
+             add_special_tokens=False,
+             return_tensors="pt"
+         )
+
+         batch_prompt_ids = batch_prompt_inputs.input_ids.to(self.device)
+         batch_attention_mask = batch_prompt_inputs.attention_mask.to(self.device)
+
+         # Generate text
+         outputs = self.model.generate(
+             pixel_values=batch_pixel_values,
+             decoder_input_ids=batch_prompt_ids,
+             decoder_attention_mask=batch_attention_mask,
+             min_length=1,
+             max_length=4096,
+             pad_token_id=self.tokenizer.pad_token_id,
+             eos_token_id=self.tokenizer.eos_token_id,
+             use_cache=True,
+             bad_words_ids=[[self.tokenizer.unk_token_id]],
+             return_dict_in_generate=True,
+             do_sample=False,
+             num_beams=1,
+             repetition_penalty=1.1
+         )
+
+         # Process output
+         sequences = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)
+
+         # Clean prompt text from output
+         results = []
+         for i, sequence in enumerate(sequences):
+             cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
+             results.append(cleaned)
+
+         # Return a single result for single image input
+         if not is_batch:
+             return results[0]
+         return results
+
+
+ def process_page(image_path, model, save_dir, max_batch_size=None):
+     """Parse document images with two stages"""
+     # Stage 1: Page-level layout and reading order parsing
+     pil_image = Image.open(image_path).convert("RGB")
+     layout_output = model.chat("Parse the reading order of this document.", pil_image)
+
+     # Stage 2: Element-level content parsing
+     padded_image, dims = prepare_image(pil_image)
+     recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
+
+     # Save outputs
+     json_path = save_outputs(recognition_results, image_path, save_dir)
+
+     return json_path, recognition_results
+
+
+ def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
+     """Parse all document elements with parallel decoding"""
+     layout_results = parse_layout_string(layout_results)
+
+     # Store text and table elements separately
+     text_elements = []    # Text elements
+     table_elements = []   # Table elements
+     figure_results = []   # Image elements (no processing needed)
+     previous_box = None
+     reading_order = 0
+
+     # Collect elements to process and group by type
+     for bbox, label in layout_results:
+         try:
+             # Adjust coordinates
+             x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates(
+                 bbox, padded_image, dims, previous_box
+             )
+
+             # Crop and parse element
+             cropped = padded_image[y1:y2, x1:x2]
+             if cropped.size > 0:
+                 if label == "fig":
+                     # For figure regions, add empty text result immediately
+                     figure_results.append(
+                         {
+                             "label": label,
+                             "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                             "text": "",
+                             "reading_order": reading_order,
+                         }
+                     )
+                 else:
+                     # Prepare element for parsing
+                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+                     element_info = {
+                         "crop": pil_crop,
+                         "label": label,
+                         "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
+                         "reading_order": reading_order,
+                     }
+
+                     # Group by type
+                     if label == "tab":
+                         table_elements.append(element_info)
+                     else:  # Text elements
+                         text_elements.append(element_info)
+
+             reading_order += 1
+
+         except Exception as e:
+             print(f"Error processing bbox with label {label}: {str(e)}")
+             continue
+
+     # Initialize results list
+     recognition_results = figure_results.copy()
+
+     # Process text elements (in batches)
+     if text_elements:
+         text_results = process_element_batch(text_elements, model, "Read text in the image.", max_batch_size)
+         recognition_results.extend(text_results)
+
+     # Process table elements (in batches)
+     if table_elements:
+         table_results = process_element_batch(table_elements, model, "Parse the table in the image.", max_batch_size)
+         recognition_results.extend(table_results)
+
+     # Sort elements by reading order
+     recognition_results.sort(key=lambda x: x.get("reading_order", 0))
+
+     return recognition_results
+
+
+ def process_element_batch(elements, model, prompt, max_batch_size=None):
+     """Process elements of the same type in batches"""
+     results = []
+
+     # Determine batch size
+     batch_size = len(elements)
+     if max_batch_size is not None and max_batch_size > 0:
+         batch_size = min(batch_size, max_batch_size)
+
+     # Process in batches
+     for i in range(0, len(elements), batch_size):
+         batch_elements = elements[i:i+batch_size]
+         crops_list = [elem["crop"] for elem in batch_elements]
+
+         # Use the same prompt for all elements in the batch
+         prompts_list = [prompt] * len(crops_list)
+
+         # Batch inference
+         batch_results = model.chat(prompts_list, crops_list)
+
+         # Add results
+         for j, result in enumerate(batch_results):
+             elem = batch_elements[j]
+             results.append({
+                 "label": elem["label"],
+                 "bbox": elem["bbox"],
+                 "text": result.strip(),
+                 "reading_order": elem["reading_order"],
+             })
+
+     return results
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
+     parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
+     parser.add_argument(
+         "--save_dir",
+         type=str,
+         default=None,
+         help="Directory to save parsing results (default: same as input directory)",
+     )
+     parser.add_argument(
+         "--max_batch_size",
+         type=int,
+         default=16,
+         help="Maximum number of document elements to parse in a single batch (default: 16)",
+     )
+     args = parser.parse_args()
+
+     # Load Model
+     model = DOLPHIN("ByteDance/Dolphin")
+
+     # Collect Document Images
+     if os.path.isdir(args.input_path):
+         image_files = []
+         for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
+             image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
+         image_files = sorted(image_files)
+     else:
+         if not os.path.exists(args.input_path):
+             raise FileNotFoundError(f"Input path {args.input_path} does not exist")
+         image_files = [args.input_path]
+
+     save_dir = args.save_dir or (
+         args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
+     )
+     setup_output_dirs(save_dir)
+
+     total_samples = len(image_files)
+     print(f"\nTotal samples to process: {total_samples}")
+
+     # Process All Document Images
+     for image_path in image_files:
+         print(f"\nProcessing {image_path}")
+         try:
+             json_path, recognition_results = process_page(
+                 image_path=image_path,
+                 model=model,
+                 save_dir=save_dir,
+                 max_batch_size=args.max_batch_size,
+             )
+
+             print(f"Processing completed. Results saved to {save_dir}")
+
+         except Exception as e:
+             print(f"Error processing {image_path}: {str(e)}")
+             continue
+
+
+ if __name__ == "__main__":
+     main()
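
The DOLPHIN wrapper above can also be exercised directly, without the CLI. A minimal sketch, assuming `ByteDance/Dolphin` downloads successfully; the image path is illustrative:

```python
# Sketch only - stage-1 layout parsing with the DOLPHIN class from this file.
from PIL import Image
from inference_hugg import DOLPHIN

model = DOLPHIN("ByteDance/Dolphin")
page = Image.open("demo/some_page.png").convert("RGB")  # illustrative path
layout = model.chat("Parse the reading order of this document.", page)
print(layout)  # element sequence in natural reading order
```

The equivalent batch run through the CLI is `python inference_hugg.py --input_path ./demo --max_batch_size 16`.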
pyproject.toml ADDED
@@ -0,0 +1,16 @@
+ [tool.black]
+ line-length = 120
+ include = '\.pyi?$'
+ exclude = '''
+ /(
+     \.git
+   | \.hg
+   | \.mypy_cache
+   | \.tox
+   | \.venv
+   | _build
+   | buck-out
+   | build
+   | dist
+ )/
+ '''
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio==5.24.0
+ gradio_pdf==0.0.22
+ pymupdf==1.25.5
+ loguru==0.7.3
+ torch==2.1.0
+ transformers==4.47.0
+ opencv-python==4.11.0.86
+ opencv-python-headless==4.5.5.64
+ Pillow==9.3.0
+ numpy==1.24.4
+ spaces
+ albumentations==1.4.0
+ requests==2.32.3
+ httpx==0.23.0
static/styles.css ADDED
@@ -0,0 +1,306 @@
+ :root {
+     /* Primary colors */
+     --primary-color: #dceaf6;
+     --primary-light: #f8f9fa;
+     --primary-dark: #9ec9e3;
+
+     /* Accent colors */
+     --accent-color: #bfe2f8;
+     --accent-light: #dceaf6;
+
+     /* Background colors */
+     --bg-color: #e8eff5;
+     --card-bg: #ffffff;
+
+     /* Text colors */
+     --dark-text: #2b2d42;
+     --light-text: #f8f9fa;
+     --muted-text: rgba(43, 45, 66, 0.7);
+
+     /* Borders and shadows */
+     --border-color: rgba(168, 168, 168, 0.432);
+     --card-shadow: 0 4px 20px rgba(104, 104, 104, 0.1);
+
+     /* Interactive states */
+     --hover-bg: rgba(255, 255, 255, 0.5);
+     --active-color: #bfe2f8;
+ }
+
+ body {
+     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+     background-color: var(--bg-color);
+ }
+
+ /* Card styles */
+ .gradio-container {
+     max-width: 95% !important;
+     width: 95% !important;
+     margin-left: auto !important;
+     margin-right: auto !important;
+ }
+
+ /* Panel styles */
+ .panel {
+     border-radius: 12px !important;
+     border: 1px solid var(--border-color) !important;
+     box-shadow: var(--card-shadow) !important;
+     background-color: var(--card-bg) !important;
+     padding: 1.5rem !important;
+ }
+
+ /* Button styles */
+ button.primary {
+     border-radius: 8px !important;
+ }
+
+ button {
+     border-radius: 8px !important;
+     border: 1px solid var(--border-color) !important;
+     background-color: var(--hover-bg) !important;
+     color: var(--dark-text) !important;
+     transition: all 0.3s ease !important;
+ }
+
+ button:hover {
+     transform: translateY(-2px) !important;
+     box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1) !important;
+     background-color: var(--hover-bg) !important;
+ }
+
+ /* File upload area */
+ .file-preview {
+     border-radius: 8px !important;
+     border: 1px dashed var(--border-color) !important;
+ }
+
+ .file-preview:hover {
+     border-color: var(--primary-dark) !important;
+ }
+
+ /* Make sure all link buttons are styled correctly */
+ .header-buttons a,
+ .header-buttons a:hover,
+ .header-buttons a:visited,
+ .header-buttons a:active {
+     text-decoration: none !important;
+     color: var(--dark-text) !important;
+ }
+
+ /* Override any possible inline styles */
+ .header-buttons a[style] {
+     text-decoration: none !important;
+     color: var(--dark-text) !important;
+ }
+
+ /* Make sure no element inside a link has an underline */
+ .header-buttons a *,
+ .header-buttons a:hover * {
+     text-decoration: none !important;
+ }
+
+ /* Hide the page footer */
+ footer, .footer, .footer-links, .gradio-footer {
+     display: none !important;
+ }
+
+ /* Hide the bottom toolbar */
+ .gradio-container > div:last-child {
+     display: none !important;
+ }
+
+ /* Hide the bottom API and settings buttons */
+ .fixed-bottom {
+     display: none !important;
+ }
+
+ /* Hide the Gradio branding */
+ .gr-prose p:last-child {
+     display: none !important;
+ }
+
+ /* Hide any remaining bottom elements */
+ [class*="footer"], [id*="footer"], [class*="bottom-bar"], [id*="bottom-bar"] {
+     display: none !important;
+ }
+
+ /* Sidebar styles */
+ .sidebar {
+     background-color: var(--card-bg);
+     border-radius: 12px;
+     border: 1px solid var(--border-color);
+     box-shadow: var(--card-shadow);
+     padding: 1rem;
+     margin-right: 1rem;
+ }
+
+ /* Upload button styles */
+ .upload-button {
+     display: flex;
+     align-items: center;
+     justify-content: center;
+     border: 2px dashed var(--border-color);
+     padding: 1rem;
+     margin-bottom: 1rem;
+     cursor: pointer;
+     transition: all 0.3s ease;
+ }
+
+ .upload-button:hover {
+     border-color: var(--primary-dark);
+     background-color: rgba(158, 201, 227, 0.1);
+ }
+
+ .upload-button i {
+     font-size: 1.5rem;
+     color: var(--primary-dark);
+     margin-right: 0.5rem;
+ }
+
+ /* Example file list styles */
+ .example-list {
+     list-style-type: none;
+     padding: 0;
+     margin: 0;
+ }
+
+ .example-item {
+     display: flex;
+     align-items: center;
+     padding: 0.5rem;
+     border-radius: 8px;
+     margin-bottom: 0.5rem;
+     cursor: pointer;
+     transition: all 0.3s ease;
+ }
+
+ .example-item:hover {
+     background-color: rgba(158, 201, 227, 0.1);
+ }
+
+ .example-item i {
+     font-size: 1.2rem;
+     color: var(--primary-dark);
+     margin-right: 0.5rem;
+ }
+
+ .example-item-name {
+     white-space: nowrap;
+     overflow: hidden;
+     text-overflow: ellipsis;
+ }
+
+ /* Cancel and confirm button styles */
+ .action-buttons {
+     display: flex;
+     justify-content: flex-end;
+ }
+
+ /* Clear button styles */
+ button[value="清空/Clear"] {
+     color: #e74c3c !important;
+ }
+
+ /* Hide the raw file upload component */
+ .file-upload {
+     display: none !important;
+ }
+
+ /* Main content styles */
+ .main-content {
+     display: flex;
+     flex: 1;
+ }
+
+ /* Preview panel styles */
+ .preview-panel {
+     flex: 1;
+     background-color: var(--card-bg);
+     border-radius: 12px;
+     border: 1px solid var(--border-color);
+     box-shadow: var(--card-shadow);
+     padding: 1rem;
+     margin-right: 1rem;
+ }
+
+ /* Output panel styles */
+ .output-panel {
+     flex: 1;
+     background-color: var(--card-bg);
+     border-radius: 12px;
+     border: 1px solid var(--border-color);
+     box-shadow: var(--card-shadow);
+     padding: 1rem;
+ }
+
+ /* Responsive layout */
+ @media (max-width: 768px) {
+     .main-content {
+         flex-direction: column;
+     }
+
+     .sidebar, .preview-panel, .output-panel {
+         margin-right: 0;
+         margin-bottom: 1rem;
+         width: 100%;
+     }
+ }
+
+ /* Polish the file upload component */
+ #file-upload {
+     margin-bottom: 1.5rem;
+ }
+
+ #file-upload .file-preview {
+     border: 2px dashed var(--border-color);
+     padding: 1.5rem;
+     transition: all 0.3s ease;
+     text-align: center;
+ }
+
+ #file-upload .file-preview:hover {
+     border-color: var(--primary-dark);
+     background-color: rgba(158, 201, 227, 0.1);
+ }
+
+ /* Hide the original label */
+ #file-upload .label-wrap {
+     display: none;
+ }
+
+ /* Polish the example file list */
+ #example-files .gr-samples-table {
+     border: none;
+     background: transparent;
+ }
+
+ #example-files .gr-samples-table td {
+     border: none;
+     padding: 0.5rem;
+     transition: all 0.3s ease;
+     border-radius: 8px;
+ }
+
+ #example-files .gr-samples-table tr:hover td {
+     background-color: rgba(158, 201, 227, 0.1);
+ }
+
+ #example-files .gr-samples-table td a {
+     display: flex;
+     align-items: center;
+     color: var(--dark-text);
+     text-decoration: none;
+ }
+
+ #example-files .gr-samples-table td a::before {
+     content: "\f1c1";
+     font-family: "Font Awesome 6 Free";
+     font-weight: 900;
+     margin-right: 0.5rem;
+     color: var(--primary-dark);
+     font-size: 1.2rem;
+ }
+
+ /* Hide the pagination controls */
+ #example-files .gr-samples-pagination {
+     display: none;
+ }
utils/markdown_utils.py ADDED
@@ -0,0 +1,442 @@
+ """
+ Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+ SPDX-License-Identifier: MIT
+ """
+
+ import re
+ import base64
+ from typing import List, Dict, Any, Optional
+
+
+ """
+ Example input:
+ [
+     {"label": "tab", "bbox": [0.176, 0.74, 0.824, 0.82], "text": "<table><tr><td></td><td>HellaSwag</td><td>Obqa</td><td>WinoGrande</td><td>ARC-c</td><td>ARC-e</td><td>boolq</td><td>piqa</td><td>Avg</td></tr><tr><td>OPT-1.3B</td><td>53.65</td><td>33.40</td><td>59.59</td><td>29.44</td><td>50.80</td><td>60.83</td><td>72.36</td><td>51.44</td></tr><tr><td>Pythia-1.0B</td><td>47.16</td><td>31.40</td><td>53.43</td><td>27.05</td><td>48.99</td><td>57.83</td><td>69.21</td><td>48.30</td></tr><tr><td>Pythia-1.4B</td><td>52.01</td><td>33.20</td><td>57.38</td><td>28.50</td><td>54.00</td><td>63.27</td><td>70.95</td><td>51.33</td></tr><tr><td>TinyLlama-1.1B</td><td>59.20</td><td>36.00</td><td>59.12</td><td>30.10</td><td>55.25</td><td>57.83</td><td>73.29</td><td>52.99</td></tr></table>", "reading_order": 6},
+     {"label": "cap", "bbox": [0.28, 0.729, 0.711, 0.74], "text": "Table 2: Zero-shot performance on commonsense reasoning tasks", "reading_order": 7},
+     {"label": "para", "bbox": [0.176, 0.848, 0.826, 0.873], "text": "We of performance during training We tracked the accuracy of TinyLlama on common-\nsense reasoning benchmarks during its pre-training, as shown in Fig. 2 . Generally, the performance of", "reading_order": 8},
+     {"label": "fnote", "bbox": [0.176, 0.88, 0.824, 0.912], "text": "${ }^{4}$ Due to a bug in the config file, the learning rate did not decrease immediately after warmup and remained at\nthe maximum value for several steps before we fixed this.", "reading_order": 9},
+     {"label": "foot", "bbox": [0.496, 0.939, 0.501, 0.95], "text": "14", "reading_order": 10}
+ ]
+ """
+
+
+ def extract_table_from_html(html_string):
+     """Extract and clean table tags from HTML string"""
+     try:
+         table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+         tables = table_pattern.findall(html_string)
+         tables = [re.sub(r'<table[^>]*>', '<table>', table) for table in tables]
+         return '\n'.join(tables)
+     except Exception as e:
+         print(f"extract_table_from_html error: {str(e)}")
+         return f"<table><tr><td>Error extracting table: {str(e)}</td></tr></table>"
+
+
+ class MarkdownConverter:
+     """Convert structured recognition results to Markdown format"""
+
+     def __init__(self):
+         # Define heading levels for different section types
+         self.heading_levels = {
+             'title': '#',
+             'sec': '##',
+             'sub_sec': '###'
+         }
+
+         # Define which labels need special handling
+         self.special_labels = {
+             'tab', 'fig', 'title', 'sec', 'sub_sec',
+             'list', 'formula', 'reference', 'alg'
+         }
+
+     def try_remove_newline(self, text: str) -> str:
+         try:
+             # Preprocess text to handle line breaks
+             text = text.strip()
+             text = text.replace('-\n', '')
+
+             # Handle Chinese text line breaks
+             def is_chinese(char):
+                 return '\u4e00' <= char <= '\u9fff'
+
+             lines = text.split('\n')
+             processed_lines = []
+
+             # Process all lines except the last one
+             for i in range(len(lines)-1):
+                 current_line = lines[i].strip()
+                 next_line = lines[i+1].strip()
+
+                 # Always add the current line, but determine if we need a newline
+                 if current_line:  # If current line is not empty
+                     if next_line:  # If next line is not empty
+                         # For Chinese text handling
+                         if is_chinese(current_line[-1]) and is_chinese(next_line[0]):
+                             processed_lines.append(current_line)
+                         else:
+                             processed_lines.append(current_line + ' ')
+                     else:
+                         # Next line is empty, add current line with newline
+                         processed_lines.append(current_line + '\n')
+                 else:
+                     # Current line is empty, add an empty line
+                     processed_lines.append('\n')
+
+             # Add the last line
+             if lines and lines[-1].strip():
+                 processed_lines.append(lines[-1].strip())
+
+             text = ''.join(processed_lines)
+
+             return text
+         except Exception as e:
+             print(f"try_remove_newline error: {str(e)}")
+             return text  # Return original text on error
+
+     def _handle_text(self, text: str) -> str:
+         """
+         Process regular text content, preserving paragraph structure
+         """
+         try:
+             if not text:
+                 return ""
+
+             if text.strip().startswith("\\begin{array}") and text.strip().endswith("\\end{array}"):
+                 text = "$$" + text + "$$"
+             elif ("_{" in text or "^{" in text or "\\" in text or "_ {" in text or "^ {" in text) and ("$" not in text) and ("\\begin" not in text):
+                 text = "$" + text + "$"
+
+             # Process formulas in text before handling other text processing
+             text = self._process_formulas_in_text(text)
+
+             text = self.try_remove_newline(text)
+
+             # Return processed text
+             return text
+         except Exception as e:
+             print(f"_handle_text error: {str(e)}")
+             return text  # Return original text on error
+
120
+ def _process_formulas_in_text(self, text: str) -> str:
121
+ """
122
+ Process mathematical formulas in text by iteratively finding and replacing formulas.
123
+ - Identify inline and block formulas
124
+ - Replace newlines within formulas with \\
125
+ """
126
+ try:
127
+ # Define formula delimiters and their corresponding patterns
128
+ delimiters = [
129
+ ('$$', '$$'), # Block formula with $$
130
+ ('\\[', '\\]'), # Block formula with \[ \]
131
+ ('$', '$'), # Inline formula with $
132
+ ('\\(', '\\)') # Inline formula with \( \)
133
+ ]
134
+
135
+ # Process the text by iterating through each delimiter type
136
+ result = text
137
+
138
+ for start_delim, end_delim in delimiters:
139
+ # Create a pattern that matches from start to end delimiter
140
+ # Using a custom approach to avoid issues with nested delimiters
141
+ current_pos = 0
142
+ processed_parts = []
143
+
144
+ while current_pos < len(result):
145
+ # Find the next start delimiter
146
+ start_pos = result.find(start_delim, current_pos)
147
+ if start_pos == -1:
148
+ # No more formulas of this type
149
+ processed_parts.append(result[current_pos:])
150
+ break
151
+
152
+ # Add text before the formula
153
+ processed_parts.append(result[current_pos:start_pos])
154
+
155
+ # Find the matching end delimiter
156
+ end_pos = result.find(end_delim, start_pos + len(start_delim))
157
+ if end_pos == -1:
158
+ # No matching end delimiter, treat as regular text
159
+ processed_parts.append(result[start_pos:])
160
+ break
161
+
162
+ # Extract the formula content (without delimiters)
163
+ formula_content = result[start_pos + len(start_delim):end_pos]
164
+
165
+ # Process the formula content - replace newlines with \\
166
+ processed_formula = formula_content.replace('\n', ' \\\\ ')
167
+
168
+ # Add the processed formula with its delimiters
169
+ processed_parts.append(f"{start_delim}{processed_formula}{end_delim}")
170
+
171
+ # Move past this formula
172
+ current_pos = end_pos + len(end_delim)
173
+
174
+ # Update the result with processed text
175
+ result = ''.join(processed_parts)
176
+ return result
177
+ except Exception as e:
178
+ print(f"_process_formulas_in_text error: {str(e)}")
179
+ return text # Return original text on error
180
+
181
+ def _remove_newline_in_heading(self, text: str) -> str:
182
+ """
183
+ Remove newline in heading
184
+ """
185
+ try:
186
+ # Handle Chinese text line breaks
187
+ def is_chinese(char):
188
+ return '\u4e00' <= char <= '\u9fff'
189
+
190
+ # Check if the text contains Chinese characters
191
+ if any(is_chinese(char) for char in text):
192
+ return text.replace('\n', '')
193
+ else:
194
+ return text.replace('\n', ' ')
195
+
196
+ except Exception as e:
197
+ print(f"_remove_newline_in_heading error: {str(e)}")
198
+ return text
199
+
200
+     def _handle_heading(self, text: str, label: str) -> str:
+         """
+         Convert section headings to appropriate markdown format
+         """
+         try:
+             level = self.heading_levels.get(label, '#')
+             text = text.strip()
+             text = self._remove_newline_in_heading(text)
+             text = self._handle_text(text)
+             return f"{level} {text}\n\n"
+         except Exception as e:
+             print(f"_handle_heading error: {str(e)}")
+             return f"# Error processing heading: {text}\n\n"
+ 
+     def _handle_list_item(self, text: str) -> str:
+         """
+         Convert list items to markdown list format
+         """
+         try:
+             return f"- {text.strip()}\n"
+         except Exception as e:
+             print(f"_handle_list_item error: {str(e)}")
+             return f"- Error processing list item: {text}\n"
+ 
+     def _handle_figure(self, text: str, section_count: int) -> str:
+         """
+         Convert base64 encoded image to markdown image syntax
+         """
+         try:
+             if text.startswith("data:image/") or (";" in text and "," in text):
+                 # Already a data URI, embed it directly
+                 return f"![Figure {section_count}]({text})\n\n"
+             else:
+                 # Raw base64: build a data URI, assuming PNG when the format
+                 # is not specified
+                 data_uri = f"data:image/png;base64,{text}"
+                 return f"![Figure {section_count}]({data_uri})\n\n"
+         except Exception as e:
+             print(f"_handle_figure error: {str(e)}")
+             return f"*[Error processing figure: {str(e)}]*\n\n"
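+ 
+     # Example (illustrative): a raw base64 payload "iVBORw0..." for section 3
+     # becomes "![Figure 3](data:image/png;base64,iVBORw0...)\n\n".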
+ 
+     def _handle_table(self, text: str) -> str:
+         """
+         Convert table content to markdown format
+         """
+         try:
+             markdown_content = []
+             if '<table' in text.lower() or '<tr' in text.lower():
+                 markdown_table = extract_table_from_html(text)
+                 markdown_content.append(markdown_table + "\n")
+             else:
+                 # Fall back to a naive whitespace split for plain-text tables
+                 table_lines = text.split('\n')
+                 if table_lines:
+                     col_count = len(table_lines[0].split()) if table_lines[0] else 1
+                     header = '| ' + ' | '.join(table_lines[0].split()) + ' |'
+                     markdown_content.append(header)
+                     markdown_content.append('| ' + ' | '.join(['---'] * col_count) + ' |')
+                     for line in table_lines[1:]:
+                         cells = line.split()
+                         while len(cells) < col_count:
+                             cells.append('')
+                         markdown_content.append('| ' + ' | '.join(cells) + ' |')
+             return '\n'.join(markdown_content) + '\n\n'
+         except Exception as e:
+             print(f"_handle_table error: {str(e)}")
+             return f"*[Error processing table: {str(e)}]*\n\n"
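+ 
+     # Example (illustrative) for the plain-text fallback:
+     #   "Name Score\nAda 10"  ->
+     #   "| Name | Score |\n| --- | --- |\n| Ada | 10 |\n\n"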
+ 
+     def _handle_algorithm(self, text: str) -> str:
+         """
+         Process algorithm blocks with proper formatting
+         """
+         try:
+             # Remove algorithm environment tags if present
+             text = re.sub(r'\\begin\{algorithm\}(.*?)\\end\{algorithm\}', r'\1', text, flags=re.DOTALL)
+             text = text.replace('\\begin{algorithm}', '').replace('\\end{algorithm}', '')
+             text = text.replace('\\begin{algorithmic}', '').replace('\\end{algorithmic}', '')
+ 
+             # Process the algorithm text
+             lines = text.strip().split('\n')
+ 
+             # Check if there's a caption or label
+             caption = ""
+             algorithm_text = []
+ 
+             for line in lines:
+                 if '\\caption' in line:
+                     # Extract caption text
+                     caption_match = re.search(r'\\caption\{(.*?)\}', line)
+                     if caption_match:
+                         caption = f"**{caption_match.group(1)}**\n\n"
+                     continue
+                 elif '\\label' in line:
+                     continue  # Skip label lines
+                 else:
+                     algorithm_text.append(line)
+ 
+             # Join the algorithm text and wrap in code block
+             formatted_text = '\n'.join(algorithm_text)
+ 
+             # Return the formatted algorithm with caption
+             return f"{caption}```\n{formatted_text}\n```\n\n"
+         except Exception as e:
+             print(f"_handle_algorithm error: {str(e)}")
+             return f"*[Error processing algorithm: {str(e)}]*\n\n{text}\n\n"
+ 
+     def _handle_formula(self, text: str) -> str:
+         """
+         Handle formula-specific content
+         """
+         try:
+             # Process the formula content
+             processed_text = self._process_formulas_in_text(text)
+ 
+             # For formula blocks, ensure they're properly formatted in markdown
+             if '$$' not in processed_text and '\\[' not in processed_text:
+                 # If no block formula delimiters are present, wrap in $$ for block formula
+                 processed_text = f'$${processed_text}$$'
+ 
+             return f"{processed_text}\n\n"
+         except Exception as e:
+             print(f"_handle_formula error: {str(e)}")
+             return f"*[Error processing formula: {str(e)}]*\n\n"
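+ 
+     # Example (illustrative): "E = mc^{2}" carries no block delimiters, so it
+     # is returned as "$$E = mc^{2}$$\n\n".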
+ 
+     def convert(self, recognition_results: List[Dict[str, Any]]) -> str:
+         """
+         Convert recognition results to markdown format
+         """
+         try:
+             markdown_content = []
+ 
+             for section_count, result in enumerate(recognition_results):
+                 try:
+                     label = result.get('label', '')
+                     text = result.get('text', '').strip()
+ 
+                     # Skip empty text
+                     if not text:
+                         continue
+ 
+                     # Handle different content types
+                     if label in {'title', 'sec', 'sub_sec'}:
+                         markdown_content.append(self._handle_heading(text, label))
+                     elif label == 'list':
+                         markdown_content.append(self._handle_list_item(text))
+                     elif label == 'fig':
+                         markdown_content.append(self._handle_figure(text, section_count))
+                     elif label == 'tab':
+                         markdown_content.append(self._handle_table(text))
+                     elif label == 'alg':
+                         markdown_content.append(self._handle_algorithm(text))
+                     elif label == 'formula':
+                         markdown_content.append(self._handle_formula(text))
+                     elif label not in self.special_labels:
+                         # Handle regular text (paragraphs, etc.)
+                         processed_text = self._handle_text(text)
+                         markdown_content.append(f"{processed_text}\n\n")
+                 except Exception as e:
+                     print(f"Error processing item {section_count}: {str(e)}")
+                     # Add a placeholder for the failed item
+                     markdown_content.append("*[Error processing content]*\n\n")
+ 
+             # Join all content and apply post-processing
+             result = ''.join(markdown_content)
+             return self._post_process(result)
+         except Exception as e:
+             print(f"convert error: {str(e)}")
+             return f"Error generating markdown content: {str(e)}"
+ 
+     def _post_process(self, markdown_content: str) -> str:
+         """
+         Apply post-processing fixes to the generated markdown content
+         """
+         try:
+             # Handle author information
+             author_pattern = re.compile(r'\\author\{(.*?)\}', re.DOTALL)
+ 
+             def process_author_match(match):
+                 # Extract author content
+                 author_content = match.group(1)
+                 # Process the author content
+                 return self._handle_text(author_content)
+ 
+             # Replace \author{...} with processed content
+             markdown_content = author_pattern.sub(process_author_match, markdown_content)
+ 
+             # Handle special case where author is inside math environment
+             math_author_pattern = re.compile(r'\$(\\author\{.*?\})\$', re.DOTALL)
+             match = math_author_pattern.search(markdown_content)
+             if match:
+                 # Extract the author command
+                 author_cmd = match.group(1)
+                 # Extract content from author command
+                 author_content_match = re.search(r'\\author\{(.*?)\}', author_cmd, re.DOTALL)
+                 if author_content_match:
+                     # Get author content and process it
+                     author_content = author_content_match.group(1)
+                     processed_content = self._handle_text(author_content)
+                     # Replace the entire $\author{...}$ block with processed content
+                     markdown_content = markdown_content.replace(match.group(0), processed_content)
+ 
+             # Replace LaTeX abstract environment with plain text
+             markdown_content = re.sub(r'\\begin\{abstract\}(.*?)\\end\{abstract\}',
+                                       r'**Abstract** \1',
+                                       markdown_content,
+                                       flags=re.DOTALL)
+ 
+             # Replace standalone \begin{abstract} (without matching end)
+             markdown_content = re.sub(r'\\begin\{abstract\}',
+                                       r'**Abstract**',
+                                       markdown_content)
+ 
+             # Replace LaTeX equation numbers with tag format, handling cases with extra backslashes
+             markdown_content = re.sub(r'\\eqno\{\((.*?)\)\}',
+                                       r'\\tag{\1}',
+                                       markdown_content)
+ 
+             # Convert the opening \[ of a multi-line display formula to $$
+             markdown_content = markdown_content.replace("\\[ \\\\", "$$ \\\\")
+ 
+             # Convert the matching closing \] to $$
+             markdown_content = markdown_content.replace("\\\\ \\]", "\\\\ $$")
+ 
+             # Fix other common LaTeX issues
+             replacements = [
+                 # Fix spacing issues in subscripts and superscripts
+                 # (the caret must be escaped so it matches literally)
+                 (r'_ {', r'_{'),
+                 (r'\^ {', r'^{'),
+ 
+                 # Collapse runs of three or more newlines into a blank line
+                 (r'\n{3,}', r'\n\n')
+             ]
+ 
+             for old, new in replacements:
+                 markdown_content = re.sub(old, new, markdown_content)
+ 
+             return markdown_content
+         except Exception as e:
+             print(f"_post_process error: {str(e)}")
+             return markdown_content  # Return original content if post-processing fails
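+ 
+ 
+ # Minimal usage sketch (hypothetical input; output shape depends on the
+ # heading_levels and special_labels configured on the class above):
+ #   converter = MarkdownConverter()
+ #   md = converter.convert([
+ #       {"label": "sec", "text": "Introduction"},
+ #       {"label": "para", "text": "DOLPHIN reads documents page by page."},
+ #   ])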
utils/utils.py ADDED
@@ -0,0 +1,367 @@
+ """
+ Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+ SPDX-License-Identifier: MIT
+ """
+ 
+ import copy
+ import json
+ import os
+ import re
+ from dataclasses import dataclass
+ from typing import List, Tuple
+ 
+ import albumentations as alb
+ import cv2
+ import numpy as np
+ from albumentations.pytorch import ToTensorV2
+ from PIL import Image
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+ from torchvision.transforms.functional import resize
+ 
+ from utils.markdown_utils import MarkdownConverter
+ 
+ 
+ def alb_wrapper(transform):
+     def f(im):
+         return transform(image=np.asarray(im))["image"]
+ 
+     return f
+ 
+ 
+ test_transform = alb_wrapper(
+     alb.Compose(
+         [
+             alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
+             ToTensorV2(),
+         ]
+     )
+ )
+ 
+ 
+ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
+     # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
+     if x2 <= x1 or y2 <= y1:
+         return False, f"[{x1}, {y1}, {x2}, {y2}]"
+     if x1 < 0 or y1 < 0:
+         return False, f"[{x1}, {y1}, {x2}, {y2}]"
+     if not abs_coord:
+         if x2 > 1 or y2 > 1:
+             return False, f"[{x1}, {y1}, {x2}, {y2}]"
+     elif image_size is not None:  # has image size
+         if x2 > image_size[0] or y2 > image_size[1]:
+             return False, f"[{x1}, {y1}, {x2}, {y2}]"
+     return True, None
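+ 
+ # Examples (illustrative):
+ #   check_coord_valid(0.1, 0.1, 0.9, 0.9, abs_coord=False) -> (True, None)
+ #   check_coord_valid(50, 80, 40, 120) -> (False, "[50, 80, 40, 120]")  # x2 <= x1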
+ 
+ 
+ def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
+     """
+     Widen box edges so they come to rest on clean background.
+ 
+     Args:
+         image: cv2 image object, or a path to one
+         boxes: list of boxes [[x1, y1, x2, y2]] in absolute coordinates
+     """
+     if isinstance(image, str):
+         image = cv2.imread(image)
+     img_h, img_w = image.shape[:2]
+     new_boxes = []
+     for box in boxes:
+         best_box = copy.deepcopy(box)
+ 
+         def check_edge(img, current_box, i, is_vertical):
+             # Score an edge by the black/white transition activity along it;
+             # a low score means the edge sits on uniform background
+             edge = current_box[i]
+             gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+             _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+ 
+             if is_vertical:
+                 line = binary[current_box[1] : current_box[3] + 1, edge]
+             else:
+                 line = binary[edge, current_box[0] : current_box[2] + 1]
+ 
+             transitions = np.abs(np.diff(line))
+             return np.sum(transitions) / len(transitions)
+ 
+         # Only widen the box
+         edges = [(0, -1, True), (2, 1, True), (1, -1, False), (3, 1, False)]
+ 
+         current_box = copy.deepcopy(box)
+         # Make sure the box is within the image
+         current_box[0] = min(max(current_box[0], 0), img_w - 1)
+         current_box[1] = min(max(current_box[1], 0), img_h - 1)
+         current_box[2] = min(max(current_box[2], 0), img_w - 1)
+         current_box[3] = min(max(current_box[3], 0), img_h - 1)
+ 
+         for i, direction, is_vertical in edges:
+             best_score = check_edge(image, current_box, i, is_vertical)
+             if best_score <= threshold:
+                 continue
+             # Step the edge outward up to max_pixels, keeping the position
+             # with the lowest transition score
+             for step in range(max_pixels):
+                 current_box[i] += direction
+                 if i == 0 or i == 2:
+                     current_box[i] = min(max(current_box[i], 0), img_w - 1)
+                 else:
+                     current_box[i] = min(max(current_box[i], 0), img_h - 1)
+                 score = check_edge(image, current_box, i, is_vertical)
+ 
+                 if score < best_score:
+                     best_score = score
+                     best_box = copy.deepcopy(current_box)
+ 
+                 if score <= threshold:
+                     break
+         new_boxes.append(best_box)
+ 
+     return new_boxes
+ 
+ 
+ def parse_layout_string(bbox_str):
+     """Parse layout string using regular expressions"""
+     pattern = r"\[(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+),\s*(\d*\.?\d+)\]\s*(\w+)"
+     matches = re.finditer(pattern, bbox_str)
+ 
+     parsed_results = []
+     for match in matches:
+         coords = [float(match.group(i)) for i in range(1, 5)]
+         label = match.group(5).strip()
+         parsed_results.append((coords, label))
+ 
+     return parsed_results
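+ 
+ # Example (illustrative layout string, as the model might emit one):
+ #   parse_layout_string("[0.1, 0.2, 0.5, 0.4] para [0.1, 0.5, 0.9, 0.9] tab")
+ #   -> [([0.1, 0.2, 0.5, 0.4], 'para'), ([0.1, 0.5, 0.9, 0.9], 'tab')]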
+ 
+ 
+ @dataclass
+ class ImageDimensions:
+     """Class to store image dimensions"""
+     original_w: int
+     original_h: int
+     padded_w: int
+     padded_h: int
+ 
+ 
+ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
+     """Map coordinates from padded image back to original image
+ 
+     Args:
+         x1, y1, x2, y2: Coordinates in padded image
+         dims: Image dimensions object
+ 
+     Returns:
+         tuple: (x1, y1, x2, y2) coordinates in original image
+     """
+     try:
+         # Calculate padding offsets
+         top = (dims.padded_h - dims.original_h) // 2
+         left = (dims.padded_w - dims.original_w) // 2
+ 
+         # Map back to original coordinates
+         orig_x1 = max(0, x1 - left)
+         orig_y1 = max(0, y1 - top)
+         orig_x2 = min(dims.original_w, x2 - left)
+         orig_y2 = min(dims.original_h, y2 - top)
+ 
+         # Ensure we have a valid box (width and height > 0)
+         if orig_x2 <= orig_x1:
+             orig_x2 = min(orig_x1 + 1, dims.original_w)
+         if orig_y2 <= orig_y1:
+             orig_y2 = min(orig_y1 + 1, dims.original_h)
+ 
+         return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
+     except Exception as e:
+         print(f"map_to_original_coordinates error: {str(e)}")
+         # Return safe coordinates
+         return 0, 0, min(100, dims.original_w), min(100, dims.original_h)
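+ 
+ # Worked example (illustrative): an 800x1000 page padded to 1000x1000 gives
+ # left = 100, top = 0, so (150, 50, 450, 300) maps back to (50, 50, 350, 300).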
+ 
+ 
+ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
+     """
+     Convert absolute coordinates to relative (normalized) coordinates,
+     e.g. [100, 100, 200, 200] on a 1000x500 image -> (0.1, 0.2, 0.2, 0.4)
+     """
+     try:
+         x1, y1, x2, y2 = abs_coords
+         return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
+     except Exception as e:
+         print(f"map_to_relevant_coordinates error: {str(e)}")
+         return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
+ 
+ 
+ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
+     """Process and adjust coordinates
+ 
+     Args:
+         coords: Normalized coordinates [x1, y1, x2, y2]
+         padded_image: Padded image
+         dims: Image dimensions object
+         previous_box: Previous box coordinates for overlap adjustment
+ 
+     Returns:
+         tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
+     """
+     try:
+         # Convert normalized coordinates to absolute coordinates
+         x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
+         x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
+ 
+         # Ensure coordinates are within image bounds before adjustment
+         x1 = max(0, min(x1, dims.padded_w - 1))
+         y1 = max(0, min(y1, dims.padded_h - 1))
+         x2 = max(0, min(x2, dims.padded_w))
+         y2 = max(0, min(y2, dims.padded_h))
+ 
+         # Ensure width and height are at least 1 pixel
+         if x2 <= x1:
+             x2 = min(x1 + 1, dims.padded_w)
+         if y2 <= y1:
+             y2 = min(y1 + 1, dims.padded_h)
+ 
+         # Extend box boundaries
+         new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
+         x1, y1, x2, y2 = new_boxes[0]
+ 
+         # Ensure coordinates are still within image bounds after adjustment
+         x1 = max(0, min(x1, dims.padded_w - 1))
+         y1 = max(0, min(y1, dims.padded_h - 1))
+         x2 = max(0, min(x2, dims.padded_w))
+         y2 = max(0, min(y2, dims.padded_h))
+ 
+         # Ensure width and height are at least 1 pixel after adjustment
+         if x2 <= x1:
+             x2 = min(x1 + 1, dims.padded_w)
+         if y2 <= y1:
+             y2 = min(y1 + 1, dims.padded_h)
+ 
+         # Check for overlap with previous box and adjust
+         if previous_box is not None:
+             prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
+             if (x1 < prev_x2 and x2 > prev_x1) and (y1 < prev_y2 and y2 > prev_y1):
+                 y1 = prev_y2
+                 # Ensure y1 is still valid
+                 y1 = min(y1, dims.padded_h - 1)
+                 # Make sure y2 is still greater than y1
+                 if y2 <= y1:
+                     y2 = min(y1 + 1, dims.padded_h)
+ 
+         # Update previous box
+         new_previous_box = [x1, y1, x2, y2]
+ 
+         # Map to original coordinates
+         orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
+             x1, y1, x2, y2, dims
+         )
+ 
+         return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
+     except Exception as e:
+         print(f"process_coordinates error: {str(e)}")
+         # Return safe values
+         orig_x1, orig_y1, orig_x2, orig_y2 = 0, 0, min(100, dims.original_w), min(100, dims.original_h)
+         return 0, 0, 100, 100, orig_x1, orig_y1, orig_x2, orig_y2, [0, 0, 100, 100]
+ 
+ 
+ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
+     """Load and prepare image with padding while maintaining aspect ratio
+ 
+     Args:
+         image: PIL image
+ 
+     Returns:
+         tuple: (padded_image, image_dimensions)
+     """
+     try:
+         # Convert PIL image to OpenCV format
+         image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+         original_h, original_w = image.shape[:2]
+ 
+         # Calculate padding to make square image
+         max_size = max(original_h, original_w)
+         top = (max_size - original_h) // 2
+         bottom = max_size - original_h - top
+         left = (max_size - original_w) // 2
+         right = max_size - original_w - left
+ 
+         # Apply padding
+         padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
+                                           cv2.BORDER_CONSTANT, value=(0, 0, 0))
+ 
+         padded_h, padded_w = padded_image.shape[:2]
+ 
+         dimensions = ImageDimensions(
+             original_w=original_w,
+             original_h=original_h,
+             padded_w=padded_w,
+             padded_h=padded_h
+         )
+ 
+         return padded_image, dimensions
+     except Exception as e:
+         print(f"prepare_image error: {str(e)}")
+         # Create a minimal valid image and dimensions
+         h, w = image.height, image.width
+         dimensions = ImageDimensions(
+             original_w=w,
+             original_h=h,
+             padded_w=w,
+             padded_h=h
+         )
+         # Return a black image of the same size
+         return np.zeros((h, w, 3), dtype=np.uint8), dimensions
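+ 
+ # Example (illustrative): an 800x600 PIL image is padded to 800x800 with 100
+ # black pixels on top and bottom, so dims = ImageDimensions(800, 600, 800, 800).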
+ 
+ 
+ def setup_output_dirs(save_dir):
+     """Create necessary output directories"""
+     os.makedirs(save_dir, exist_ok=True)
+     os.makedirs(os.path.join(save_dir, "markdown"), exist_ok=True)
+     os.makedirs(os.path.join(save_dir, "recognition_json"), exist_ok=True)
+ 
+ 
+ def save_outputs(recognition_results, image_path, save_dir):
+     """Save JSON and markdown outputs"""
+     basename = os.path.splitext(os.path.basename(image_path))[0]
+ 
+     # Save JSON file
+     json_path = os.path.join(save_dir, "recognition_json", f"{basename}.json")
+     with open(json_path, "w", encoding="utf-8") as f:
+         json.dump(recognition_results, f, ensure_ascii=False, indent=2)
+ 
+     # Generate and save markdown file
+     markdown_converter = MarkdownConverter()
+     markdown_content = markdown_converter.convert(recognition_results)
+     markdown_path = os.path.join(save_dir, "markdown", f"{basename}.md")
+     with open(markdown_path, "w", encoding="utf-8") as f:
+         f.write(markdown_content)
+ 
+     return json_path
+ 
+ 
+ def crop_margin(img: Image.Image) -> Image.Image:
+     """Crop margins from image"""
+     try:
+         width, height = img.size
+         if width == 0 or height == 0:
+             print("Warning: Image has zero width or height")
+             return img
+ 
+         data = np.array(img.convert("L"))
+         data = data.astype(np.uint8)
+         max_val = data.max()
+         min_val = data.min()
+         if max_val == min_val:
+             return img
+         data = (data - min_val) / (max_val - min_val) * 255
+         gray = 255 * (data < 200).astype(np.uint8)
+ 
+         coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
+         if coords is None:
+             return img
+         a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+ 
+         # Ensure crop coordinates are within image bounds
+         a = max(0, a)
+         b = max(0, b)
+         w = min(w, width - a)
+         h = min(h, height - b)
+ 
+         # Only crop if we have a valid region
+         if w > 0 and h > 0:
+             return img.crop((a, b, a + w, b + h))
+         return img
+     except Exception as e:
+         print(f"crop_margin error: {str(e)}")
+         return img  # Return original image on error
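+ 
+ 
+ # Minimal end-to-end sketch (hypothetical paths; recognition_results as
+ # produced by the app's parsing step):
+ #   setup_output_dirs("./out")
+ #   json_path = save_outputs(recognition_results, "page_1.png", "./out")
+ #   # -> writes ./out/recognition_json/page_1.json and ./out/markdown/page_1.md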