Spaces:

ElektrikSpark
/

VLM-playground

Running

App Files Files Community

trevorpfiz committed on 15 days ago

Commit

4aa9a45

1 Parent(s): ae9d014

fix: unexpected keyword argument 'file_name'

Browse files

Files changed (3) hide show

src/vlm_playground/app.py +2 -0
src/vlm_playground/preview_app.py +10 -8
src/vlm_playground/preview_app_local.py +786 -0

src/vlm_playground/app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from .preview_app import create_blocks_app
 def run() -> None:
     demo = create_blocks_app()

 from .preview_app import create_blocks_app
+# from .preview_app_local import create_blocks_app as create_blocks_app_local
 def run() -> None:
     demo = create_blocks_app()

src/vlm_playground/preview_app.py CHANGED Viewed

@@ -512,12 +512,8 @@ def create_blocks_app():
                         processed_view = gr.Image(type="pil", height=520)
                 with gr.Row():
-                    download_jsonl = gr.DownloadButton(
-                        label="Download JSONL", file_name="results.jsonl"
-                    )
-                    download_markdown = gr.DownloadButton(
-                        label="Download Markdown", file_name="results.md"
-                    )
         # ===== Handlers =====
         def on_template_change(choice: str) -> str:
@@ -734,7 +730,10 @@ def create_blocks_app():
                     obj = {"page": i + 1, "layout": res["layout_result"]}
                     lines.append(json.dumps(obj, ensure_ascii=False))
             content = "\n".join(lines) if lines else ""
-            return gr.DownloadButton.update(value=content.encode("utf-8"))
         def download_current_markdown(state: Dict[str, Any]):
             if not state.get("parsed"):
@@ -744,7 +743,10 @@ def create_blocks_app():
                 if res and res.get("markdown"):
                     chunks.append(f"## Page {i + 1}\n\n{res['markdown']}")
             content = "\n\n---\n\n".join(chunks) if chunks else ""
-            return gr.DownloadButton.update(value=content.encode("utf-8"))
         # Wire events
         template.change(on_template_change, inputs=[template], outputs=[prompt_text])

                         processed_view = gr.Image(type="pil", height=520)
                 with gr.Row():
+                    download_jsonl = gr.DownloadButton(label="Download JSONL")
+                    download_markdown = gr.DownloadButton(label="Download Markdown")
         # ===== Handlers =====
         def on_template_change(choice: str) -> str:
                     obj = {"page": i + 1, "layout": res["layout_result"]}
                     lines.append(json.dumps(obj, ensure_ascii=False))
             content = "\n".join(lines) if lines else ""
+            out_path = os.path.join(TMP_DIR, "results.jsonl")
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write(content)
+            return gr.DownloadButton.update(value=out_path)
         def download_current_markdown(state: Dict[str, Any]):
             if not state.get("parsed"):
                 if res and res.get("markdown"):
                     chunks.append(f"## Page {i + 1}\n\n{res['markdown']}")
             content = "\n\n---\n\n".join(chunks) if chunks else ""
+            out_path = os.path.join(TMP_DIR, "results.md")
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write(content)
+            return gr.DownloadButton.update(value=out_path)
         # Wire events
         template.change(on_template_change, inputs=[template], outputs=[prompt_text])

src/vlm_playground/preview_app_local.py ADDED Viewed

	@@ -0,0 +1,786 @@

+import gc
+import types
+import sys
+import hashlib
+import json
+import math
+import os
+import re
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+import fitz  # PyMuPDF
+import gradio as gr
+import requests
+import torch
+from huggingface_hub import snapshot_download
+from PIL import Image, ImageDraw, ImageFont
+from qwen_vl_utils import process_vision_info
+from transformers import AutoModelForCausalLM, AutoProcessor
+from .utils.constants import IMAGE_FACTOR, MAX_PIXELS, MIN_PIXELS
+from .utils.prompts import dict_promptmode_to_prompt
+# Title passed to gr.Blocks for this local-inference variant of the app.
+APP_TITLE = "PreviewSpace — VLM Playground (Local)"
+# Scratch directory: parent of the model cache and of the download exports.
+TMP_DIR = "/tmp/previewspace"
+# Model snapshots are stored under the scratch directory.
+MODELS_DIR = os.path.join(TMP_DIR, "models")
+# Hub repository to download, and the local folder the snapshot is placed in.
+DOTS_REPO_ID = "rednote-hilab/dots.ocr"
+DOTS_LOCAL_DIR = os.path.join(MODELS_DIR, "dots.ocr")
+# Default generation budget used by run_inference and the "Max new tokens" slider.
+LOCAL_DEFAULT_MAX_NEW_TOKENS = 2048
+# Import-time side effect: create both directories before anything writes to them.
+os.makedirs(TMP_DIR, exist_ok=True)
+os.makedirs(MODELS_DIR, exist_ok=True)
+def round_by_factor(number: int, factor: int) -> int:
+    """Return the multiple of ``factor`` nearest to ``number``.
+
+    Relies on the built-in ``round``, so exact .5 ties go to the even multiple.
+    """
+    multiples = round(number / factor)
+    return multiples * factor
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> Tuple[int, int]:
+    """Pick a target (height, width) whose sides are multiples of ``factor``.
+
+    The aspect ratio is kept approximately and the area is steered into
+    ``[min_pixels, max_pixels]``.
+
+    Args:
+        height: Source height in pixels; must be positive.
+        width: Source width in pixels; must be positive.
+        factor: Both returned sides are multiples of this value.
+        min_pixels: Lower bound on the returned area.
+        max_pixels: Upper bound on the returned area.
+
+    Returns:
+        ``(height, width)`` of the resized image.
+
+    Raises:
+        ValueError: If a dimension is not positive, or the aspect ratio
+            exceeds 200.
+    """
+    if height <= 0 or width <= 0:
+        # Previously surfaced as a bare ZeroDivisionError from the ratio check.
+        raise ValueError(f"height and width must be positive, got {height}x{width}")
+    if max(height, width) / min(height, width) > 200:
+        raise ValueError("absolute aspect ratio must be smaller than 200")
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        # Round DOWN when shrinking: nearest-multiple rounding could push the
+        # area back above max_pixels. Each side still keeps at least one factor.
+        h_bar = max(factor, math.floor(height / beta / factor) * factor)
+        w_bar = max(factor, math.floor(width / beta / factor) * factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        # Round UP when growing so the area cannot fall below min_pixels.
+        h_bar = math.ceil(height * beta / factor) * factor
+        w_bar = math.ceil(width * beta / factor) * factor
+    return int(h_bar), int(w_bar)
+def fetch_image(image_input: Any) -> Image.Image:
+    """Load an image from a URL, a local path or a PIL image and return it as RGB.
+
+    Args:
+        image_input: An ``http(s)://`` URL, a filesystem path, or a PIL image.
+
+    Returns:
+        A new RGB ``Image.Image``.
+
+    Raises:
+        ValueError: If ``image_input`` is neither a string nor a PIL image.
+        requests.HTTPError: If the URL answers with an error status.
+    """
+    if isinstance(image_input, Image.Image):
+        return image_input.convert("RGB")
+    if not isinstance(image_input, str):
+        raise ValueError(f"Invalid image input type: {type(image_input)}")
+    if image_input.startswith(("http://", "https://")):
+        response = requests.get(image_input, timeout=60)
+        # Fail on 4xx/5xx instead of handing an error page to the image decoder.
+        response.raise_for_status()
+        source: Any = BytesIO(response.content)
+    else:
+        source = image_input
+    # convert() returns an independent image, so the source handle can be
+    # closed as soon as the conversion is done.
+    with Image.open(source) as opened:
+        return opened.convert("RGB")
+def load_images_from_pdf(pdf_path: str, zoom: float = 2.0) -> List[Image.Image]:
+    """Rasterise every page of a PDF into an RGB PIL image.
+
+    Args:
+        pdf_path: Path of the PDF to open with PyMuPDF.
+        zoom: Scale factor applied to both axes when rendering. The default
+            of 2.0 matches the previously hard-coded value.
+
+    Returns:
+        One image per page, in page order (empty for a PDF without pages).
+    """
+    images: List[Image.Image] = []
+    pdf_document = fitz.open(pdf_path)
+    try:
+        # The render matrix is loop-invariant; build it once.
+        render_matrix = fitz.Matrix(zoom, zoom)
+        for page_idx in range(len(pdf_document)):
+            page = pdf_document.load_page(page_idx)
+            pix = page.get_pixmap(matrix=render_matrix)
+            img_data = pix.tobytes("ppm")
+            images.append(Image.open(BytesIO(img_data)).convert("RGB"))
+    finally:
+        # Always release the document, even if a page fails to render.
+        pdf_document.close()
+    return images
+def file_checksum(path: str, chunk_size: int = 1 << 20) -> str:
+    """Return the hex SHA-256 digest of the file at ``path``.
+
+    The file is read in ``chunk_size``-byte pieces so large uploads are never
+    held in memory in one piece.
+    """
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        # iter() with a sentinel stops on the first empty read (end of file).
+        for block in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
+def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.Image:
+    """Return a copy of ``image`` with a coloured box and label per layout item.
+
+    Args:
+        image: Page image; it is copied, never modified.
+        layout_data: Items that carry a ``bbox`` and a ``category``. Items
+            missing either key are skipped.
+
+    Returns:
+        The annotated copy. Drawing is best-effort: an item that cannot be
+        drawn is skipped and the remaining items are still rendered.
+    """
+    img = image.copy()
+    draw = ImageDraw.Draw(img)
+    colors = {
+        "Caption": "#FF6B6B",
+        "Footnote": "#4ECDC4",
+        "Formula": "#45B7D1",
+        "List-item": "#96CEB4",
+        "Page-footer": "#FFEAA7",
+        "Page-header": "#DDA0DD",
+        "Picture": "#FFD93D",
+        "Section-header": "#6C5CE7",
+        "Table": "#FD79A8",
+        "Text": "#74B9FF",
+        "Title": "#E17055",
+    }
+    # Try the known bold fonts in order and fall back to PIL's built-in font.
+    font = None
+    for font_path in (
+        "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
+        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+    ):
+        try:
+            font = ImageFont.truetype(font_path, 12)
+            break
+        except (OSError, ImportError):
+            continue
+    if font is None:
+        font = ImageFont.load_default()
+    for item in layout_data:
+        if not isinstance(item, dict):
+            continue
+        bbox = item.get("bbox")
+        category = item.get("category")
+        if not bbox or not category:
+            continue
+        try:
+            color = colors.get(category, "#000000")
+            draw.rectangle(bbox, outline=color, width=2)
+            label = str(category)
+            label_bbox = draw.textbbox((0, 0), label, font=font)
+            label_w = label_bbox[2] - label_bbox[0]
+            label_h = label_bbox[3] - label_bbox[1]
+            x1, y1 = int(bbox[0]), int(bbox[1])
+            lx = x1
+            # Keep the label inside the image when the box touches the top edge.
+            ly = max(0, y1 - label_h - 2)
+            draw.rectangle([lx, ly, lx + label_w + 4, ly + label_h + 2], fill=color)
+            draw.text((lx + 2, ly + 1), label, fill="white", font=font)
+        except Exception:
+            # Deliberate best-effort: the overlay is cosmetic, so one malformed
+            # item must not abort the remaining boxes or the caller.
+            continue
+    return img
+def is_arabic_text(text: str) -> bool:
+    """Tell whether the prose of a markdown string is mostly Arabic script.
+
+    Only heading text and plain paragraph lines are inspected. Lines that
+    start like images, code fences, table rows or list items are ignored.
+    Returns True when more than half of the alphabetic characters on the
+    inspected lines fall in the Arabic Unicode ranges checked below.
+    """
+    if not text:
+        return False
+    header_pattern = r"^#{1,6}\s+(.+)$"
+    paragraph_pattern = r"^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$"
+    kept: List[str] = []
+    for raw_line in text.split("\n"):
+        stripped = raw_line.strip()
+        if not stripped:
+            continue
+        heading = re.match(header_pattern, stripped)
+        if heading:
+            # Keep only the heading text, not the leading '#' markers.
+            kept.append(heading.group(1))
+        elif re.match(paragraph_pattern, stripped):
+            kept.append(stripped)
+    if not kept:
+        return False
+    letters = [ch for ch in " ".join(kept) if ch.isalpha()]
+    if not letters:
+        return False
+    arabic_count = sum(
+        1
+        for ch in letters
+        if ("\u0600" <= ch <= "\u06ff")
+        or ("\u0750" <= ch <= "\u077f")
+        or ("\u08a0" <= ch <= "\u08ff")
+    )
+    return (arabic_count / len(letters)) > 0.5
+def extract_json(text: str) -> Optional[Any]:
+    """Best-effort extraction of a JSON value from model output.
+
+    Attempts, in order:
+      1. the whole string;
+      2. the outermost ``{...}`` and ``[...]`` spans, trying whichever opens
+         first so that an array of objects is not mistaken for its first
+         object;
+      3. each fenced ```` ```json ```` block.
+
+    Args:
+        text: Raw model output; may be empty.
+
+    Returns:
+        The first value that parses. This can be any JSON type, not only a
+        dict. ``None`` is returned when nothing parses.
+    """
+    if not text:
+        return None
+
+    def _try_load(candidate: str) -> Tuple[bool, Any]:
+        # Report success separately: a valid document may decode to None.
+        try:
+            return True, json.loads(candidate)
+        except (ValueError, RecursionError):
+            return False, None
+
+    ok, value = _try_load(text)
+    if ok:
+        return value
+    spans: List[Tuple[int, int]] = []
+    for opener, closer in (("{", "}"), ("[", "]")):
+        start = text.find(opener)
+        end = text.rfind(closer)
+        if 0 <= start < end:
+            spans.append((start, end + 1))
+    # Earliest opening delimiter first: that is the outermost container.
+    for start, end in sorted(spans):
+        ok, value = _try_load(text[start:end])
+        if ok:
+            return value
+    for block in re.findall(r"```json\s*([\s\S]*?)\s*```", text):
+        ok, value = _try_load(block)
+        if ok:
+            return value
+    return None
+# Process-wide singletons, filled on the first call to ensure_model_loaded().
+model: Optional[AutoModelForCausalLM] = None
+processor: Optional[AutoProcessor] = None
+def ensure_model_loaded() -> Tuple[AutoModelForCausalLM, AutoProcessor]:
+    """Download (if needed) and load the model and processor, caching both.
+
+    Later calls return the cached pair. The model is placed on MPS when
+    available, otherwise on CUDA when available, otherwise it stays on CPU.
+
+    Returns:
+        ``(model, processor)``.
+    """
+    global model, processor
+    if model is not None and processor is not None:
+        return model, processor
+    os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
+    snapshot_download(
+        repo_id=DOTS_REPO_ID,
+        local_dir=DOTS_LOCAL_DIR,
+        local_dir_use_symlinks=False,
+    )
+    # Work around transformers dynamic module parent package issue with repo name containing a dot
+    # Ensure 'transformers_modules' and 'transformers_modules.dots' exist as packages
+    for package_name in ("transformers_modules", "transformers_modules.dots"):
+        if package_name not in sys.modules:
+            pkg = types.ModuleType(package_name)
+            pkg.__path__ = []  # type: ignore[attr-defined]
+            sys.modules[package_name] = pkg
+    if torch.backends.mps.is_available():
+        device, dtype = "mps", torch.float16
+    elif torch.cuda.is_available():
+        device, dtype = "cuda", torch.bfloat16
+    else:
+        device, dtype = "cpu", torch.float32
+    loaded_model = AutoModelForCausalLM.from_pretrained(
+        DOTS_LOCAL_DIR,
+        torch_dtype=dtype,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+    )
+    if device != "cpu":
+        # Previously only MPS was handled, so on a CUDA host the weights stayed
+        # on CPU while run_inference moved its inputs to the GPU.
+        loaded_model.to(device)
+    loaded_processor = AutoProcessor.from_pretrained(
+        DOTS_LOCAL_DIR, trust_remote_code=True
+    )
+    # Publish both globals together, only after both loads succeeded, so a
+    # failed processor load cannot leave a half-initialised cache behind.
+    model, processor = loaded_model, loaded_processor
+    return model, processor
+def run_inference(
+    image: Image.Image,
+    prompt_text: str,
+    max_new_tokens: int = LOCAL_DEFAULT_MAX_NEW_TOKENS,
+) -> str:
+    """Run one image + prompt through the model and return the decoded text.
+
+    Args:
+        image: Page image placed in the chat message.
+        prompt_text: Instruction text that accompanies the image.
+        max_new_tokens: Upper bound on generated tokens.
+
+    Returns:
+        The generated continuation with the prompt tokens removed, or ``""``
+        when nothing was decoded.
+    """
+    mdl, proc = ensure_model_loaded()
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt_text},
+            ],
+        }
+    ]
+    text = proc.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = proc(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    # Follow the model's actual device instead of guessing from availability;
+    # guessing breaks whenever the weights live somewhere else (e.g. on CPU
+    # while CUDA is present).
+    device = mdl.device
+    inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
+    with torch.no_grad():
+        # Greedy decoding: no temperature is passed because it only applies to
+        # sampling and was ignored alongside do_sample=False.
+        generated_ids = mdl.generate(
+            **inputs,
+            max_new_tokens=int(max_new_tokens),
+            do_sample=False,
+        )
+    # Drop the prompt tokens so that only the newly generated part is decoded.
+    trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+    ]
+    # Use the processor returned above rather than the module-level global.
+    output_text = proc.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return output_text[0] if output_text else ""
+def process_single_image(
+    image: Image.Image,
+    prompt_text: str,
+    max_new_tokens: int,
+) -> Dict[str, Any]:
+    """Run inference on one page and package everything the UI displays.
+
+    Args:
+        image: Page image (anything ``fetch_image`` accepts).
+        prompt_text: Prompt sent with the image.
+        max_new_tokens: Generation budget forwarded to ``run_inference``.
+
+    Returns:
+        A dict with the keys ``original_image``, ``processed_image`` (layout
+        overlay when layout items were found, otherwise the input image),
+        ``raw_output``, ``layout_result`` (parsed JSON or ``None``) and
+        ``markdown`` (converted layout, or the raw output as a fallback).
+    """
+    img = fetch_image(image)
+    raw = run_inference(img, prompt_text, max_new_tokens=max_new_tokens)
+    result: Dict[str, Any] = {
+        "original_image": img,
+        "processed_image": img,
+        "raw_output": raw,
+        "layout_result": None,
+        "markdown": None,
+    }
+    data = extract_json(raw)
+    items: Any = None
+    if isinstance(data, list):
+        # extract_json can return a top-level array (json.loads accepts any
+        # JSON value); treat it as the item list instead of discarding it.
+        items = data
+    elif isinstance(data, dict):
+        items = data.get("elements", data.get("elements_list", data.get("content", [])))
+    if isinstance(data, (dict, list)):
+        result["layout_result"] = data
+    if isinstance(items, list):
+        # Both consumers call .get() on every entry, so drop non-dict entries.
+        layout_items = [entry for entry in items if isinstance(entry, dict)]
+        result["processed_image"] = draw_layout_on_image(img, layout_items)
+        result["markdown"] = layoutjson2md(img, layout_items)
+    if result["markdown"] is None:
+        result["markdown"] = raw
+    return result
+def layoutjson2md(
+    image: "Image.Image", layout_data: List[Dict], text_key: str = "text"
+) -> str:
+    """Convert layout items to markdown, ordered top-to-bottom then left-to-right.
+
+    Args:
+        image: Unused by the conversion; kept for interface compatibility.
+        layout_data: Items with ``category``, ``bbox`` and a text field.
+        text_key: Key that holds each item's text.
+
+    Returns:
+        The markdown string. If ``layout_data`` cannot be processed at all,
+        its JSON dump is returned instead.
+    """
+
+    def _reading_order(item: Dict) -> Tuple[float, float]:
+        # A missing, short or non-numeric bbox sorts as (0, 0) instead of
+        # forcing the whole page into the JSON fallback.
+        bbox = item.get("bbox")
+        try:
+            return (float(bbox[1]), float(bbox[0]))
+        except (TypeError, IndexError, KeyError, ValueError):
+            return (0.0, 0.0)
+
+    lines: List[str] = []
+    try:
+        items = sorted(
+            (entry for entry in layout_data if isinstance(entry, dict)),
+            key=_reading_order,
+        )
+        for item in items:
+            category = item.get("category", "")
+            raw_text = item.get(text_key, "")
+            # Coerce non-string text so that .strip() below cannot fail.
+            text = str(raw_text) if raw_text else ""
+            if category == "Title" and text:
+                lines.append(f"# {text}\n")
+            elif category == "Section-header" and text:
+                lines.append(f"## {text}\n")
+            elif category == "List-item" and text:
+                lines.append(f"- {text}\n")
+            elif category == "Table" and text:
+                # Text starting with '<' is passed through untouched.
+                if text.strip().startswith("<"):
+                    lines.append(text + "\n")
+                else:
+                    lines.append(f"**Table:** {text}\n")
+            elif category == "Formula" and text:
+                if text.strip().startswith("$") or "\\" in text:
+                    lines.append(f"$$\n{text}\n$$\n")
+                else:
+                    lines.append(f"**Formula:** {text}\n")
+            elif category == "Caption" and text:
+                lines.append(f"*{text}*\n")
+            elif category in ["Page-header", "Page-footer"]:
+                continue
+            elif category == "Picture":
+                continue
+            elif text:
+                lines.append(f"{text}\n")
+            lines.append("")
+    except (TypeError, AttributeError, ValueError):
+        # Best-effort fallback, e.g. when layout_data is not iterable.
+        return json.dumps(layout_data, ensure_ascii=False)
+    return "\n".join(lines)
+def create_blocks_app():
+    """Build the Gradio Blocks UI for the local dots.ocr playground and return it.
+
+    The layout has three columns: upload and prompt controls, a page preview
+    with navigation, and result tabs with download buttons. Every event
+    handler is a closure defined below, and per-document data travels through
+    ``doc_state``.
+
+    Returns:
+        The assembled ``gr.Blocks`` instance (not launched).
+    """
+    import tempfile  # function-scope import: only the download handlers need it
+    css = """
+    .main-container { max-width: 1500px; margin: 0 auto; }
+    .header-text { text-align: center; color: #1f2937; margin-bottom: 12px; }
+    .page-info { text-align: center; padding: 8px 16px; border-radius: 20px; font-weight: 600; }
+    .process-button { border: none !important; color: white !important; font-weight: 700 !important; }
+    """
+    def _empty_state() -> Dict[str, Any]:
+        # Fresh dict on every call so that no two states share mutable lists.
+        return {
+            "images": [],
+            "current_page": 0,
+            "total_pages": 0,
+            "file_type": None,
+            "checksum": None,
+            "results": [],
+            "parsed": False,
+        }
+    def _page_info(label: str) -> str:
+        # Single place that produces the pill-shaped page indicator markup.
+        return f'<div class="page-info">{label}</div>'
+    with gr.Blocks(theme=gr.themes.Soft(), css=css, title=APP_TITLE) as demo:
+        doc_state = gr.State(_empty_state())
+        # NOTE(review): cache_state is never passed as an event input; the
+        # parse handler reads and writes `cache_state.value` directly, so the
+        # entries appear to be shared by all sessions and are never evicted —
+        # confirm that this is intended.
+        cache_state = gr.State({})
+        gr.HTML(
+            """
+            <div class=\"header-text\">
+                <h2>VLM Playground — dots.ocr (Local)</h2>
+                <p>Optimized defaults for Apple Silicon / CPU dev.</p>
+            </div>
+            """
+        )
+        with gr.Row(elem_classes=["main-container"]):
+            with gr.Column(scale=4):
+                file_input = gr.File(
+                    label="Upload PDF or Image",
+                    file_types=[
+                        ".pdf",
+                        ".png",
+                        ".jpg",
+                        ".jpeg",
+                        ".bmp",
+                        ".tiff",
+                        ".webp",
+                    ],
+                    type="filepath",
+                )
+                with gr.Group():
+                    template = gr.Dropdown(
+                        label="Prompt Template",
+                        choices=["Layout Extraction"],
+                        value="Layout Extraction",
+                    )
+                    prompt_text = gr.Textbox(
+                        label="Current Prompt",
+                        value=dict_promptmode_to_prompt.get("prompt_layout_all_en", ""),
+                        lines=6,
+                    )
+                with gr.Row():
+                    parse_button = gr.Button(
+                        "Parse", variant="primary", elem_classes=["process-button"]
+                    )
+                    clear_button = gr.Button("Clear")
+                with gr.Accordion("Advanced", open=False):
+                    max_new_tokens = gr.Slider(
+                        minimum=256,
+                        maximum=8192,
+                        value=LOCAL_DEFAULT_MAX_NEW_TOKENS,
+                        step=128,
+                        label="Max new tokens",
+                    )
+                    page_range = gr.Textbox(
+                        label="Page selection",
+                        placeholder="e.g., 1-3,5 (blank = current page, 'all' = all pages)",
+                    )
+            with gr.Column(scale=5):
+                preview_image = gr.Image(label="Page Preview", type="pil", height=520)
+                with gr.Row():
+                    prev_btn = gr.Button("◀ Prev")
+                    page_info = gr.HTML(_page_info("No file"))
+                    next_btn = gr.Button("Next ▶")
+                with gr.Row():
+                    page_jump = gr.Number(value=1, label="Page #", precision=0)
+                    jump_btn = gr.Button("Go")
+            with gr.Column(scale=6):
+                with gr.Tabs():
+                    with gr.Tab("Markdown Render"):
+                        md_render = gr.Markdown(
+                            value="Upload and parse to view results", height=520
+                        )
+                    with gr.Tab("Raw Markdown"):
+                        md_raw = gr.Textbox(value="", lines=20)
+                    with gr.Tab("Current Page JSON"):
+                        json_view = gr.JSON(value=None)
+                    with gr.Tab("Processed Image"):
+                        processed_view = gr.Image(type="pil", height=520)
+                with gr.Row():
+                    download_jsonl = gr.DownloadButton(label="Download JSONL")
+                    download_markdown = gr.DownloadButton(label="Download Markdown")
+        # ===== Shared helpers for the handlers =====
+        def _no_file_outputs(state: Dict[str, Any], placeholder: str):
+            # Tuple order matches `view_outputs` in the wiring section.
+            return (state, None, _page_info("No file"), placeholder, "", None, None)
+        def _page_outputs(state: Dict[str, Any], placeholder: str):
+            # Render the current page; `placeholder` is shown when the page
+            # has no parsed result yet.
+            idx = state["current_page"]
+            results = state.get("results") or []
+            result = results[idx] if state.get("parsed") and idx < len(results) else None
+            md = result.get("markdown") if result else placeholder
+            # Switch the rendered markdown to right-to-left for Arabic content.
+            md_out = gr.update(value=md, rtl=True) if is_arabic_text(md) else md
+            return (
+                state,
+                state["images"][idx],
+                _page_info(f"Page {idx + 1} / {state['total_pages']}"),
+                md_out,
+                md,
+                result.get("processed_image") if result else None,
+                result.get("layout_result") if result else None,
+            )
+        def _export(filename: str, content: str):
+            # Write into a fresh directory per export so that concurrent
+            # sessions never overwrite each other's file. The base name stays
+            # stable because it is what the user sees as the download name.
+            export_dir = tempfile.mkdtemp(dir=TMP_DIR)
+            out_path = os.path.join(export_dir, filename)
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.write(content)
+            # gr.update() is the form already used for md_render above. The
+            # per-component `.update()` classmethod is not available in
+            # current Gradio releases.
+            return gr.update(value=out_path)
+        # ===== Handlers =====
+        def on_template_change(choice: str) -> str:
+            # Only one template exists, so the choice does not affect the prompt.
+            return dict_promptmode_to_prompt.get("prompt_layout_all_en", "")
+        def on_file_change(path: Optional[str]):
+            if not path or not os.path.exists(path):
+                return _empty_state(), None, _page_info("No file")
+            checksum = file_checksum(path)
+            ext = os.path.splitext(path)[1].lower()
+            if ext == ".pdf":
+                images = load_images_from_pdf(path)
+                file_type = "pdf"
+            else:
+                # Close the file handle as soon as the RGB copy exists.
+                with Image.open(path) as opened:
+                    images = [opened.convert("RGB")]
+                file_type = "image"
+            state = {
+                "images": images,
+                "current_page": 0,
+                "total_pages": len(images),
+                "file_type": file_type,
+                "checksum": checksum,
+                "results": [None] * len(images),
+                "parsed": False,
+            }
+            # An empty PDF is shown as "Page 0 / 0" rather than "Page 1 / 0".
+            first_page = 1 if images else 0
+            return (
+                state,
+                images[0] if images else None,
+                _page_info(f"Page {first_page} / {len(images)}"),
+            )
+        def nav_page(state: Dict[str, Any], direction: str):
+            if not state.get("images"):
+                return _no_file_outputs(state, "No results")
+            if direction == "prev":
+                state["current_page"] = max(0, state["current_page"] - 1)
+            elif direction == "next":
+                state["current_page"] = min(
+                    state["total_pages"] - 1, state["current_page"] + 1
+                )
+            # Any other direction (e.g. "stay") just re-renders the current page.
+            return _page_outputs(state, "Page not processed yet")
+        def jump_to_page(state: Dict[str, Any], page_num: Any):
+            if not state.get("images"):
+                return _no_file_outputs(state, "No results")
+            try:
+                n = int(page_num)
+            except (TypeError, ValueError):
+                n = 1
+            # Clamp the 1-based page number into the valid range.
+            n = max(1, min(state["total_pages"], n))
+            state["current_page"] = n - 1
+            return nav_page(state, direction="stay")
+        def parse_pages(
+            state: Dict[str, Any],
+            prompt: str,
+            max_tokens: int,
+            selection: Optional[str],
+        ):
+            if not state.get("images"):
+                return _no_file_outputs(state, "No content")
+            total = state["total_pages"]
+            token_budget = int(max_tokens)
+            choice = (selection or "").strip()
+            indices: List[int] = []
+            if not choice:
+                indices = [state["current_page"]]
+            elif choice.lower() == "all":
+                indices = list(range(total))
+            else:
+                picked = set()
+                for part in (p.strip() for p in choice.split(",")):
+                    if not part:
+                        continue
+                    try:
+                        if "-" in part:
+                            first, last = part.split("-", 1)
+                            # 1-based inclusive range, clamped to the document.
+                            picked.update(
+                                range(max(1, int(first)) - 1, min(total, int(last)))
+                            )
+                        else:
+                            picked.add(max(1, min(total, int(part))) - 1)
+                    except ValueError:
+                        # Ignore fragments that are not numbers.
+                        continue
+                indices = sorted(i for i in picked if 0 <= i < total)
+            results = state.get("results") or [None] * total
+            # The prompt is identical for every page, so hash it once.
+            prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:16]
+            for i in indices:
+                cache_key = (state["checksum"], i, prompt_hash, token_budget)
+                cached = cache_state.value.get(cache_key)
+                if cached:
+                    results[i] = cached
+                    continue
+                res = process_single_image(
+                    state["images"][i],
+                    prompt_text=prompt,
+                    max_new_tokens=token_budget,
+                )
+                results[i] = res
+                cache_state.value[cache_key] = res
+            state["results"] = results
+            state["parsed"] = True
+            return _page_outputs(state, "No content")
+        def clear_all():
+            gc.collect()
+            return (
+                _empty_state(),
+                None,
+                _page_info("No file"),
+                "Upload and parse to view results",
+                "",
+                None,
+                None,
+            )
+        def download_current_jsonl(state: Dict[str, Any]):
+            if not state.get("parsed"):
+                # Nothing to export yet: clear the button's file instead of
+                # handing it raw bytes, which is not a file path.
+                return gr.update(value=None)
+            lines = [
+                json.dumps(
+                    {"page": i + 1, "layout": res["layout_result"]},
+                    ensure_ascii=False,
+                )
+                for i, res in enumerate(state.get("results", []))
+                if res and res.get("layout_result") is not None
+            ]
+            return _export("results.jsonl", "\n".join(lines))
+        def download_current_markdown(state: Dict[str, Any]):
+            if not state.get("parsed"):
+                return gr.update(value=None)
+            chunks = [
+                f"## Page {i + 1}\n\n{res['markdown']}"
+                for i, res in enumerate(state.get("results", []))
+                if res and res.get("markdown")
+            ]
+            return _export("results.md", "\n\n---\n\n".join(chunks))
+        # ===== Wire events =====
+        # Every page-rendering handler returns its values in this order.
+        view_outputs = [
+            doc_state,
+            preview_image,
+            page_info,
+            md_render,
+            md_raw,
+            processed_view,
+            json_view,
+        ]
+        template.change(on_template_change, inputs=[template], outputs=[prompt_text])
+        file_input.change(
+            on_file_change,
+            inputs=[file_input],
+            outputs=[doc_state, preview_image, page_info],
+        )
+        prev_btn.click(
+            lambda s: nav_page(s, "prev"),
+            inputs=[doc_state],
+            outputs=view_outputs,
+        )
+        next_btn.click(
+            lambda s: nav_page(s, "next"),
+            inputs=[doc_state],
+            outputs=view_outputs,
+        )
+        jump_btn.click(
+            jump_to_page,
+            inputs=[doc_state, page_jump],
+            outputs=view_outputs,
+        )
+        parse_button.click(
+            parse_pages,
+            inputs=[doc_state, prompt_text, max_new_tokens, page_range],
+            outputs=view_outputs,
+        )
+        clear_button.click(clear_all, outputs=view_outputs)
+        download_jsonl.click(
+            download_current_jsonl, inputs=[doc_state], outputs=[download_jsonl]
+        )
+        download_markdown.click(
+            download_current_markdown, inputs=[doc_state], outputs=[download_markdown]
+        )
+        return demo