"""Gradio front end that converts an uploaded PDF with the ``marker_single`` CLI."""

import json
import os
import shutil
import subprocess
import tempfile

import gradio as gr


def _collect_outputs(out_dir, suffix):
    """Return the text of every file under *out_dir* whose name ends in *suffix*.

    The suffix comparison is case-insensitive. Files are read as UTF-8.
    """
    texts = []
    for root, dirs, files in os.walk(out_dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order, so the page order does not depend on the filesystem.
        dirs.sort()
        for fname in sorted(files):
            if fname.lower().endswith(suffix):
                with open(os.path.join(root, fname), encoding="utf-8") as fh:
                    texts.append(fh.read())
    return texts


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with ``marker_single`` and return the result as text.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. Either an
            object exposing the upload's path as ``.name`` or a plain path
            string is accepted.
        output_format: ``"markdown"`` selects Markdown output; any other
            value selects JSON.

    Returns:
        For Markdown, the contents of all generated ``.md`` files joined by a
        horizontal-rule separator. Otherwise a JSON document of the form
        ``{"pages": [...]}`` in which each entry is the raw text of one
        generated ``.json`` file.

    Raises:
        gr.Error: If no file was uploaded.
        subprocess.CalledProcessError: If ``marker_single`` exits non-zero.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before converting.")

    # A path string has no ``.name`` attribute, so it falls through unchanged.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    fmt = "markdown" if output_format == "markdown" else "json"
    suffix = ".md" if fmt == "markdown" else ".json"

    # The context manager removes the scratch directory even when the
    # conversion or a file read raises, so failed runs do not leak temp dirs.
    with tempfile.TemporaryDirectory() as out_dir:
        cmd = [
            "marker_single",
            pdf_path,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",
        ]
        subprocess.run(cmd, check=True)
        pages = _collect_outputs(out_dir, suffix)

    if fmt == "markdown":
        return "\n\n---\n\n".join(pages)
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)


demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Upload a PDF and get back Markdown or structured JSON, "
        "with math preserved as LaTeX."
    ),
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)