"""Gradio web UI that converts an uploaded PDF to Markdown or JSON.

The conversion itself is delegated to the external ``marker_single``
command-line tool; this module only drives the CLI and collects its output.
"""

import json
import os
import shutil
import subprocess
import tempfile

import gradio as gr


def _collect_outputs(out_dir, extension):
    """Return the text of every file under *out_dir* ending in *extension*.

    The directory is walked recursively so output nested in sub-directories
    is found as well, and files with any other extension are skipped rather
    than being decoded as UTF-8 text. Files are read in sorted path order so
    the combined result is deterministic.
    """
    paths = sorted(
        os.path.join(root, fname)
        for root, _dirs, files in os.walk(out_dir)
        for fname in files
        if fname.lower().endswith(extension)
    )
    contents = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            contents.append(f.read())
    return contents


def _load_json_or_text(text):
    """Parse *text* as JSON, returning the raw string if it is not valid JSON."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return text


def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF with the Marker CLI and return the result as text.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. Either an
            object exposing the path as ``.name`` or a plain path string is
            accepted.
        output_format: ``"markdown"`` for Markdown output; any other value
            selects JSON.

    Returns:
        For Markdown, the contents of the generated ``.md`` files joined by a
        horizontal-rule separator. For JSON, a JSON document of the form
        ``{"pages": [...]}`` holding one entry per generated ``.json`` file.

    Raises:
        gr.Error: If no file was uploaded, the ``marker_single`` executable
            cannot be found, or it exits with a non-zero status.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    # Accept both a file-like object carrying `.name` and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    fmt = "markdown" if output_format == "markdown" else "json"
    extension = ".md" if fmt == "markdown" else ".json"

    out_dir = tempfile.mkdtemp()
    try:
        cmd = [
            "marker_single",
            pdf_path,
            "--output_format", fmt,
            "--output_dir", out_dir,
            "--paginate_output",  # page separators
        ]
        try:
            subprocess.run(cmd, check=True)
        except FileNotFoundError as exc:
            raise gr.Error(
                "The `marker_single` command was not found; is Marker installed?"
            ) from exc
        except subprocess.CalledProcessError as exc:
            raise gr.Error(
                f"Marker failed with exit code {exc.returncode}."
            ) from exc

        # NOTE(review): every file with the expected extension is collected;
        # if Marker also writes auxiliary .json files (e.g. metadata) next to
        # the main output, they will show up in JSON mode too -- confirm.
        results = _collect_outputs(out_dir, extension)
    finally:
        # Always remove the temp directory, including when the CLI fails.
        shutil.rmtree(out_dir, ignore_errors=True)

    if output_format == "markdown":
        return "\n\n---\n\n".join(results)

    # Parse each document so the combined output is nested JSON instead of
    # JSON text escaped inside JSON strings.
    pages = [_load_json_or_text(text) for text in results]
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)


demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Output"),
    title="PDF → Markdown/JSON with LaTeX Support",
    description=(
        "Uploads a PDF and uses Marker to extract text, structure, and LaTeX math. "
        "Choose Markdown to get a single .md with `$...$`/`$$...$$` math, "
        "or JSON for a page-by-page array."
    ),
)

if __name__ == "__main__":
    # Binds to all network interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)