# app.py
"""Gradio front end that converts an uploaded PDF to Markdown or JSON via MinerU."""
import json
import os

import gradio as gr

# MinerU API imports
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze


def convert_with_mineru(pdf_path, out_format: str) -> str:
    """Run MinerU on an uploaded PDF and return the result as text.

    Args:
        pdf_path: Value supplied by the ``gr.File`` input. Either a
            filesystem path string or an object exposing the path as
            ``.name`` (which one depends on the Gradio version / ``type``
            setting of the component).
        out_format: ``"markdown"`` returns the Markdown of every parsed
            document joined by a horizontal rule; any other value returns
            a JSON array with one ``{"markdown", "content_list"}`` object
            per parsed document.

    Returns:
        The Markdown or pretty-printed JSON string.

    Raises:
        gr.Error: If no file was uploaded.

    Side effects:
        Creates ``output/`` and ``output/images/`` under the current working
        directory and writes ``<basename>.md`` and
        ``<basename>_content_list.json`` into ``output/``.
    """
    if pdf_path is None:
        raise gr.Error("Please upload a PDF file first.")
    # Normalize the upload to a plain path string: temp-file style objects
    # carry the path in `.name`, while a str has no such attribute.
    pdf_path = getattr(pdf_path, "name", pdf_path)

    # 1) Read the file into MinerU dataset(s); the result is iterated below.
    datasets = read_local_pdfs(pdf_path)

    # Prepare writers: Markdown/JSON go to output/, images to output/images/.
    tmp_dir = "output"
    img_dir = os.path.join(tmp_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # Output names depend only on the uploaded file, so compute them once.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    all_docs = []
    for ds in datasets:
        # 2) Classify the document, then run the matching analysis pipeline.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump the document's Markdown through the writer and read it back.
        pipe.dump_md(md_writer, md_fname, img_dir_name)
        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
            doc_md = f.read()

        # 4) Dump the structured content list (JSON) and load it back.
        pipe.dump_content_list(md_writer, json_fname, img_dir_name)
        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        all_docs.append({
            "markdown": doc_md,
            "content_list": content_list,
        })

    # 5) Return the requested format.
    if out_format == "markdown":
        # Concatenate all documents, separated by a Markdown horizontal rule.
        return "\n\n---\n\n".join(doc["markdown"] for doc in all_docs)
    return json.dumps(all_docs, ensure_ascii=False, indent=2)


# Gradio interface
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Result"),
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, tables, "
        "and formulas from your PDF into clean Markdown or structured JSON."
    ),
)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)