"""Gradio front end that converts an uploaded PDF to Markdown or JSON via MinerU."""

import json
import os

import gradio as gr

# Ensure default config for magic-pdf: write one only when the user has none,
# so an existing configuration is never overwritten.
# NOTE(review): confirm these key names match the schema read by the installed
# magic-pdf release; they are kept exactly as originally written.
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
if not os.path.exists(CONFIG_PATH):
    default_cfg = {
        "device": "cpu",  # force CPU inference
        "layout_model": "layout/mobilenetv3",
        "formula_enable": True,
        "table_enable": True,
    }
    with open(CONFIG_PATH, "w", encoding="utf-8") as cfg:
        json.dump(default_cfg, cfg, ensure_ascii=False, indent=2)

# MinerU API imports. Deliberately kept below the config bootstrap, as in the
# original script (presumably so the config file exists before magic_pdf is
# loaded -- verify against the library before moving them to the top).
from magic_pdf.data.read_api import read_local_pdfs  # noqa: E402
from magic_pdf.data.data_reader_writer import FileBasedDataWriter  # noqa: E402
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze  # noqa: E402
from magic_pdf.config.enums import SupportedPdfParseMethod  # noqa: E402


def _uploaded_path(upload):
    """Return the filesystem path for a value delivered by ``gr.File``.

    Depending on the Gradio release and the component's ``type`` setting, the
    callback receives either a path string or a tempfile-like wrapper whose
    ``name`` attribute holds the path; both are accepted here.

    Raises:
        gr.Error: if nothing was uploaded or no path can be derived.
    """
    if upload is None:
        raise gr.Error("Please upload a PDF file first.")
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    path = getattr(upload, "name", None)
    if not isinstance(path, (str, os.PathLike)):
        raise gr.Error("Could not determine the path of the uploaded file.")
    return os.fspath(path)


def convert_with_mineru(pdf_path, out_format: str) -> str:
    """Convert a PDF with MinerU and return the result as text.

    Args:
        pdf_path: Path to the PDF, or the upload object handed over by
            ``gr.File`` (a path string or a wrapper exposing ``.name``).
        out_format: ``"markdown"`` for Markdown; any other value yields JSON.

    Returns:
        For ``"markdown"``, the Markdown of every dataset joined by a
        horizontal-rule separator. Otherwise a JSON array with one object per
        dataset holding its ``"markdown"`` and ``"content_list"``.

    Raises:
        gr.Error: if no usable upload was provided.

    Side effects:
        Writes the Markdown file, the content-list JSON and extracted images
        under the relative directory ``output`` (images in ``output/images``).
    """
    pdf_path = _uploaded_path(pdf_path)

    # 1) Read file into MinerU dataset(s)
    datasets = read_local_pdfs(pdf_path)

    tmp_dir = "output"
    img_dir = os.path.join(tmp_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # File names depend only on the input path, so compute them once.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"
    # Image links in the dumped files are written relative to tmp_dir.
    img_prefix = os.path.basename(img_dir)

    all_pages = []
    for ds in datasets:
        # 2) Classify & infer, with OCR fallback
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump per-document Markdown, then read it back as text
        pipe.dump_md(md_writer, md_fname, img_prefix)
        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
            page_md = f.read()

        # 4) Dump structured JSON, then load it back
        pipe.dump_content_list(md_writer, json_fname, img_prefix)
        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        all_pages.append({
            "markdown": page_md,
            "content_list": content_list,
        })

    # 5) Return desired format
    if out_format == "markdown":
        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
    return json.dumps(all_pages, ensure_ascii=False, indent=2)


# Gradio interface
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Result"),
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, tables, "
        "and formulas from your PDF into clean Markdown or structured JSON. "
        "A default CPU-only config is auto-generated if none is found."
    ),
)

if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)