"""Gradio front end that converts an uploaded PDF to Markdown or JSON with MinerU.

Importing this script has side effects: it downloads the formula-detection
(YOLO) and formula-recognition (MFR) weights into ``LOCAL_MODELS``, writes
``~/magic-pdf.json`` when that file does not exist yet, and builds the Gradio
interface. Running it as a script additionally launches the web server.
"""
import os
import json

MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
LOCAL_MODELS = "./models"

# huggingface_hub resolves HF_HUB_CACHE once, while it is being imported, so
# the variable must already be in the environment before the import below.
# Setting it later (e.g. right before launching the UI) has no effect.
os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)

from huggingface_hub import snapshot_download  # noqa: E402
import gradio as gr  # noqa: E402

# 1) Pre-download the MinerU model weights under models/
# Grab both YOLO and MFR weights in one go.
# NOTE(review): only the MFD/YOLO and MFR folders are fetched, while the
# config below also enables a layout model and table recognition - confirm
# that those weights are provided some other way.
snapshot_download(
    repo_id=MODEL_REPO,
    local_dir=LOCAL_MODELS,
    allow_patterns=[
        "models/MFD/YOLO/*",
        "models/MFR/*",
    ],
    max_workers=4,
)

# 2) Write magic-pdf.json pointing at the nested 'models' directory
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
if not os.path.exists(CFG_PATH):
    # NOTE(review): key names are kept as they were; verify them against the
    # config schema of the installed magic-pdf version.
    cfg = {
        "device": "cpu",
        # The config lives in the home directory and is read regardless of the
        # current working directory, so store an absolute path.
        "models-dir": os.path.abspath(os.path.join(LOCAL_MODELS, "models")),
        "layout-model": "layoutlmv3",
        "formula-enable": True,
        "table-enable": True,
    }
    with open(CFG_PATH, "w", encoding="utf-8") as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)

# 3) MinerU imports (placed after the config file has been written)
from magic_pdf.data.read_api import read_local_pdfs  # noqa: E402
from magic_pdf.data.data_reader_writer import FileBasedDataWriter  # noqa: E402
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze  # noqa: E402
from magic_pdf.config.enums import SupportedPdfParseMethod  # noqa: E402


def convert_with_mineru(pdf_file, out_fmt):
    """Run MinerU on an uploaded PDF and return the result as text.

    Args:
        pdf_file: The upload handed over by ``gr.File``: either a path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``.
        out_fmt: ``"markdown"`` to get Markdown; any other value yields JSON.

    Returns:
        The Markdown of every parsed dataset joined by ``---`` separators, or
        a JSON array of ``{"markdown": ..., "content_list": ...}`` objects.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # Depending on the Gradio version/config the upload arrives either as a
    # plain path or as a file wrapper whose ``name`` attribute is the path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    datasets = read_local_pdfs(pdf_path)

    out_dir = "output"
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(out_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # Loop-invariant names derived from the uploaded file.
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    results = []
    for ds in datasets:
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # Dump to disk through the writer, then read the files back so the
        # text can be returned to the UI.
        pipe.dump_md(md_writer, md_name, img_dir_name)
        with open(os.path.join(out_dir, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_dir_name)
        with open(os.path.join(out_dir, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)


# 4) Gradio UI
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Format"),
    ],
    outputs=gr.Code(label="Result"),
    title="MinerU PDF → Markdown/JSON (Fully Fixed)",
    description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly.",
)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)