# Spaces: Running
import os | |
import json | |
from huggingface_hub import snapshot_download | |
import gradio as gr | |
# 1) Fetch the MinerU model weights into ./models before anything else runs.
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
LOCAL_MODELS = "./models"

# Only the MFD/YOLO and MFR weight folders are requested; a single snapshot
# call pulls both of them.
# NOTE(review): these patterns match no layout or table weights, although the
# config written further down sets "layout-model" and "table-enable" --
# confirm that this is intended.
_WEIGHT_PATTERNS = ["models/MFD/YOLO/*", "models/MFR/*"]

snapshot_download(
    repo_id=MODEL_REPO,
    local_dir=LOCAL_MODELS,
    allow_patterns=_WEIGHT_PATTERNS,
    max_workers=4,
)
# 2) Create ~/magic-pdf.json, but only when it does not exist yet, so that an
#    existing configuration is never overwritten. "models-dir" points at the
#    nested "models" folder inside the download directory.
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
if not os.path.exists(CFG_PATH):
    default_cfg = {}
    default_cfg["device"] = "cpu"
    default_cfg["models-dir"] = os.path.join(LOCAL_MODELS, "models")
    default_cfg["layout-model"] = "layoutlmv3"
    default_cfg["formula-enable"] = True
    default_cfg["table-enable"] = True

    # Serialise first, then write the finished text in one call.
    cfg_text = json.dumps(default_cfg, ensure_ascii=False, indent=2)
    with open(CFG_PATH, "w", encoding="utf-8") as cfg_file:
        cfg_file.write(cfg_text)
# 3) MinerU imports | |
from magic_pdf.data.read_api import read_local_pdfs | |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter | |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze | |
from magic_pdf.config.enums import SupportedPdfParseMethod | |
def convert_with_mineru(pdf_file, out_fmt):
    """Convert an uploaded PDF to Markdown or JSON text with MinerU.

    Args:
        pdf_file: The value delivered by the upload widget. Either an object
            exposing the local file path as ``.name`` or a plain path string.
        out_fmt: ``"markdown"`` returns the Markdown of every parsed dataset
            joined by ``---`` separators; any other value returns a JSON dump
            of the per-dataset results.

    Returns:
        str: The Markdown text, or a JSON array whose items hold the
        ``"markdown"`` text and the ``"content_list"`` of one dataset.

    Raises:
        ValueError: If no file (``None`` or an empty path) was supplied.
    """
    # Fail early with a readable message instead of an AttributeError on
    # ``None.name`` when the user submits the form without uploading a file.
    pdf_path = None if pdf_file is None else getattr(pdf_file, "name", pdf_file)
    if not pdf_path:
        raise ValueError("No PDF file was provided; please upload a PDF.")

    datasets = read_local_pdfs(pdf_path)

    tmp = "output"
    img_dir = os.path.join(tmp, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp)
    img_writer = FileBasedDataWriter(img_dir)

    # These names depend only on the uploaded file, so compute them once
    # rather than on every loop iteration.
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    results = []
    for ds in datasets:
        method = ds.classify()
        use_ocr = method == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # The pipeline writes its output to disk; read each file back
        # immediately because a later dataset reuses the same file names.
        pipe.dump_md(md_writer, md_name, img_dir_name)
        with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_dir_name)
        with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)
# 4) Gradio UI: a PDF upload and a format selector feed convert_with_mineru,
#    and the returned text is shown in a single code pane.
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Format"),
    ],
    outputs=gr.Code(label="Result"),
    # The arrow replaces a mis-encoded character in the original title.
    title="MinerU PDF → Markdown/JSON (Fully Fixed)",
    description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly.",
)
if __name__ == "__main__":
    # Point HF_HUB_CACHE at the models folder unless it is already set.
    # NOTE(review): this runs after the module-level snapshot_download call,
    # so it cannot influence that download -- confirm whether it is needed.
    if "HF_HUB_CACHE" not in os.environ:
        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS

    # Serve on all network interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)