# app.py — revision c842df1 ("Update app.py", verified) by euler314; 3.1 kB.
import os
import json
from huggingface_hub import snapshot_download
import gradio as gr
# 1) Pull the MinerU model weights from the Hub into ./models at import time.
LOCAL_MODELS = "./models"
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"

# A single snapshot request covers both pattern groups (MFD/YOLO and MFR).
# NOTE(review): only these two pattern groups are fetched, while the config
# written further down names a layout model and enables tables — confirm the
# weights for those stages are obtained some other way.
snapshot_download(
    repo_id=MODEL_REPO,
    local_dir=LOCAL_MODELS,
    max_workers=4,
    allow_patterns=["models/MFD/YOLO/*", "models/MFR/*"],
)
# 2) Make sure ~/magic-pdf.json exists; a file that is already there is kept
#    untouched. "models-dir" points at the 'models' folder nested inside the
#    download target.
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
if not os.path.exists(CFG_PATH):
    cfg = {"device": "cpu", "models-dir": os.path.join(LOCAL_MODELS, "models")}
    cfg.update(
        {
            "layout-model": "layoutlmv3",
            "formula-enable": True,
            "table-enable": True,
        }
    )
    with open(CFG_PATH, "w", encoding="utf-8") as f:
        f.write(json.dumps(cfg, ensure_ascii=False, indent=2))
# 3) MinerU imports
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
def convert_with_mineru(pdf_file, out_fmt):
    """Convert an uploaded PDF to Markdown or JSON text with MinerU.

    Args:
        pdf_file: The upload handed over by the file input. Either an object
            exposing its on-disk path as ``.name`` or a plain path string.
        out_fmt: ``"markdown"`` returns the Markdown of every parsed dataset
            joined by ``---`` separators; any other value returns JSON.

    Returns:
        A string: the joined Markdown, or a pretty-printed JSON array holding
        one ``{"markdown": ..., "content_list": ...}`` object per dataset.

    Raises:
        ValueError: If no file was supplied (``pdf_file`` is ``None``).
    """
    # Fail with a readable message instead of an AttributeError on `.name`
    # when the form is submitted without a file.
    if pdf_file is None:
        raise ValueError("No PDF uploaded: please choose a PDF file to convert.")

    # Accept both upload objects (path in `.name`) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    datasets = read_local_pdfs(pdf_path)

    tmp = "output"
    img_dir = os.path.join(tmp, "images")
    os.makedirs(img_dir, exist_ok=True)
    img_dir_name = os.path.basename(img_dir)
    md_writer = FileBasedDataWriter(tmp)
    img_writer = FileBasedDataWriter(img_dir)

    # The output file names depend only on the upload, so compute them once.
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"

    results = []
    for ds in datasets:
        # Classify once and reuse the answer for both the analysis flag and
        # the choice of pipeline mode.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # Each dump goes through the writer rooted at `tmp`; read the file
        # straight back to obtain its content for the response.
        pipe.dump_md(md_writer, md_name, img_dir_name)
        with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_dir_name)
        with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)
# 4) Gradio UI
# A two-input form (PDF upload plus output-format selector) wired to
# convert_with_mineru; the returned text is shown in a code viewer.
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Format"),
    ],
    outputs=gr.Code(label="Result"),
    # The arrow is written as an escape so it cannot be garbled again by a
    # wrong file encoding (it had been stored as mis-decoded bytes).
    title="MinerU PDF \u2192 Markdown/JSON (Fully Fixed)",
    description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly.",
)
if __name__ == "__main__":
    # Default HF_HUB_CACHE to the models folder unless the environment already
    # defines it.
    # NOTE(review): this runs after the module-level snapshot_download call,
    # so presumably it does not influence that download — confirm intent.
    if "HF_HUB_CACHE" not in os.environ:
        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS
    demo.launch(server_port=7860, server_name="0.0.0.0")