File size: 3,099 Bytes
dfce863
 
ecf768f
683fa93
 
c842df1
 
 
 
 
ecf768f
27b468a
 
c842df1
 
 
 
27b468a
ecf768f
 
c842df1
ecf768f
 
 
27b468a
 
 
ecf768f
27b468a
dfce863
ecf768f
 
dfce863
ecf768f
ec386e0
 
 
 
 
ecf768f
 
 
ec386e0
27b468a
ec386e0
 
ecf768f
ec386e0
dfce863
27b468a
c842df1
 
 
 
 
27b468a
 
ecf768f
 
 
 
ec386e0
ecf768f
 
 
 
ec386e0
ecf768f
ec386e0
ecf768f
 
 
683fa93
ecf768f
683fa93
ec386e0
ecf768f
ec386e0
c842df1
 
683fa93
 
 
c842df1
ecf768f
683fa93
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import json
from huggingface_hub import snapshot_download
import gradio as gr

# 1) Pre-download all MinerU model weights under models/
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
LOCAL_MODELS = "./models"

# Only files matching these repo-relative globs (the MFD/YOLO and MFR
# folders) are fetched; everything else in the repo is skipped.
_WEIGHT_PATTERNS = ["models/MFD/YOLO/*", "models/MFR/*"]

# Runs at import time. One snapshot call pulls every matching file into
# LOCAL_MODELS using up to four parallel download workers.
snapshot_download(
    repo_id=MODEL_REPO,
    local_dir=LOCAL_MODELS,
    allow_patterns=_WEIGHT_PATTERNS,
    max_workers=4,
)

# 2) Write magic-pdf.json pointing at the nested 'models' directory
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
# An existing config file is never overwritten; it is only created when absent.
if not os.path.exists(CFG_PATH):
    cfg = {
        "device":         "cpu",
        # Stored as an absolute path: the config lives in the home directory,
        # so a path relative to the current working directory would stop
        # resolving as soon as a process starts from a different directory.
        "models-dir":     os.path.abspath(os.path.join(LOCAL_MODELS, "models")),
        # NOTE(review): confirm these key names against the magic-pdf.json
        # template shipped with the installed magic-pdf version.
        "layout-model":   "layoutlmv3",
        "formula-enable": True,
        "table-enable":   True
    }
    with open(CFG_PATH, "w", encoding="utf-8") as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)

# 3) MinerU imports
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

def convert_with_mineru(pdf_file, out_fmt):
    """Convert an uploaded PDF with MinerU and return the result as text.

    Args:
        pdf_file: The value delivered by the file input. Either a filesystem
            path (``str`` / ``os.PathLike``) or an object that exposes the
            path through a ``name`` attribute.
        out_fmt: ``"markdown"`` returns the generated Markdown; any other
            value returns a JSON document.

    Returns:
        str: For ``"markdown"``, the Markdown of every parsed dataset joined
        by a horizontal-rule separator. Otherwise a JSON array with one
        ``{"markdown": ..., "content_list": ...}`` object per dataset.

    Raises:
        ValueError: If no file was uploaded (``pdf_file`` is ``None`` or
            carries no path).
    """
    # Resolve the upload to a plain path once. Plain paths are accepted as-is;
    # anything else is expected to carry the path in ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = getattr(pdf_file, "name", None)
    if not pdf_path:
        raise ValueError("No PDF file was uploaded.")

    datasets = read_local_pdfs(pdf_path)

    # NOTE: every call writes into the same fixed "output" directory, so
    # results for identically named PDFs overwrite each other.
    tmp = "output"
    img_dir = os.path.join(tmp, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp)
    img_writer = FileBasedDataWriter(img_dir)

    # Loop-invariant names derived from the uploaded file.
    img_rel = os.path.basename(img_dir)
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"

    results = []
    for ds in datasets:
        # The dataset's own classification decides between OCR and text mode.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # Dump through the writer, then read the file back to get the text.
        pipe.dump_md(md_writer, md_name, img_rel)
        with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_rel)
        with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)

# 4) Gradio UI
# Components are created up front; the input list order (file, then format)
# matches the positional parameters of convert_with_mineru.
_pdf_upload = gr.File(label="Upload PDF")
_format_choice = gr.Radio(["markdown", "json"], label="Format")
_result_view = gr.Code(label="Result")

demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[_pdf_upload, _format_choice],
    outputs=_result_view,
    title="MinerU PDF → Markdown/JSON (Fully Fixed)",
    description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly.",
)

if __name__ == "__main__":
    # Point HF_HUB_CACHE at the models folder unless the environment already
    # provides a value.
    # NOTE(review): the module-level snapshot_download call has already run by
    # the time this executes, so it cannot affect that download. Confirm the
    # setting is still needed here.
    if "HF_HUB_CACHE" not in os.environ:
        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS
    # Serve on all network interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)