File size: 2,673 Bytes
683fa93
ec386e0
683fa93
 
ec386e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683fa93
ec386e0
683fa93
 
 
ec386e0
683fa93
ec386e0
 
683fa93
ec386e0
 
683fa93
ec386e0
 
 
683fa93
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# app.py
import os, json
import gradio as gr

# MinerU API imports
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

def convert_with_mineru(pdf_path, out_format):
    """Convert an uploaded PDF to Markdown or JSON using MinerU.

    Every dataset returned by ``read_local_pdfs`` is classified, analysed in
    OCR or text mode accordingly, and dumped into ``output/`` (with images
    under ``output/images``). The dumped files are then read back and
    combined into the returned string.

    Args:
        pdf_path: Location of the PDF. Either a path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``,
            which is what some Gradio versions hand over for file uploads.
        out_format: ``"markdown"`` returns the documents' Markdown joined by
            ``---`` separators; any other value returns a JSON array holding
            one ``{"markdown", "content_list"}`` object per document.

    Returns:
        str: The Markdown text or the pretty-printed JSON document.

    Raises:
        ValueError: If no file was supplied.
        TypeError: If ``pdf_path`` is neither a path nor an object with a
            string ``name`` attribute.
    """
    # Fail early with a readable message instead of an obscure error from
    # deep inside the parser when the form is submitted without a file.
    if pdf_path is None or pdf_path == "":
        raise ValueError("No PDF file was provided.")

    # Normalise the upload to a plain filesystem path. Path-likes are checked
    # first because ``pathlib.Path.name`` is only the final path component.
    if isinstance(pdf_path, (str, os.PathLike)):
        source = os.fspath(pdf_path)
    else:
        source = getattr(pdf_path, "name", None)
        if not isinstance(source, str):
            raise TypeError(
                f"Unsupported pdf_path type: {type(pdf_path).__name__}"
            )

    # 1) Read file into MinerU dataset(s)
    datasets = read_local_pdfs(source)

    # Prepare writers
    # NOTE(review): the output directory and the file names below are shared
    # by every call, so concurrent conversions of same-named PDFs may
    # overwrite each other's files - consider a per-request directory.
    tmp_dir = "output"
    img_dir = os.path.join(tmp_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # Loop-invariant names: output files are derived from the upload's name,
    # and images are referenced relative to the Markdown file's directory.
    basename = os.path.splitext(os.path.basename(source))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"
    img_ref_dir = os.path.basename(img_dir)

    documents = []

    for ds in datasets:
        # 2) Classify & infer
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump per-document Markdown and read it back
        pipe.dump_md(md_writer, md_fname, img_ref_dir)
        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
            markdown = f.read()

        # 4) Dump the structured content list and read it back
        pipe.dump_content_list(md_writer, json_fname, img_ref_dir)
        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        documents.append({
            "markdown": markdown,
            "content_list": content_list,
        })

    # 5) Return desired format
    if out_format == "markdown":
        # Concatenate all documents
        return "\n\n---\n\n".join(doc["markdown"] for doc in documents)
    return json.dumps(documents, ensure_ascii=False, indent=2)

# Gradio UI: a PDF upload and an output-format selector feed the converter,
# whose string result is shown in a code viewer.
demo = gr.Interface(
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, "
        "tables, and formulas from your PDF into clean Markdown or "
        "structured JSON."
    ),
    fn=convert_with_mineru,
    inputs=[
        # Only offer .pdf files in the upload picker.
        gr.File(file_types=[".pdf"], label="Upload PDF"),
        # Markdown is preselected; "json" switches to the structured output.
        gr.Radio(
            choices=["markdown", "json"],
            label="Output format",
            value="markdown",
        ),
    ],
    outputs=gr.Code(label="Result"),
)

if __name__ == "__main__":
    # Only start the web server when run as a script; listen on all
    # network interfaces at port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")