Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,84 +1,80 @@
|
|
1 |
import os
|
2 |
import json
|
|
|
3 |
import gradio as gr
|
4 |
|
5 |
-
#
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
}
|
14 |
-
with open(
|
15 |
-
json.dump(
|
16 |
|
17 |
-
# MinerU
|
18 |
from magic_pdf.data.read_api import read_local_pdfs
|
19 |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
20 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
21 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
22 |
|
23 |
-
def convert_with_mineru(
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
tmp_dir = "output"
|
28 |
-
img_dir = os.path.join(tmp_dir, "images")
|
29 |
os.makedirs(img_dir, exist_ok=True)
|
30 |
-
md_writer = FileBasedDataWriter(
|
31 |
img_writer = FileBasedDataWriter(img_dir)
|
32 |
|
33 |
-
|
34 |
-
|
35 |
for ds in datasets:
|
36 |
-
# 2) Classify & infer, with OCR fallback
|
37 |
method = ds.classify()
|
38 |
infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
|
39 |
-
pipe = (
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
pipe.dump_md(md_writer,
|
47 |
-
with open(os.path.join(
|
48 |
-
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
content_list = json.load(f)
|
55 |
|
56 |
-
|
57 |
-
"markdown": page_md,
|
58 |
-
"content_list": content_list
|
59 |
-
})
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
else:
|
65 |
-
return json.dumps(all_pages, ensure_ascii=False, indent=2)
|
66 |
|
67 |
-
# Gradio
|
68 |
demo = gr.Interface(
|
69 |
fn=convert_with_mineru,
|
70 |
-
inputs=[
|
71 |
-
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
72 |
-
gr.Radio(["markdown", "json"], value="markdown", label="Output format")
|
73 |
-
],
|
74 |
outputs=gr.Code(label="Result"),
|
75 |
-
title="MinerU
|
76 |
-
description=
|
77 |
-
"Leverage the advanced MinerU engine to extract text, images, tables, "
|
78 |
-
"and formulas from your PDF into clean Markdown or structured JSON. "
|
79 |
-
"A default CPU-only config is auto-generated if none is found."
|
80 |
-
)
|
81 |
)
|
82 |
|
83 |
if __name__ == "__main__":
|
|
|
|
|
84 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
1 |
import os
|
2 |
import json
|
3 |
+
from huggingface_hub import snapshot_download
|
4 |
import gradio as gr
|
5 |
|
6 |
+
# 1) Pre-download only the MFD/YOLO weights
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
LOCAL_MODELS = "./models"
# snapshot_download mirrors the repository layout under local_dir, so the
# files matched by the pattern below end up in <LOCAL_MODELS>/models/MFD/YOLO/.
snapshot_download(
    repo_id=MODEL_REPO,
    local_dir=LOCAL_MODELS,
    allow_patterns="models/MFD/YOLO/*",
    max_workers=4,
)

# 2) Write a minimal magic-pdf.json pointing to our models
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
# The repository keeps its weights under a top-level "models/" folder, so the
# directory that actually contains MFD/ is one level below LOCAL_MODELS.
# An absolute path keeps the config valid regardless of the working directory.
_MODELS_DIR = os.path.abspath(os.path.join(LOCAL_MODELS, "models"))
# An existing config is left untouched so a user-provided file always wins.
if not os.path.exists(CFG_PATH):
    # NOTE(review): only the MFD/YOLO weights are fetched above, while this
    # config also names a layout model and enables tables — confirm MinerU can
    # obtain those weights at runtime and that it recognizes these key names.
    cfg = {
        "device": "cpu",            # CPU fallback
        "models-dir": _MODELS_DIR,  # directory that contains MFD/YOLO/
        "layout-model": "layoutlmv3",
        "formula-enable": True,
        "table-enable": True,
    }
    with open(CFG_PATH, "w", encoding="utf-8") as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)
|
28 |
|
29 |
+
# 3) MinerU imports
|
30 |
from magic_pdf.data.read_api import read_local_pdfs
|
31 |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
32 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
33 |
from magic_pdf.config.enums import SupportedPdfParseMethod
|
34 |
|
35 |
+
def convert_with_mineru(pdf_file, out_fmt):
    """Convert an uploaded PDF to Markdown or JSON using MinerU.

    Args:
        pdf_file: The upload handed over by the file input: either an object
            exposing the path as ``.name`` or a plain path string.
        out_fmt: ``"markdown"`` returns the Markdown text; any other value
            returns the JSON form.

    Returns:
        str: For ``"markdown"``, the Markdown of every parsed dataset joined
        by a ``---`` separator. Otherwise a JSON array with one object per
        dataset holding its ``markdown`` and ``content_list``.

    Raises:
        ValueError: If no file was supplied.
    """
    # Fail with a readable message instead of an AttributeError on None.name.
    if pdf_file is None:
        raise ValueError("No PDF uploaded: please select a PDF file first.")
    # Accept both file-like uploads (path in .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    datasets = read_local_pdfs(pdf_path)

    out_dir = "output"
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(out_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # Output names depend only on the upload, so compute them once.
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"
    img_rel = os.path.basename(img_dir)

    results = []
    for ds in datasets:
        # Run the OCR pipeline only when classification asks for it.
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # The pipeline writes through md_writer into out_dir; read the
        # artifacts back so they can be returned as text.
        pipe.dump_md(md_writer, md_name, img_rel)
        with open(os.path.join(out_dir, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_rel)
        with open(os.path.join(out_dir, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)
|
|
|
|
|
67 |
|
68 |
+
# 4) Gradio UI
|
69 |
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        # Restrict the picker to PDFs, the only input the converter reads.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        # Preselect "markdown": with no default the handler receives None and
        # silently takes its JSON branch.
        gr.Radio(["markdown", "json"], value="markdown", label="Format"),
    ],
    # The result is shown as plain text in a code viewer.
    outputs=gr.Code(label="Result"),
    title="MinerU PDF → Markdown/JSON (Fixed)",
    description="Pre-downloads YOLO weights and configures MinerU for Spaces.",
)
|
76 |
|
77 |
if __name__ == "__main__":
    # Recommended: ensure HF_HUB_CACHE points to ./models, unless the
    # environment already provides a value.
    # NOTE(review): the snapshot download earlier in this file has already run
    # by this point, so this likely only affects code that reads the variable
    # afterwards — confirm it has the intended effect.
    if "HF_HUB_CACHE" not in os.environ:
        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS
    # Listen on all interfaces on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|