Update app.py
app.py CHANGED
@@ -3,25 +3,26 @@ import json
 from huggingface_hub import snapshot_download
 import gradio as gr
 
-# 1) Pre…
-MODEL_REPO…
-LOCAL_MODELS…
+# 1) Pre-download only the MFD/YOLO weights into ./models
+MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
+LOCAL_MODELS = "./models"
 snapshot_download(
-    repo_id=MODEL_REPO,
-    local_dir=LOCAL_MODELS,
-    allow_patterns="models/MFD/YOLO/*",
-    max_workers=4
+    repo_id = MODEL_REPO,
+    local_dir = LOCAL_MODELS,
+    allow_patterns = "models/MFD/YOLO/*",
+    max_workers = 4
 )
 
-# 2) Write…
+# 2) Write magic-pdf.json pointing at the actual weights path
 CFG_PATH = os.path.expanduser("~/magic-pdf.json")
 if not os.path.exists(CFG_PATH):
     cfg = {
-        "device":…
-        …
-        "…
+        "device": "cpu",
+        # TWEAKED: point into the nested 'models' folder
+        "models-dir": os.path.join(LOCAL_MODELS, "models"),
+        "layout-model": "layoutlmv3",
         "formula-enable": True,
-        "table-enable":…
+        "table-enable": True
     }
     with open(CFG_PATH, "w", encoding="utf-8") as f:
         json.dump(cfg, f, ensure_ascii=False, indent=2)
@@ -36,19 +37,18 @@ def convert_with_mineru(pdf_file, out_fmt):
     datasets = read_local_pdfs(pdf_file.name)
     tmp, img_dir = "output", os.path.join("output", "images")
     os.makedirs(img_dir, exist_ok=True)
-    md_writer…
+    md_writer = FileBasedDataWriter(tmp)
     img_writer = FileBasedDataWriter(img_dir)
 
     results = []
     for ds in datasets:
         method = ds.classify()
-        infer…
-        pipe…
-        …
-        …
-        …
-        )
-        base = os.path.splitext(os.path.basename(pdf_file.name))[0]
+        infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
+        pipe = (infer.pipe_ocr_mode(img_writer)
+                if method == SupportedPdfParseMethod.OCR
+                else infer.pipe_txt_mode(img_writer))
+
+        base = os.path.splitext(os.path.basename(pdf_file.name))[0]
         md_name = f"{base}.md"
         pipe.dump_md(md_writer, md_name, os.path.basename(img_dir))
         with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
@@ -71,10 +71,10 @@ demo = gr.Interface(
     inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
     outputs=gr.Code(label="Result"),
     title="MinerU PDF → Markdown/JSON (Fixed)",
-    description="…
+    description="Downloads YOLO weights and points magic-pdf at the correct folder."
 )
 
 if __name__ == "__main__":
-    #…
+    # Ensure HF_HUB_CACHE is consistent with our models folder
     os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
     demo.launch(server_name="0.0.0.0", server_port=7860)
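
A note on the "models-dir" tweak above: snapshot_download keeps repo-relative paths under local_dir, so the "models/MFD/YOLO/*" pattern places the weights in ./models/models/MFD/YOLO/, the nested folder that "models-dir" points at. A minimal sketch to verify the layout after the download, using the same arguments as the diff (the listing is only a local check, not part of the Space's code):

import os
from huggingface_hub import snapshot_download

# Sketch: repeat the diff's download call; snapshot_download returns the
# local folder and preserves the repo-relative "models/" prefix, hence the
# nested path below.
local_dir = snapshot_download(
    repo_id="opendatalab/pdf-extract-kit-1.0",
    local_dir="./models",
    allow_patterns="models/MFD/YOLO/*",
    max_workers=4,
)
weights_dir = os.path.join(local_dir, "models", "MFD", "YOLO")
print(weights_dir, os.listdir(weights_dir))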
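
For reference, the config block only writes ~/magic-pdf.json on the first run; later runs reuse it. A small sketch to inspect what was written; the expected content simply mirrors the cfg dict in the diff, assuming LOCAL_MODELS = "./models":

import json
import os

# Sketch: print the config the app wrote on first run.
with open(os.path.expanduser("~/magic-pdf.json"), encoding="utf-8") as f:
    print(json.dumps(json.load(f), indent=2))

# Expected, given the cfg dict above:
# {
#   "device": "cpu",
#   "models-dir": "./models/models",
#   "layout-model": "layoutlmv3",
#   "formula-enable": true,
#   "table-enable": true
# }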