Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,22 +3,26 @@ import json
|
|
3 |
from huggingface_hub import snapshot_download
|
4 |
import gradio as gr
|
5 |
|
6 |
-
# 1) Pre
|
7 |
-
MODEL_REPO
|
8 |
-
LOCAL_MODELS
|
|
|
|
|
9 |
snapshot_download(
|
10 |
repo_id = MODEL_REPO,
|
11 |
local_dir = LOCAL_MODELS,
|
12 |
-
allow_patterns =
|
|
|
|
|
|
|
13 |
max_workers = 4
|
14 |
)
|
15 |
|
16 |
-
# 2) Write magic-pdf.json pointing at the
|
17 |
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
|
18 |
if not os.path.exists(CFG_PATH):
|
19 |
cfg = {
|
20 |
"device": "cpu",
|
21 |
-
# TWEAKED: point into the nested 'models' folder
|
22 |
"models-dir": os.path.join(LOCAL_MODELS, "models"),
|
23 |
"layout-model": "layoutlmv3",
|
24 |
"formula-enable": True,
|
@@ -44,9 +48,11 @@ def convert_with_mineru(pdf_file, out_fmt):
|
|
44 |
for ds in datasets:
|
45 |
method = ds.classify()
|
46 |
infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
|
47 |
-
pipe = (
|
48 |
-
|
49 |
-
|
|
|
|
|
50 |
|
51 |
base = os.path.splitext(os.path.basename(pdf_file.name))[0]
|
52 |
md_name = f"{base}.md"
|
@@ -70,11 +76,11 @@ demo = gr.Interface(
|
|
70 |
fn=convert_with_mineru,
|
71 |
inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
|
72 |
outputs=gr.Code(label="Result"),
|
73 |
-
title="MinerU PDF → Markdown/JSON (Fixed)",
|
74 |
-
description="
|
75 |
)
|
76 |
|
77 |
if __name__ == "__main__":
|
78 |
-
# Ensure HF_HUB_CACHE
|
79 |
os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
|
80 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
3 |
from huggingface_hub import snapshot_download
|
4 |
import gradio as gr
|
5 |
|
6 |
+
# 1) Pre-download all MinerU model weights under models/
|
7 |
+
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
|
8 |
+
LOCAL_MODELS = "./models"
|
9 |
+
|
10 |
+
# Grab both YOLO and MFR weights in one go
|
11 |
snapshot_download(
|
12 |
repo_id = MODEL_REPO,
|
13 |
local_dir = LOCAL_MODELS,
|
14 |
+
allow_patterns = [
|
15 |
+
"models/MFD/YOLO/*",
|
16 |
+
"models/MFR/*"
|
17 |
+
],
|
18 |
max_workers = 4
|
19 |
)
|
20 |
|
21 |
+
# 2) Write magic-pdf.json pointing at the nested 'models' directory
|
22 |
CFG_PATH = os.path.expanduser("~/magic-pdf.json")
|
23 |
if not os.path.exists(CFG_PATH):
|
24 |
cfg = {
|
25 |
"device": "cpu",
|
|
|
26 |
"models-dir": os.path.join(LOCAL_MODELS, "models"),
|
27 |
"layout-model": "layoutlmv3",
|
28 |
"formula-enable": True,
|
|
|
48 |
for ds in datasets:
|
49 |
method = ds.classify()
|
50 |
infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
|
51 |
+
pipe = (
|
52 |
+
infer.pipe_ocr_mode(img_writer)
|
53 |
+
if method == SupportedPdfParseMethod.OCR
|
54 |
+
else infer.pipe_txt_mode(img_writer)
|
55 |
+
)
|
56 |
|
57 |
base = os.path.splitext(os.path.basename(pdf_file.name))[0]
|
58 |
md_name = f"{base}.md"
|
|
|
76 |
fn=convert_with_mineru,
|
77 |
inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
|
78 |
outputs=gr.Code(label="Result"),
|
79 |
+
title="MinerU PDF → Markdown/JSON (Fully Fixed)",
|
80 |
+
description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly."
|
81 |
)
|
82 |
|
83 |
if __name__ == "__main__":
|
84 |
+
# Ensure HF_HUB_CACHE aligns with our models folder
|
85 |
os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
|
86 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|