Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,19 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
import gradio as gr
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
# MinerU API imports
|
6 |
from magic_pdf.data.read_api import read_local_pdfs
|
7 |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
@@ -10,9 +22,8 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
10 |
|
11 |
def convert_with_mineru(pdf_path, out_format):
|
12 |
# 1) Read file into MinerU dataset(s)
|
13 |
-
datasets = read_local_pdfs(pdf_path)
|
14 |
|
15 |
-
# Prepare writers
|
16 |
tmp_dir = "output"
|
17 |
img_dir = os.path.join(tmp_dir, "images")
|
18 |
os.makedirs(img_dir, exist_ok=True)
|
@@ -22,22 +33,21 @@ def convert_with_mineru(pdf_path, out_format):
|
|
22 |
all_pages = []
|
23 |
|
24 |
for ds in datasets:
|
25 |
-
# 2) Classify & infer
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
pipe = infer.pipe_txt_mode(img_writer)
|
32 |
|
33 |
-
# 3) Dump per‐document Markdown
|
34 |
basename = os.path.splitext(os.path.basename(pdf_path))[0]
|
35 |
md_fname = f"{basename}.md"
|
36 |
pipe.dump_md(md_writer, md_fname, os.path.basename(img_dir))
|
37 |
with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
|
38 |
page_md = f.read()
|
39 |
|
40 |
-
# 4)
|
41 |
json_fname = f"{basename}_content_list.json"
|
42 |
pipe.dump_content_list(md_writer, json_fname, os.path.basename(img_dir))
|
43 |
with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
|
@@ -50,7 +60,6 @@ def convert_with_mineru(pdf_path, out_format):
|
|
50 |
|
51 |
# 5) Return desired format
|
52 |
if out_format == "markdown":
|
53 |
-
# Concatenate all documents
|
54 |
return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
|
55 |
else:
|
56 |
return json.dumps(all_pages, ensure_ascii=False, indent=2)
|
@@ -60,13 +69,14 @@ demo = gr.Interface(
|
|
60 |
fn=convert_with_mineru,
|
61 |
inputs=[
|
62 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
63 |
-
gr.Radio(["markdown","json"], value="markdown", label="Output format")
|
64 |
],
|
65 |
outputs=gr.Code(label="Result"),
|
66 |
title="MinerU-Powered PDF → Markdown/JSON",
|
67 |
description=(
|
68 |
"Leverage the advanced MinerU engine to extract text, images, tables, "
|
69 |
-
"and formulas from your PDF into clean Markdown or structured JSON."
|
|
|
70 |
)
|
71 |
)
|
72 |
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
import gradio as gr
|
4 |
|
5 |
+
# Ensure default config for magic-pdf
|
6 |
+
CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
|
7 |
+
if not os.path.exists(CONFIG_PATH):
|
8 |
+
default_cfg = {
|
9 |
+
"device": "cpu", # force CPU inference
|
10 |
+
"layout_model": "layout/mobilenetv3",
|
11 |
+
"formula_enable": True,
|
12 |
+
"table_enable": True
|
13 |
+
}
|
14 |
+
with open(CONFIG_PATH, "w", encoding="utf-8") as cfg:
|
15 |
+
json.dump(default_cfg, cfg, ensure_ascii=False, indent=2)
|
16 |
+
|
17 |
# MinerU API imports
|
18 |
from magic_pdf.data.read_api import read_local_pdfs
|
19 |
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
|
|
22 |
|
23 |
def convert_with_mineru(pdf_path, out_format):
|
24 |
# 1) Read file into MinerU dataset(s)
|
25 |
+
datasets = read_local_pdfs(pdf_path)
|
26 |
|
|
|
27 |
tmp_dir = "output"
|
28 |
img_dir = os.path.join(tmp_dir, "images")
|
29 |
os.makedirs(img_dir, exist_ok=True)
|
|
|
33 |
all_pages = []
|
34 |
|
35 |
for ds in datasets:
|
36 |
+
# 2) Classify & infer, with OCR fallback
|
37 |
+
method = ds.classify()
|
38 |
+
infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
|
39 |
+
pipe = (infer.pipe_ocr_mode(img_writer)
|
40 |
+
if method == SupportedPdfParseMethod.OCR
|
41 |
+
else infer.pipe_txt_mode(img_writer))
|
|
|
42 |
|
43 |
+
# 3) Dump per‐document Markdown
|
44 |
basename = os.path.splitext(os.path.basename(pdf_path))[0]
|
45 |
md_fname = f"{basename}.md"
|
46 |
pipe.dump_md(md_writer, md_fname, os.path.basename(img_dir))
|
47 |
with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
|
48 |
page_md = f.read()
|
49 |
|
50 |
+
# 4) Dump structured JSON
|
51 |
json_fname = f"{basename}_content_list.json"
|
52 |
pipe.dump_content_list(md_writer, json_fname, os.path.basename(img_dir))
|
53 |
with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
|
|
|
60 |
|
61 |
# 5) Return desired format
|
62 |
if out_format == "markdown":
|
|
|
63 |
return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
|
64 |
else:
|
65 |
return json.dumps(all_pages, ensure_ascii=False, indent=2)
|
|
|
69 |
fn=convert_with_mineru,
|
70 |
inputs=[
|
71 |
gr.File(label="Upload PDF", file_types=[".pdf"]),
|
72 |
+
gr.Radio(["markdown", "json"], value="markdown", label="Output format")
|
73 |
],
|
74 |
outputs=gr.Code(label="Result"),
|
75 |
title="MinerU-Powered PDF → Markdown/JSON",
|
76 |
description=(
|
77 |
"Leverage the advanced MinerU engine to extract text, images, tables, "
|
78 |
+
"and formulas from your PDF into clean Markdown or structured JSON. "
|
79 |
+
"A default CPU-only config is auto-generated if none is found."
|
80 |
)
|
81 |
)
|
82 |
|