Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,86 +1,56 @@
|
|
1 |
-
|
|
|
|
|
2 |
import json
|
3 |
-
from huggingface_hub import snapshot_download
|
4 |
import gradio as gr
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
"
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
# 3) MinerU imports
|
35 |
-
from magic_pdf.data.read_api import read_local_pdfs
|
36 |
-
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
37 |
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
38 |
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
39 |
-
|
40 |
-
def convert_with_mineru(pdf_file, out_fmt):
    """Run an uploaded PDF through the MinerU pipeline and return the result.

    Args:
        pdf_file: Upload object whose ``.name`` attribute is the PDF path.
        out_fmt: ``"markdown"`` to get the Markdown of every dataset joined
            by a horizontal rule; any other value returns JSON.

    Returns:
        str: Either the joined Markdown, or a JSON array with one
        ``{"markdown": ..., "content_list": ...}`` object per dataset.
    """
    pdf_path = pdf_file.name
    datasets = read_local_pdfs(pdf_path)

    # Markdown/JSON go to ./output, extracted images to ./output/images.
    out_dir = "output"
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(out_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # Output file names depend only on the uploaded file's name, so every
    # dataset writes to the same two files; each is read back right after
    # it is dumped, before the next iteration can overwrite it.
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    md_name = stem + ".md"
    json_name = stem + "_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    results = []
    for dataset in datasets:
        # Pick the OCR or text pipeline according to the classifier.
        use_ocr = dataset.classify() == SupportedPdfParseMethod.OCR
        inference = dataset.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = inference.pipe_ocr_mode(img_writer)
        else:
            pipe = inference.pipe_txt_mode(img_writer)

        pipe.dump_md(md_writer, md_name, img_dir_name)
        with open(os.path.join(out_dir, md_name), encoding="utf-8") as fh:
            markdown = fh.read()

        pipe.dump_content_list(md_writer, json_name, img_dir_name)
        with open(os.path.join(out_dir, json_name), encoding="utf-8") as fh:
            content_list = json.load(fh)

        results.append({"markdown": markdown, "content_list": content_list})

    if out_fmt != "markdown":
        return json.dumps(results, ensure_ascii=False, indent=2)
    return "\n\n---\n\n".join(entry["markdown"] for entry in results)
|
73 |
-
|
74 |
-
# 4) Gradio UI
|
75 |
demo = gr.Interface(
|
76 |
-
fn=
|
77 |
-
inputs=[
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
)
|
82 |
|
83 |
if __name__ == "__main__":
|
84 |
-
# Ensure HF_HUB_CACHE aligns with our models folder
|
85 |
-
os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
|
86 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
1 |
+
# app.py
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from markdownify import markdownify as md
|
4 |
import json
|
|
|
5 |
import gradio as gr
|
6 |
|
7 |
+
def convert_pdf_to_markdown(path):
    """Convert each page of a PDF to Markdown.

    Every page is rendered to HTML by PyMuPDF and that HTML is then
    converted to Markdown with ``markdownify``.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        A list with one dict per page, in document order, shaped like
        ``{"page": <1-based page number>, "markdown": <page Markdown>}``.
    """
    pages_md = []
    # Open the document as a context manager so it is closed even when
    # rendering or converting a page raises.
    with fitz.open(path) as doc:
        for number, page in enumerate(doc, start=1):
            # Fall back to "" if the HTML extraction yields a falsy value.
            html = page.get_text("html") or ""
            # Only leading/trailing whitespace is trimmed here; blank lines
            # inside the page are left exactly as markdownify produced them.
            pages_md.append({"page": number, "markdown": md(html).strip()})
    return pages_md
|
17 |
+
|
18 |
+
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return it in the requested format.

    Args:
        pdf_file: The upload handed over by the UI: either a tempfile-like
            object exposing its path as ``.name`` or a plain path string.
        output_format: ``"markdown"`` for one Markdown document; any other
            value (the UI offers ``"json"``) produces JSON.

    Returns:
        str: For ``"markdown"``, the Markdown of all pages joined by a
        horizontal rule. Otherwise a pretty-printed JSON object
        ``{"pages": [{"page": n, "markdown": "..."}, ...]}``.

    Raises:
        ValueError: If no file was uploaded (``pdf_file`` is ``None``).
    """
    # The form can be submitted without a file; fail with a clear message
    # rather than an AttributeError on ``None.name``.
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both tempfile-like objects and bare path strings.
    path = getattr(pdf_file, "name", pdf_file)
    pages = convert_pdf_to_markdown(path)

    if output_format == "markdown":
        return "\n\n---\n\n".join(p["markdown"] for p in pages)
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
|
33 |
+
|
34 |
+
# Gradio interface
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
# Web UI: a PDF upload and a format selector go in, converted text comes out.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload your PDF", file_types=[".pdf"]),
        gr.Radio(
            choices=["markdown", "json"],
            value="markdown",
            label="Output format",
        ),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON Converter",
    description=(
        "Upload a PDF and get back a professionally converted Markdown "
        "or a structured JSON with each page’s Markdown. "
        "PDFs with images or complex tables may still need manual review."
    ),
    # No example PDFs are bundled; list them here if desired.
    examples=[],
)

if __name__ == "__main__":
    # Listen on every network interface, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|