euler314 committed on
Commit
ec386e0
·
verified ·
1 Parent(s): 9fc1c5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -42
app.py CHANGED
@@ -1,55 +1,73 @@
1
  # app.py
2
- import fitz # PyMuPDF
3
- from markdownify import markdownify as md
4
- import json
5
  import gradio as gr
6
 
7
def convert_pdf_to_markdown(path):
    """Convert every page of a PDF to Markdown.

    Each page is rendered to HTML by PyMuPDF, the HTML is passed through
    markdownify, and leading/trailing whitespace is stripped from the result.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        A list with one ``{"page": <1-based page number>, "markdown": <str>}``
        dict per page, in document order.
    """
    # Open the document as a context manager so its handle is released even
    # when converting a page raises.
    with fitz.open(path) as doc:
        return [
            {
                "page": number,
                # get_text may yield an empty value; fall back to "" so
                # markdownify always receives a string.
                "markdown": md(page.get_text("html") or "").strip(),
            }
            for number, page in enumerate(doc, start=1)
        ]
17
-
18
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and render the result in the requested format.

    Args:
        pdf_file: Upload object handed over by Gradio; its ``name`` attribute
            is used as the path of the PDF.
        output_format: ``"markdown"`` for a single Markdown string; any other
            value produces JSON.

    Returns:
        Either the Markdown of all pages joined by a ``---`` rule, or a
        pretty-printed JSON document of the form ``{"pages": [...]}``.
    """
    converted = convert_pdf_to_markdown(pdf_file.name)

    # Anything that is not explicitly "markdown" falls through to JSON.
    if output_format != "markdown":
        return json.dumps({"pages": converted}, indent=2, ensure_ascii=False)

    page_separator = "\n\n---\n\n"
    return page_separator.join(entry["markdown"] for entry in converted)
33
 
34
# Gradio interface: a PDF upload plus a format selector, wired to process_upload.
demo = gr.Interface(
    fn=process_upload,
    inputs=[
        gr.File(label="Upload your PDF", file_types=[".pdf"]),
        gr.Radio(choices=["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Converted Output"),
    title="PDF → Markdown/JSON Converter",
    description=(
        "Upload a PDF and get back a professionally converted Markdown "
        "or a structured JSON with each page’s Markdown. "
        "PDFs with images or complex tables may still need manual review."
    ),
    # No example PDFs are bundled; list file paths here to offer them in the UI.
    examples=[],
)
54
 
55
  if __name__ == "__main__":
 
1
  # app.py
2
+ import os, json
 
 
3
  import gradio as gr
4
 
5
+ # MinerU API imports
6
+ from magic_pdf.data.read_api import read_local_pdfs
7
+ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
8
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
9
+ from magic_pdf.config.enums import SupportedPdfParseMethod
10
+
11
def convert_with_mineru(pdf_path, out_format):
    """Convert a PDF to Markdown or JSON using the MinerU (magic_pdf) pipeline.

    Args:
        pdf_path: Path of the PDF, or an upload object that exposes the path
            as ``.name`` (the form in which ``gr.File`` may hand over an
            upload, depending on the Gradio version).
        out_format: ``"markdown"`` to get Markdown text; any other value
            returns JSON.

    Returns:
        For ``"markdown"``, the Markdown of every dataset joined by a ``---``
        rule. Otherwise a pretty-printed JSON array holding one
        ``{"markdown": ..., "content_list": ...}`` object per dataset.

    Raises:
        ValueError: If no PDF was supplied.
    """
    if pdf_path is None:
        raise ValueError("No PDF was uploaded.")
    # Accept both a plain path string and a tempfile-like upload object; the
    # os.path calls below need a real path.
    pdf_path = getattr(pdf_path, "name", pdf_path)

    # 1) Read the file into MinerU dataset(s)
    datasets = read_local_pdfs(pdf_path)

    # Prepare writers: Markdown/JSON land in tmp_dir, extracted images in
    # tmp_dir/images.
    tmp_dir = "output"
    img_dir = os.path.join(tmp_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(tmp_dir)
    img_writer = FileBasedDataWriter(img_dir)
    img_dir_name = os.path.basename(img_dir)

    # Output file names depend only on the input path, so derive them once
    # instead of on every loop iteration.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]
    md_fname = f"{basename}.md"
    json_fname = f"{basename}_content_list.json"

    all_pages = []

    for ds in datasets:
        # 2) Classify, then run inference and the pipeline in the matching mode
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # 3) Dump the Markdown to disk and read it back
        pipe.dump_md(md_writer, md_fname, img_dir_name)
        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
            page_md = f.read()

        # 4) Dump the structured content list to disk and load it back
        pipe.dump_content_list(md_writer, json_fname, img_dir_name)
        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
            content_list = json.load(f)

        all_pages.append({
            "markdown": page_md,
            "content_list": content_list,
        })

    # 5) Return the desired format
    if out_format == "markdown":
        # Concatenate the Markdown of all datasets
        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
    return json.dumps(all_pages, ensure_ascii=False, indent=2)
 
57
 
58
# Gradio interface: the uploaded PDF and the chosen output format are passed
# to convert_with_mineru, and its string result is shown in a code viewer.
demo = gr.Interface(
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Radio(["markdown", "json"], value="markdown", label="Output format"),
    ],
    outputs=gr.Code(label="Result"),
    title="MinerU-Powered PDF → Markdown/JSON",
    description=(
        "Leverage the advanced MinerU engine to extract text, images, tables, "
        "and formulas from your PDF into clean Markdown or structured JSON."
    ),
)
72
 
73
  if __name__ == "__main__":