euler314 committed on
Commit
dfce863
·
verified ·
1 Parent(s): b0a0fb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -16
app.py CHANGED
@@ -1,7 +1,19 @@
1
- # app.py
2
- import os, json
3
  import gradio as gr
4
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  # MinerU API imports
6
  from magic_pdf.data.read_api import read_local_pdfs
7
  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
@@ -10,9 +22,8 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
10
 
11
  def convert_with_mineru(pdf_path, out_format):
12
  # 1) Read file into MinerU dataset(s)
13
- datasets = read_local_pdfs(pdf_path) # returns list[PymuDocDataset] :contentReference[oaicite:3]{index=3}
14
 
15
- # Prepare writers
16
  tmp_dir = "output"
17
  img_dir = os.path.join(tmp_dir, "images")
18
  os.makedirs(img_dir, exist_ok=True)
@@ -22,22 +33,21 @@ def convert_with_mineru(pdf_path, out_format):
22
  all_pages = []
23
 
24
  for ds in datasets:
25
- # 2) Classify & infer
26
- if ds.classify() == SupportedPdfParseMethod.OCR:
27
- infer = ds.apply(doc_analyze, ocr=True)
28
- pipe = infer.pipe_ocr_mode(img_writer)
29
- else:
30
- infer = ds.apply(doc_analyze, ocr=False)
31
- pipe = infer.pipe_txt_mode(img_writer)
32
 
33
- # 3) Dump per‐document Markdown + collect
34
  basename = os.path.splitext(os.path.basename(pdf_path))[0]
35
  md_fname = f"{basename}.md"
36
  pipe.dump_md(md_writer, md_fname, os.path.basename(img_dir))
37
  with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
38
  page_md = f.read()
39
 
40
- # 4) Collect structured JSON (middle JSON)
41
  json_fname = f"{basename}_content_list.json"
42
  pipe.dump_content_list(md_writer, json_fname, os.path.basename(img_dir))
43
  with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
@@ -50,7 +60,6 @@ def convert_with_mineru(pdf_path, out_format):
50
 
51
  # 5) Return desired format
52
  if out_format == "markdown":
53
- # Concatenate all documents
54
  return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
55
  else:
56
  return json.dumps(all_pages, ensure_ascii=False, indent=2)
@@ -60,13 +69,14 @@ demo = gr.Interface(
60
  fn=convert_with_mineru,
61
  inputs=[
62
  gr.File(label="Upload PDF", file_types=[".pdf"]),
63
- gr.Radio(["markdown","json"], value="markdown", label="Output format")
64
  ],
65
  outputs=gr.Code(label="Result"),
66
  title="MinerU-Powered PDF → Markdown/JSON",
67
  description=(
68
  "Leverage the advanced MinerU engine to extract text, images, tables, "
69
- "and formulas from your PDF into clean Markdown or structured JSON."
 
70
  )
71
  )
72
 
 
1
+ import os
2
+ import json
3
  import gradio as gr
4
 
5
+ # Ensure default config for magic-pdf
6
+ CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
7
+ if not os.path.exists(CONFIG_PATH):
8
+ default_cfg = {
9
+ "device": "cpu", # force CPU inference
10
+ "layout_model": "layout/mobilenetv3",
11
+ "formula_enable": True,
12
+ "table_enable": True
13
+ }
14
+ with open(CONFIG_PATH, "w", encoding="utf-8") as cfg:
15
+ json.dump(default_cfg, cfg, ensure_ascii=False, indent=2)
16
+
17
  # MinerU API imports
18
  from magic_pdf.data.read_api import read_local_pdfs
19
  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 
22
 
23
  def convert_with_mineru(pdf_path, out_format):
24
  # 1) Read file into MinerU dataset(s)
25
+ datasets = read_local_pdfs(pdf_path)
26
 
 
27
  tmp_dir = "output"
28
  img_dir = os.path.join(tmp_dir, "images")
29
  os.makedirs(img_dir, exist_ok=True)
 
33
  all_pages = []
34
 
35
  for ds in datasets:
36
+ # 2) Classify & infer, with OCR fallback
37
+ method = ds.classify()
38
+ infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
39
+ pipe = (infer.pipe_ocr_mode(img_writer)
40
+ if method == SupportedPdfParseMethod.OCR
41
+ else infer.pipe_txt_mode(img_writer))
 
42
 
43
+ # 3) Dump per‐document Markdown
44
  basename = os.path.splitext(os.path.basename(pdf_path))[0]
45
  md_fname = f"{basename}.md"
46
  pipe.dump_md(md_writer, md_fname, os.path.basename(img_dir))
47
  with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
48
  page_md = f.read()
49
 
50
+ # 4) Dump structured JSON
51
  json_fname = f"{basename}_content_list.json"
52
  pipe.dump_content_list(md_writer, json_fname, os.path.basename(img_dir))
53
  with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
 
60
 
61
  # 5) Return desired format
62
  if out_format == "markdown":
 
63
  return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
64
  else:
65
  return json.dumps(all_pages, ensure_ascii=False, indent=2)
 
69
  fn=convert_with_mineru,
70
  inputs=[
71
  gr.File(label="Upload PDF", file_types=[".pdf"]),
72
+ gr.Radio(["markdown", "json"], value="markdown", label="Output format")
73
  ],
74
  outputs=gr.Code(label="Result"),
75
  title="MinerU-Powered PDF → Markdown/JSON",
76
  description=(
77
  "Leverage the advanced MinerU engine to extract text, images, tables, "
78
+ "and formulas from your PDF into clean Markdown or structured JSON. "
79
+ "A default CPU-only config is auto-generated if none is found."
80
  )
81
  )
82