euler314 committed on
Commit ecf768f · verified · 1 Parent(s): dfce863

Update app.py

Files changed (1)
  1. app.py +52 -56
app.py CHANGED
@@ -1,84 +1,80 @@
 import os
 import json
+from huggingface_hub import snapshot_download
 import gradio as gr

-# Ensure default config for magic-pdf
-CONFIG_PATH = os.path.expanduser("~/magic-pdf.json")
-if not os.path.exists(CONFIG_PATH):
-    default_cfg = {
-        "device": "cpu",  # force CPU inference
-        "layout_model": "layout/mobilenetv3",
-        "formula_enable": True,
-        "table_enable": True
+# 1) Pre-download only the MFD/YOLO weights
+MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
+LOCAL_MODELS = "./models"
+snapshot_download(
+    repo_id=MODEL_REPO,
+    local_dir=LOCAL_MODELS,
+    allow_patterns="models/MFD/YOLO/*",
+    max_workers=4
+)
+
+# 2) Write a minimal magic-pdf.json pointing to our models
+CFG_PATH = os.path.expanduser("~/magic-pdf.json")
+if not os.path.exists(CFG_PATH):
+    cfg = {
+        "device": "cpu",             # CPU fallback
+        "models-dir": LOCAL_MODELS,  # where we downloaded yolo_v8_ft.pt
+        "layout-model": "layoutlmv3",
+        "formula-enable": True,
+        "table-enable": True
     }
-    with open(CONFIG_PATH, "w", encoding="utf-8") as cfg:
-        json.dump(default_cfg, cfg, ensure_ascii=False, indent=2)
+    with open(CFG_PATH, "w", encoding="utf-8") as f:
+        json.dump(cfg, f, ensure_ascii=False, indent=2)

-# MinerU API imports
+# 3) MinerU imports
 from magic_pdf.data.read_api import read_local_pdfs
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.config.enums import SupportedPdfParseMethod

-def convert_with_mineru(pdf_path, out_format):
-    # 1) Read file into MinerU dataset(s)
-    datasets = read_local_pdfs(pdf_path)
-
-    tmp_dir = "output"
-    img_dir = os.path.join(tmp_dir, "images")
+def convert_with_mineru(pdf_file, out_fmt):
+    datasets = read_local_pdfs(pdf_file.name)
+    tmp, img_dir = "output", os.path.join("output", "images")
     os.makedirs(img_dir, exist_ok=True)
-    md_writer = FileBasedDataWriter(tmp_dir)
+    md_writer = FileBasedDataWriter(tmp)
     img_writer = FileBasedDataWriter(img_dir)

-    all_pages = []
-
+    results = []
     for ds in datasets:
-        # 2) Classify & infer, with OCR fallback
         method = ds.classify()
         infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
-        pipe = (infer.pipe_ocr_mode(img_writer)
-                if method == SupportedPdfParseMethod.OCR
-                else infer.pipe_txt_mode(img_writer))
-
-        # 3) Dump per-document Markdown
-        basename = os.path.splitext(os.path.basename(pdf_path))[0]
-        md_fname = f"{basename}.md"
-        pipe.dump_md(md_writer, md_fname, os.path.basename(img_dir))
-        with open(os.path.join(tmp_dir, md_fname), "r", encoding="utf-8") as f:
-            page_md = f.read()
+        pipe = (
+            infer.pipe_ocr_mode(img_writer)
+            if method == SupportedPdfParseMethod.OCR
+            else infer.pipe_txt_mode(img_writer)
+        )
+        base = os.path.splitext(os.path.basename(pdf_file.name))[0]
+        md_name = f"{base}.md"
+        pipe.dump_md(md_writer, md_name, os.path.basename(img_dir))
+        with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
+            md_text = f.read()

-        # 4) Dump structured JSON
-        json_fname = f"{basename}_content_list.json"
-        pipe.dump_content_list(md_writer, json_fname, os.path.basename(img_dir))
-        with open(os.path.join(tmp_dir, json_fname), "r", encoding="utf-8") as f:
-            content_list = json.load(f)
+        json_name = f"{base}_content_list.json"
+        pipe.dump_content_list(md_writer, json_name, os.path.basename(img_dir))
+        with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
+            content = json.load(f)

-        all_pages.append({
-            "markdown": page_md,
-            "content_list": content_list
-        })
+        results.append({"markdown": md_text, "content_list": content})

-    # 5) Return desired format
-    if out_format == "markdown":
-        return "\n\n---\n\n".join(p["markdown"] for p in all_pages)
-    else:
-        return json.dumps(all_pages, ensure_ascii=False, indent=2)
+    if out_fmt == "markdown":
+        return "\n\n---\n\n".join(r["markdown"] for r in results)
+    return json.dumps(results, ensure_ascii=False, indent=2)

-# Gradio interface
+# 4) Gradio UI
 demo = gr.Interface(
     fn=convert_with_mineru,
-    inputs=[
-        gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Radio(["markdown", "json"], value="markdown", label="Output format")
-    ],
+    inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
     outputs=gr.Code(label="Result"),
-    title="MinerU-Powered PDF → Markdown/JSON",
-    description=(
-        "Leverage the advanced MinerU engine to extract text, images, tables, "
-        "and formulas from your PDF into clean Markdown or structured JSON. "
-        "A default CPU-only config is auto-generated if none is found."
-    )
+    title="MinerU PDF → Markdown/JSON (Fixed)",
+    description="Pre-downloads YOLO weights and configures MinerU for Spaces."
 )

 if __name__ == "__main__":
+    # Recommended: ensure HF_HUB_CACHE points to ./models
+    os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
     demo.launch(server_name="0.0.0.0", server_port=7860)
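
A quick way to sanity-check the new pre-download step before starting the Space: the sketch below is not part of this commit and assumes, as the allow_patterns filter above implies, that the weights live under models/MFD/YOLO/ in the opendatalab/pdf-extract-kit-1.0 repo.

    # Sketch only: confirm the filtered snapshot actually fetched the detection weights.
    import os
    from huggingface_hub import snapshot_download

    local_dir = snapshot_download(
        repo_id="opendatalab/pdf-extract-kit-1.0",
        local_dir="./models",
        allow_patterns="models/MFD/YOLO/*",  # download only the YOLO detection weights
    )

    # List whatever landed under the expected subfolder, with file sizes.
    for root, _dirs, files in os.walk(os.path.join(local_dir, "models", "MFD", "YOLO")):
        for name in files:
            path = os.path.join(root, name)
            print(path, os.path.getsize(path), "bytes")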