euler314 committed on
Commit
27b468a
·
verified ·
1 Parent(s): ecf768f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -22
app.py CHANGED
@@ -3,25 +3,26 @@ import json
3
  from huggingface_hub import snapshot_download
4
  import gradio as gr
5
 
6
- # 1) Pre-download only the MFD/YOLO weights
7
- MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
8
- LOCAL_MODELS = "./models"
9
  snapshot_download(
10
- repo_id=MODEL_REPO,
11
- local_dir=LOCAL_MODELS,
12
- allow_patterns="models/MFD/YOLO/*",
13
- max_workers=4
14
  )
15
 
16
- # 2) Write a minimal magic-pdf.json pointing to our models
17
  CFG_PATH = os.path.expanduser("~/magic-pdf.json")
18
  if not os.path.exists(CFG_PATH):
19
  cfg = {
20
- "device": "cpu", # CPU fallback
21
- "models-dir": LOCAL_MODELS, # where we downloaded yolo_v8_ft.pt
22
- "layout-model": "layoutlmv3",
 
23
  "formula-enable": True,
24
- "table-enable": True
25
  }
26
  with open(CFG_PATH, "w", encoding="utf-8") as f:
27
  json.dump(cfg, f, ensure_ascii=False, indent=2)
@@ -36,19 +37,18 @@ def convert_with_mineru(pdf_file, out_fmt):
36
  datasets = read_local_pdfs(pdf_file.name)
37
  tmp, img_dir = "output", os.path.join("output", "images")
38
  os.makedirs(img_dir, exist_ok=True)
39
- md_writer = FileBasedDataWriter(tmp)
40
  img_writer = FileBasedDataWriter(img_dir)
41
 
42
  results = []
43
  for ds in datasets:
44
  method = ds.classify()
45
- infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
46
- pipe = (
47
- infer.pipe_ocr_mode(img_writer)
48
- if method == SupportedPdfParseMethod.OCR
49
- else infer.pipe_txt_mode(img_writer)
50
- )
51
- base = os.path.splitext(os.path.basename(pdf_file.name))[0]
52
  md_name = f"{base}.md"
53
  pipe.dump_md(md_writer, md_name, os.path.basename(img_dir))
54
  with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
@@ -71,10 +71,10 @@ demo = gr.Interface(
71
  inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
72
  outputs=gr.Code(label="Result"),
73
  title="MinerU PDF → Markdown/JSON (Fixed)",
74
- description="Pre-downloads YOLO weights and configures MinerU for Spaces."
75
  )
76
 
77
  if __name__ == "__main__":
78
- # Recommended: ensure HF_HUB_CACHE points to ./models
79
  os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
80
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  from huggingface_hub import snapshot_download
4
  import gradio as gr
5
 
6
+ # 1) Pre‐download only the MFD/YOLO weights into ./models
7
+ MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
8
+ LOCAL_MODELS = "./models"
9
  snapshot_download(
10
+ repo_id = MODEL_REPO,
11
+ local_dir = LOCAL_MODELS,
12
+ allow_patterns = "models/MFD/YOLO/*",
13
+ max_workers = 4
14
  )
15
 
16
+ # 2) Write magic-pdf.json pointing at the actual weights path
17
  CFG_PATH = os.path.expanduser("~/magic-pdf.json")
18
  if not os.path.exists(CFG_PATH):
19
  cfg = {
20
+ "device": "cpu",
21
+ # ← TWEAKED: point into the nested 'models' folder
22
+ "models-dir": os.path.join(LOCAL_MODELS, "models"),
23
+ "layout-model": "layoutlmv3",
24
  "formula-enable": True,
25
+ "table-enable": True
26
  }
27
  with open(CFG_PATH, "w", encoding="utf-8") as f:
28
  json.dump(cfg, f, ensure_ascii=False, indent=2)
 
37
  datasets = read_local_pdfs(pdf_file.name)
38
  tmp, img_dir = "output", os.path.join("output", "images")
39
  os.makedirs(img_dir, exist_ok=True)
40
+ md_writer = FileBasedDataWriter(tmp)
41
  img_writer = FileBasedDataWriter(img_dir)
42
 
43
  results = []
44
  for ds in datasets:
45
  method = ds.classify()
46
+ infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
47
+ pipe = (infer.pipe_ocr_mode(img_writer)
48
+ if method == SupportedPdfParseMethod.OCR
49
+ else infer.pipe_txt_mode(img_writer))
50
+
51
+ base = os.path.splitext(os.path.basename(pdf_file.name))[0]
 
52
  md_name = f"{base}.md"
53
  pipe.dump_md(md_writer, md_name, os.path.basename(img_dir))
54
  with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
 
71
  inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
72
  outputs=gr.Code(label="Result"),
73
  title="MinerU PDF → Markdown/JSON (Fixed)",
74
+ description="Downloads YOLO weights and points magic-pdf at the correct folder."
75
  )
76
 
77
  if __name__ == "__main__":
78
+ # Ensure HF_HUB_CACHE is consistent with our models folder
79
  os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
80
  demo.launch(server_name="0.0.0.0", server_port=7860)