euler314 committed on
Commit
c842df1
·
verified ·
1 Parent(s): 27b468a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -12
app.py CHANGED
@@ -3,22 +3,26 @@ import json
3
  from huggingface_hub import snapshot_download
4
  import gradio as gr
5
 
6
- # 1) Pre‐download only the MFD/YOLO weights into ./models
7
- MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
8
- LOCAL_MODELS = "./models"
 
 
9
  snapshot_download(
10
  repo_id = MODEL_REPO,
11
  local_dir = LOCAL_MODELS,
12
- allow_patterns = "models/MFD/YOLO/*",
 
 
 
13
  max_workers = 4
14
  )
15
 
16
- # 2) Write magic-pdf.json pointing at the actual weights path
17
  CFG_PATH = os.path.expanduser("~/magic-pdf.json")
18
  if not os.path.exists(CFG_PATH):
19
  cfg = {
20
  "device": "cpu",
21
- # ← TWEAKED: point into the nested 'models' folder
22
  "models-dir": os.path.join(LOCAL_MODELS, "models"),
23
  "layout-model": "layoutlmv3",
24
  "formula-enable": True,
@@ -44,9 +48,11 @@ def convert_with_mineru(pdf_file, out_fmt):
44
  for ds in datasets:
45
  method = ds.classify()
46
  infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
47
- pipe = (infer.pipe_ocr_mode(img_writer)
48
- if method == SupportedPdfParseMethod.OCR
49
- else infer.pipe_txt_mode(img_writer))
 
 
50
 
51
  base = os.path.splitext(os.path.basename(pdf_file.name))[0]
52
  md_name = f"{base}.md"
@@ -70,11 +76,11 @@ demo = gr.Interface(
70
  fn=convert_with_mineru,
71
  inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
72
  outputs=gr.Code(label="Result"),
73
- title="MinerU PDF → Markdown/JSON (Fixed)",
74
- description="Downloads YOLO weights and points magic-pdf at the correct folder."
75
  )
76
 
77
  if __name__ == "__main__":
78
- # Ensure HF_HUB_CACHE is consistent with our models folder
79
  os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
80
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  from huggingface_hub import snapshot_download
4
  import gradio as gr
5
 
6
+ # 1) Pre-download all MinerU model weights under models/
7
+ MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
8
+ LOCAL_MODELS = "./models"
9
+
10
+ # Grab both YOLO and MFR weights in one go
11
  snapshot_download(
12
  repo_id = MODEL_REPO,
13
  local_dir = LOCAL_MODELS,
14
+ allow_patterns = [
15
+ "models/MFD/YOLO/*",
16
+ "models/MFR/*"
17
+ ],
18
  max_workers = 4
19
  )
20
 
21
+ # 2) Write magic-pdf.json pointing at the nested 'models' directory
22
  CFG_PATH = os.path.expanduser("~/magic-pdf.json")
23
  if not os.path.exists(CFG_PATH):
24
  cfg = {
25
  "device": "cpu",
 
26
  "models-dir": os.path.join(LOCAL_MODELS, "models"),
27
  "layout-model": "layoutlmv3",
28
  "formula-enable": True,
 
48
  for ds in datasets:
49
  method = ds.classify()
50
  infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
51
+ pipe = (
52
+ infer.pipe_ocr_mode(img_writer)
53
+ if method == SupportedPdfParseMethod.OCR
54
+ else infer.pipe_txt_mode(img_writer)
55
+ )
56
 
57
  base = os.path.splitext(os.path.basename(pdf_file.name))[0]
58
  md_name = f"{base}.md"
 
76
  fn=convert_with_mineru,
77
  inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
78
  outputs=gr.Code(label="Result"),
79
+ title="MinerU PDF → Markdown/JSON (Fully Fixed)",
80
+ description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly."
81
  )
82
 
83
  if __name__ == "__main__":
84
+ # Ensure HF_HUB_CACHE aligns with our models folder
85
  os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
86
  demo.launch(server_name="0.0.0.0", server_port=7860)