euler314 committed on
Commit
e219826
·
verified ·
1 Parent(s): e5d8bd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -78
app.py CHANGED
@@ -1,86 +1,56 @@
1
- import os
 
 
2
  import json
3
- from huggingface_hub import snapshot_download
4
  import gradio as gr
5
 
6
- # 1) Pre-download all MinerU model weights under models/
7
- MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
8
- LOCAL_MODELS = "./models"
9
-
10
- # Grab both YOLO and MFR weights in one go
11
- snapshot_download(
12
- repo_id = MODEL_REPO,
13
- local_dir = LOCAL_MODELS,
14
- allow_patterns = [
15
- "models/MFD/YOLO/*",
16
- "models/MFR/*"
17
- ],
18
- max_workers = 4
19
- )
20
-
21
- # 2) Write magic-pdf.json pointing at the nested 'models' directory
22
- CFG_PATH = os.path.expanduser("~/magic-pdf.json")
23
- if not os.path.exists(CFG_PATH):
24
- cfg = {
25
- "device": "cpu",
26
- "models-dir": os.path.join(LOCAL_MODELS, "models"),
27
- "layout-model": "layoutlmv3",
28
- "formula-enable": True,
29
- "table-enable": True
30
- }
31
- with open(CFG_PATH, "w", encoding="utf-8") as f:
32
- json.dump(cfg, f, ensure_ascii=False, indent=2)
33
-
34
- # 3) MinerU imports
35
- from magic_pdf.data.read_api import read_local_pdfs
36
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter
37
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
38
- from magic_pdf.config.enums import SupportedPdfParseMethod
39
-
40
- def convert_with_mineru(pdf_file, out_fmt):
41
- datasets = read_local_pdfs(pdf_file.name)
42
- tmp, img_dir = "output", os.path.join("output", "images")
43
- os.makedirs(img_dir, exist_ok=True)
44
- md_writer = FileBasedDataWriter(tmp)
45
- img_writer = FileBasedDataWriter(img_dir)
46
-
47
- results = []
48
- for ds in datasets:
49
- method = ds.classify()
50
- infer = ds.apply(doc_analyze, ocr=(method == SupportedPdfParseMethod.OCR))
51
- pipe = (
52
- infer.pipe_ocr_mode(img_writer)
53
- if method == SupportedPdfParseMethod.OCR
54
- else infer.pipe_txt_mode(img_writer)
55
- )
56
-
57
- base = os.path.splitext(os.path.basename(pdf_file.name))[0]
58
- md_name = f"{base}.md"
59
- pipe.dump_md(md_writer, md_name, os.path.basename(img_dir))
60
- with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
61
- md_text = f.read()
62
-
63
- json_name = f"{base}_content_list.json"
64
- pipe.dump_content_list(md_writer, json_name, os.path.basename(img_dir))
65
- with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
66
- content = json.load(f)
67
-
68
- results.append({"markdown": md_text, "content_list": content})
69
-
70
- if out_fmt == "markdown":
71
- return "\n\n---\n\n".join(r["markdown"] for r in results)
72
- return json.dumps(results, ensure_ascii=False, indent=2)
73
-
74
- # 4) Gradio UI
75
  demo = gr.Interface(
76
- fn=convert_with_mineru,
77
- inputs=[gr.File(label="Upload PDF"), gr.Radio(["markdown", "json"], label="Format")],
78
- outputs=gr.Code(label="Result"),
79
- title="MinerU PDF → Markdown/JSON (Fully Fixed)",
80
- description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly."
 
 
 
 
 
 
 
 
 
 
 
 
81
  )
82
 
83
  if __name__ == "__main__":
84
- # Ensure HF_HUB_CACHE aligns with our models folder
85
- os.environ.setdefault("HF_HUB_CACHE", LOCAL_MODELS)
86
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ # app.py
2
+ import fitz # PyMuPDF
3
+ from markdownify import markdownify as md
4
  import json
 
5
  import gradio as gr
6
 
7
def convert_pdf_to_markdown(path):
    """Convert every page of a PDF to Markdown.

    Each page is exported as HTML by PyMuPDF and that HTML is then
    converted with markdownify.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order:
        ``{"page": <1-based page number>, "markdown": <page text>}``.
    """
    import re  # local import: only this function needs it

    pages_md = []
    # The context manager closes the document (and its file handle) even
    # when a page fails to convert.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc, start=1):
            html = page.get_text("html") or ""
            # Clean conversion: collapse runs of blank lines into a single
            # blank line, then trim the page's leading/trailing whitespace.
            page_md = re.sub(r"\n{3,}", "\n\n", md(html)).strip()
            pages_md.append({"page": i, "markdown": page_md})
    return pages_md
17
+
18
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and render it in the requested format.

    Args:
        pdf_file: The upload handed over by Gradio. Either a tempfile-like
            object whose ``.name`` attribute is the path on disk, or the
            path string itself (which of the two arrives depends on the
            Gradio version / ``gr.File`` ``type`` setting).
        output_format: ``"markdown"`` returns all pages joined by a
            horizontal rule; any other value returns JSON.

    Returns:
        A string: either the combined Markdown, or a pretty-printed JSON
        document of the form ``{"pages": [{"page": ..., "markdown": ...}]}``.

    Raises:
        ValueError: If no file was uploaded (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        # Fail with a clear message instead of an AttributeError on `.name`.
        raise ValueError("No PDF file was uploaded.")

    # Accept both a file wrapper (use its .name) and a plain path string.
    path = getattr(pdf_file, "name", pdf_file)
    pages = convert_pdf_to_markdown(path)

    if output_format == "markdown":
        # Join all pages, separated by a Markdown horizontal rule.
        return "\n\n---\n\n".join(p["markdown"] for p in pages)

    # Anything else: pretty JSON, keeping non-ASCII characters readable.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
33
+
34
+ # Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
def _build_interface():
    """Assemble the Gradio UI that wires the upload form to process_upload."""
    # Input widgets: a PDF-only file picker and the output format selector.
    upload_box = gr.File(label="Upload your PDF", file_types=[".pdf"])
    format_picker = gr.Radio(
        choices=["markdown", "json"],
        value="markdown",
        label="Output format",
    )
    blurb = (
        "Upload a PDF and get back a professionally converted Markdown "
        "or a structured JSON with each page’s Markdown. "
        "PDFs with images or complex tables may still need manual review."
    )
    return gr.Interface(
        fn=process_upload,
        inputs=[upload_box, format_picker],
        outputs=gr.Code(label="Converted Output"),
        title="PDF → Markdown/JSON Converter",
        description=blurb,
        # No bundled example PDFs yet; add paths here if desired.
        examples=[],
    )


demo = _build_interface()

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)