Spaces:
Running
Running
File size: 3,048 Bytes
dfce863 ecf768f 683fa93 27b468a ecf768f 27b468a ecf768f 27b468a ecf768f 27b468a ecf768f 27b468a dfce863 ecf768f dfce863 ecf768f ec386e0 ecf768f ec386e0 27b468a ec386e0 ecf768f ec386e0 dfce863 27b468a ecf768f ec386e0 ecf768f ec386e0 ecf768f ec386e0 ecf768f 683fa93 ecf768f 683fa93 ec386e0 ecf768f ec386e0 ecf768f 27b468a 683fa93 27b468a ecf768f 683fa93 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import os
import json
from huggingface_hub import snapshot_download
import gradio as gr
# 1) Fetch the formula-detection (MFD/YOLO) weights ahead of time.
#    Only files matching the pattern below are downloaded; the repository's
#    own "models/..." prefix is kept underneath LOCAL_MODELS.
MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
LOCAL_MODELS = "./models"

snapshot_download(
    repo_id=MODEL_REPO,
    allow_patterns="models/MFD/YOLO/*",
    local_dir=LOCAL_MODELS,
    max_workers=4,
)
# 2) Create ~/magic-pdf.json on first run only; an existing file is left as is.
CFG_PATH = os.path.expanduser("~/magic-pdf.json")

if not os.path.exists(CFG_PATH):
    cfg = {
        "device": "cpu",
        # The download keeps the repo's "models/" prefix, so the weights end
        # up one level below LOCAL_MODELS.
        "models-dir": os.path.join(LOCAL_MODELS, "models"),
        "layout-model": "layoutlmv3",
        "formula-enable": True,
        "table-enable": True,
    }
    serialized_cfg = json.dumps(cfg, ensure_ascii=False, indent=2)
    with open(CFG_PATH, "w", encoding="utf-8") as cfg_file:
        cfg_file.write(serialized_cfg)
# 3) MinerU imports
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
def convert_with_mineru(pdf_file, out_fmt):
    """Convert an uploaded PDF to Markdown or JSON text with MinerU (magic-pdf).

    Every dataset returned by ``read_local_pdfs`` is classified, analysed with
    ``doc_analyze`` (OCR enabled only when the dataset is classified as OCR),
    and dumped to ``output/`` as ``<name>.md`` and ``<name>_content_list.json``.
    Both files are read back and collected into the result.

    Args:
        pdf_file: The uploaded file. Either an object exposing its on-disk
            path as ``.name`` or a plain path (``str`` / ``os.PathLike``).
        out_fmt: ``"markdown"`` returns the Markdown of all datasets joined by
            a ``---`` separator; any other value (including ``None``) returns
            a JSON string.

    Returns:
        str: The joined Markdown, or a pretty-printed JSON list of
        ``{"markdown": ..., "content_list": ...}`` entries.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    # Fail early with a readable message instead of an AttributeError on
    # ``None.name`` when the form is submitted without a file.
    if pdf_file is None:
        raise ValueError("No PDF file was provided; please upload a PDF first.")

    # Accept a plain path as well as a file wrapper that carries ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    datasets = read_local_pdfs(pdf_path)

    out_dir = "output"
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)
    md_writer = FileBasedDataWriter(out_dir)
    img_writer = FileBasedDataWriter(img_dir)

    # These depend only on the uploaded file, so compute them once.
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    md_name = f"{base}.md"
    json_name = f"{base}_content_list.json"
    img_dir_name = os.path.basename(img_dir)

    results = []
    for ds in datasets:
        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
        infer = ds.apply(doc_analyze, ocr=use_ocr)
        if use_ocr:
            pipe = infer.pipe_ocr_mode(img_writer)
        else:
            pipe = infer.pipe_txt_mode(img_writer)

        # The dump_* calls write through md_writer into out_dir; read the
        # files back to obtain their content as Python values.
        pipe.dump_md(md_writer, md_name, img_dir_name)
        with open(os.path.join(out_dir, md_name), encoding="utf-8") as f:
            md_text = f.read()

        pipe.dump_content_list(md_writer, json_name, img_dir_name)
        with open(os.path.join(out_dir, json_name), encoding="utf-8") as f:
            content = json.load(f)

        results.append({"markdown": md_text, "content_list": content})

    if out_fmt == "markdown":
        return "\n\n---\n\n".join(r["markdown"] for r in results)
    return json.dumps(results, ensure_ascii=False, indent=2)
# 4) Gradio UI: a PDF upload plus a format selector; the converted text is
#    shown in a code viewer.
demo = gr.Interface(
    title="MinerU PDF → Markdown/JSON (Fixed)",
    description="Downloads YOLO weights and points magic-pdf at the correct folder.",
    fn=convert_with_mineru,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Radio(["markdown", "json"], label="Format"),
    ],
    outputs=gr.Code(label="Result"),
)
if __name__ == "__main__":
    # Point HF_HUB_CACHE at the local models folder unless the environment
    # already defines it.
    # NOTE(review): huggingface_hub was imported and snapshot_download already
    # ran at import time above, so this probably only affects code that reads
    # the variable from here on - confirm it is not needed earlier.
    if "HF_HUB_CACHE" not in os.environ:
        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS

    # Listen on all interfaces on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|