Spaces:

euler314
/

file_extension_change

Running

App Files Files Community

file_extension_change / app.py

euler314

Update app.py

c842df1 verified about 1 month ago

raw

history blame

3.1 kB

	import os
	import json
	from huggingface_hub import snapshot_download
	import gradio as gr

	# 1) Fetch the MinerU model weights into the local models/ folder up front
	MODEL_REPO = "opendatalab/pdf-extract-kit-1.0"
	LOCAL_MODELS = "./models"

	# A single snapshot call pulls both the MFD/YOLO and the MFR weight folders;
	# nothing else from the repository is requested.
	snapshot_download(
	    MODEL_REPO,
	    allow_patterns=["models/MFD/YOLO/*", "models/MFR/*"],
	    local_dir=LOCAL_MODELS,
	    max_workers=4,
	)

	# 2) Write magic-pdf.json pointing at the nested 'models' directory.
	# An already existing config file is left untouched.
	CFG_PATH = os.path.expanduser("~/magic-pdf.json")
	if not os.path.exists(CFG_PATH):
	    # The config lives in the home directory and outlives this process, so
	    # store an absolute path: a relative "./models/models" would only resolve
	    # when the reader happens to run from this exact working directory.
	    # NOTE(review): key names are kept as in the original file; confirm they
	    # match the config schema of the installed magic-pdf version.
	    cfg = {
	        "device": "cpu",
	        "models-dir": os.path.abspath(os.path.join(LOCAL_MODELS, "models")),
	        "layout-model": "layoutlmv3",
	        "formula-enable": True,
	        "table-enable": True,
	    }
	    with open(CFG_PATH, "w", encoding="utf-8") as f:
	        json.dump(cfg, f, ensure_ascii=False, indent=2)

	# 3) MinerU imports
	from magic_pdf.data.read_api import read_local_pdfs
	from magic_pdf.data.data_reader_writer import FileBasedDataWriter
	from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
	from magic_pdf.config.enums import SupportedPdfParseMethod

	def convert_with_mineru(pdf_file, out_fmt):
	    """Convert an uploaded PDF to Markdown or JSON with MinerU (magic-pdf).

	    Args:
	        pdf_file: The upload handed over by the ``gr.File`` input. Accepted
	            as a filesystem path (``str`` / ``os.PathLike``) or as an object
	            exposing the path through its ``.name`` attribute.
	        out_fmt: ``"markdown"`` returns the Markdown text; any other value
	            (including ``None``) returns the JSON dump.

	    Returns:
	        str: For ``"markdown"``, the Markdown of every parsed dataset joined
	        by a ``---`` separator. Otherwise a JSON array holding one
	        ``{"markdown": ..., "content_list": ...}`` object per dataset.

	    Raises:
	        ValueError: If no file was uploaded (``pdf_file`` is ``None``).
	    """
	    # Fail with a readable message instead of an AttributeError on None.
	    if pdf_file is None:
	        raise ValueError("No PDF uploaded: please choose a PDF file first.")

	    # Depending on the Gradio version/configuration the upload arrives as a
	    # plain path or as a file wrapper carrying the path in `.name`.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = os.fspath(pdf_file)
	    else:
	        pdf_path = pdf_file.name

	    datasets = read_local_pdfs(pdf_path)
	    tmp, img_dir = "output", os.path.join("output", "images")
	    os.makedirs(img_dir, exist_ok=True)
	    md_writer = FileBasedDataWriter(tmp)
	    img_writer = FileBasedDataWriter(img_dir)

	    # These names depend only on the uploaded file, so compute them once
	    # rather than on every loop iteration.
	    base = os.path.splitext(os.path.basename(pdf_path))[0]
	    md_name = f"{base}.md"
	    json_name = f"{base}_content_list.json"
	    img_subdir = os.path.basename(img_dir)

	    results = []
	    for ds in datasets:
	        # The dataset's own classification decides between the OCR and the
	        # text pipeline.
	        use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
	        infer = ds.apply(doc_analyze, ocr=use_ocr)
	        pipe = (
	            infer.pipe_ocr_mode(img_writer)
	            if use_ocr
	            else infer.pipe_txt_mode(img_writer)
	        )

	        # The dump_* calls write into `tmp`; read the files back so the text
	        # can be returned to the UI.
	        pipe.dump_md(md_writer, md_name, img_subdir)
	        with open(os.path.join(tmp, md_name), encoding="utf-8") as f:
	            md_text = f.read()

	        pipe.dump_content_list(md_writer, json_name, img_subdir)
	        with open(os.path.join(tmp, json_name), encoding="utf-8") as f:
	            content = json.load(f)

	        results.append({"markdown": md_text, "content_list": content})

	    if out_fmt == "markdown":
	        return "\n\n---\n\n".join(r["markdown"] for r in results)
	    return json.dumps(results, ensure_ascii=False, indent=2)

	# 4) Gradio UI: a PDF upload plus a format selector, rendered into a code box
	demo = gr.Interface(
	    title="MinerU PDF → Markdown/JSON (Fully Fixed)",
	    description="Pre-downloads all necessary YOLO and MFR weights and configures magic-pdf correctly.",
	    fn=convert_with_mineru,
	    inputs=[
	        gr.File(label="Upload PDF"),
	        gr.Radio(["markdown", "json"], label="Format"),
	    ],
	    outputs=gr.Code(label="Result"),
	)

	if __name__ == "__main__":
	    # Point HF_HUB_CACHE at the models folder unless the environment already
	    # defines it.
	    # NOTE(review): this runs after the module-level snapshot_download call,
	    # so it cannot influence that download - confirm it is still needed.
	    if "HF_HUB_CACHE" not in os.environ:
	        os.environ["HF_HUB_CACHE"] = LOCAL_MODELS
	    demo.launch(server_port=7860, server_name="0.0.0.0")