MinerU

Paused

MinerU / mineru_single.py

Vladyslav Nalyvaiko

Fast API update

ca05b65 4 months ago

3.94 kB

	#!/usr/bin/env python3
	import os
	import time
	import base64
	import json
	import re
	import uuid
	from pathlib import Path
	from loguru import logger

	from magic_pdf.data.data_reader_writer import FileBasedDataReader
	from magic_pdf.tools.common import do_parse, prepare_env
	import pymupdf


	def read_fn(path):
	    """
	    Load the file at `path` through a FileBasedDataReader rooted at its parent
	    directory and return whatever that reader's read() call produces.
	    """
	    # os.path.split yields exactly (dirname, basename) for the given path
	    parent_dir, base_name = os.path.split(path)
	    reader = FileBasedDataReader(parent_dir)
	    return reader.read(base_name)


	def parse_pdf(
	    doc_path,
	    output_dir,
	    end_page_id,
	    is_ocr,
	    layout_mode,
	    formula_enable,
	    table_enable,
	    language,
	):
	    """
	    Run MinerU's do_parse on a single PDF so it is turned into Markdown + images.

	    Returns a tuple of (local_md_dir, file_name): the second value handed back by
	    prepare_env and the generated name used for this run. Any exception raised
	    while reading or parsing is logged and then re-raised unchanged.
	    """
	    os.makedirs(output_dir, exist_ok=True)
	    try:
	        # Name this run after the input's stem plus the current Unix time (seconds)
	        stem = Path(doc_path).stem
	        file_name = f"{stem}_{int(time.time())}"
	        pdf_data = read_fn(doc_path)

	        if is_ocr:
	            parse_method = "ocr"
	        else:
	            parse_method = "auto"

	        env_dirs = prepare_env(output_dir, file_name, parse_method)
	        local_image_dir, local_md_dir = env_dirs

	        parse_options = dict(
	            end_page_id=end_page_id,  # zero-based indexing
	            layout_model=layout_mode,
	            formula_enable=formula_enable,
	            table_enable=table_enable,
	            lang=language,
	            f_dump_orig_pdf=False,
	        )
	        # NOTE(review): the positional [] and False are passed through as-is;
	        # confirm their meaning against do_parse's signature.
	        do_parse(
	            output_dir,
	            file_name,
	            pdf_data,
	            [],
	            parse_method,
	            False,
	            **parse_options,
	        )
	        return local_md_dir, file_name
	    except Exception as e:
	        logger.exception(e)
	        raise


	def image_to_base64(image_path):
	    """
	    Read the file at image_path in binary mode and return its contents encoded
	    as a base64 text string.
	    """
	    with open(image_path, "rb") as handle:
	        raw_bytes = handle.read()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("utf-8")


	def replace_image_with_base64(markdown_text, image_dir_path):
	    """
	    Replaces local image references in the Markdown with base64-embedded images.

	    Every ``![alt](target)`` whose target is a local path is rewritten to
	    ``![target](data:image/jpeg;base64,...)``, where the payload is the content
	    of ``image_dir_path/target``. The original alt text is replaced by the
	    target path and the MIME type is always ``image/jpeg``.

	    Targets that are not local files (``http://``, ``https://`` or an existing
	    ``data:`` URI) are left untouched, so remote images do not cause a failed
	    file read and running the function on its own output is a no-op.

	    Args:
	        markdown_text: Markdown source to process.
	        image_dir_path: Directory that relative image targets are resolved against.

	    Returns:
	        The Markdown text with local image targets replaced by data URIs.

	    Raises:
	        OSError: If a local image target cannot be opened or read.
	    """
	    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
	    # Prefixes of targets that do not point at a file under image_dir_path
	    non_local_prefixes = ("http://", "https://", "data:")

	    def replace(match):
	        relative_path = match.group(1)
	        if relative_path.lower().startswith(non_local_prefixes):
	            # Keep remote / already-embedded images exactly as written
	            return match.group(0)
	        full_path = os.path.join(image_dir_path, relative_path)
	        base64_image = image_to_base64(full_path)
	        return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"

	    return re.sub(pattern, replace, markdown_text)


	def to_pdf(file_path):
	    """
	    Return a path to a PDF version of the input.

	    When PyMuPDF reports the input as a PDF, file_path itself is returned.
	    Otherwise the document is converted with PyMuPDF and written to a new
	    "<uuid4>.pdf" file in the same directory as the input; that new path is
	    returned.
	    """
	    with pymupdf.open(file_path) as source_doc:
	        # Already a PDF: nothing to convert
	        if source_doc.is_pdf:
	            return file_path

	        converted_bytes = source_doc.convert_to_pdf()
	        target_name = f"{uuid.uuid4()}.pdf"
	        target_dir = os.path.dirname(file_path)
	        target_path = os.path.join(target_dir, target_name)
	        with open(target_path, "wb") as out_file:
	            out_file.write(converted_bytes)
	        return target_path


	def to_markdown(
	    file_path,
	    end_pages=None,
	    is_ocr=False,
	    layout_mode="doclayout_yolo",
	    formula_enable=True,
	    table_enable=True,
	    language="en",
	    output_dir="./output",
	):
	    """
	    High-level entry point to parse one document -> Markdown (plus images).

	    Non-PDF inputs are first converted to a temporary PDF by to_pdf; that
	    temporary file is removed again once parsing has finished (or failed).

	    Args:
	        file_path: Path to the input document.
	        end_pages: Number of pages to parse; defaults to the document's page
	            count. It is passed on as the zero-based ``end_pages - 1``.
	        is_ocr: When true the "ocr" parse method is used, otherwise "auto".
	        layout_mode: Forwarded to the parser as its layout model.
	        formula_enable: Forwarded to the parser.
	        table_enable: Forwarded to the parser.
	        language: Forwarded to the parser as its language setting.
	        output_dir: Directory for the parser output; created if missing.

	    Returns:
	        The Markdown text (not a file path) with image references embedded as
	        base64 data URIs.
	    """
	    # Convert to PDF if needed
	    pdf_path = to_pdf(file_path)
	    # to_pdf hands back file_path itself for PDFs, so a different path means
	    # it wrote a temporary converted file that this function must clean up.
	    is_temporary_pdf = pdf_path != file_path

	    try:
	        # If no end_page, read total from PyMuPDF
	        if end_pages is None:
	            with pymupdf.open(pdf_path) as doc:
	                end_pages = len(doc)

	        local_md_dir, file_name = parse_pdf(
	            doc_path=pdf_path,
	            output_dir=output_dir,
	            end_page_id=end_pages - 1,
	            is_ocr=is_ocr,
	            layout_mode=layout_mode,
	            formula_enable=formula_enable,
	            table_enable=table_enable,
	            language=language,
	        )
	    finally:
	        if is_temporary_pdf:
	            # Best-effort cleanup: a failed delete must not mask the real result
	            try:
	                os.remove(pdf_path)
	            except OSError as e:
	                logger.warning("Could not remove temporary PDF {}: {}", pdf_path, e)

	    md_path = os.path.join(local_md_dir, file_name + ".md")
	    with open(md_path, "r", encoding="utf-8") as f:
	        original_md_content = f.read()

	    md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)

	    return md_content_with_embeds