#!/usr/bin/env python3
"""Helpers that run MinerU (magic_pdf) on a document and return Markdown.

The high-level entry point is :func:`to_markdown`, which converts the input
to PDF if necessary, parses it, and returns the generated Markdown with local
image references replaced by base64 data URIs.
"""
import base64
import json
import mimetypes
import os
import re
import time
import uuid
from pathlib import Path

import pymupdf
from loguru import logger
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.tools.common import do_parse, prepare_env

# Matches Markdown image syntax ``![alt](target)`` and captures ``target``.
_MD_IMAGE_PATTERN = re.compile(r'\!\[(?:[^\]]*)\]\(([^)]+)\)')

# Targets with these prefixes are not files under the image directory.
_NON_LOCAL_PREFIXES = ("http://", "https://", "data:")

# MIME type used when the image type cannot be guessed from its extension.
_DEFAULT_IMAGE_MIME = "image/jpeg"


def read_fn(path):
    """Return the contents of ``path`` as read through a FileBasedDataReader.

    The reader is rooted at the file's directory and asked for the file's
    base name.
    """
    disk_rw = FileBasedDataReader(os.path.dirname(path))
    return disk_rw.read(os.path.basename(path))


def parse_pdf(
    doc_path,
    output_dir,
    end_page_id,
    is_ocr,
    layout_mode,
    formula_enable,
    table_enable,
    language,
):
    """
    Core function that calls MinerU to parse a single PDF into Markdown + images.

    Args:
        doc_path: Path of the PDF to parse.
        output_dir: Directory under which results are written; created if
            it does not exist.
        end_page_id: Index of the last page to parse (zero-based).
        is_ocr: When true the "ocr" parse method is used, otherwise "auto".
        layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
        formula_enable: Forwarded to ``do_parse``.
        table_enable: Forwarded to ``do_parse``.
        language: Forwarded to ``do_parse`` as ``lang``.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the Markdown output directory
        reported by ``prepare_env`` and the generated base file name
        (``<input stem>_<unix timestamp>``).

    Raises:
        Exception: Any error raised while reading or parsing is logged and
            re-raised unchanged.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # The timestamp suffix keeps repeated runs on the same input from
        # sharing an output name (second resolution only).
        file_name = f"{Path(doc_path).stem}_{int(time.time())}"
        pdf_data = read_fn(doc_path)
        parse_method = "ocr" if is_ocr else "auto"
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        # NOTE(review): the positional ``[]`` and ``False`` are presumably the
        # model list and a debug flag -- confirm against do_parse's signature.
        do_parse(
            output_dir,
            file_name,
            pdf_data,
            [],
            parse_method,
            False,
            end_page_id=end_page_id,  # zero-based indexing
            layout_model=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            lang=language,
            f_dump_orig_pdf=False,
        )
        return local_md_dir, file_name
    except Exception as e:
        logger.exception(e)
        raise


def image_to_base64(image_path):
    """Return the bytes of ``image_path`` base64-encoded as a UTF-8 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def replace_image_with_base64(markdown_text, image_dir_path):
    """
    Replaces local image references in the Markdown with base64-embedded images

    Each ``![alt](target)`` whose target is a readable file under
    ``image_dir_path`` is rewritten to ``![target](data:<mime>;base64,...)``.
    The MIME type is guessed from the file extension and falls back to
    ``image/jpeg`` when it cannot be determined.

    References that are remote (``http://``, ``https://``) or already
    embedded (``data:``) are left untouched, as are references whose file
    cannot be read (a warning is logged for those).

    Args:
        markdown_text: Markdown source to process.
        image_dir_path: Directory that relative image targets are resolved
            against.

    Returns:
        The Markdown text with embeddable image references replaced.
    """

    def replace(match):
        relative_path = match.group(1)

        # Remote or already-inlined images have no local file to read.
        if relative_path.lower().startswith(_NON_LOCAL_PREFIXES):
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)
        try:
            base64_image = image_to_base64(full_path)
        except OSError as exc:
            # One unreadable image should not discard the whole document.
            logger.warning("Could not embed image {}: {}", full_path, exc)
            return match.group(0)

        mime_type, _ = mimetypes.guess_type(full_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = _DEFAULT_IMAGE_MIME

        # The original alt text is replaced by the relative path, as before.
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return _MD_IMAGE_PATTERN.sub(replace, markdown_text)


def to_pdf(file_path):
    """
    If input is not PDF, convert it to PDF using PyMuPDF

    Returns:
        ``file_path`` unchanged when PyMuPDF reports the document is already
        a PDF; otherwise the path of a newly written, uuid-named ``.pdf``
        file placed in the same directory as the input.
    """
    with pymupdf.open(file_path) as doc:
        if doc.is_pdf:
            return file_path
        pdf_bytes = doc.convert_to_pdf()

    unique_filename = f"{uuid.uuid4()}.pdf"
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
    with open(tmp_file_path, "wb") as tmp_pdf_file:
        tmp_pdf_file.write(pdf_bytes)
    return tmp_file_path


def to_markdown(
    file_path,
    end_pages=None,
    is_ocr=False,
    layout_mode="doclayout_yolo",
    formula_enable=True,
    table_enable=True,
    language="en",
    output_dir="./output",
):
    """
    High-level entry point to parse one PDF -> Markdown (plus images).

    Args:
        file_path: Path of the document; non-PDF inputs are first converted
            to a temporary PDF via :func:`to_pdf`.
        end_pages: Number of pages to parse from the start. ``None`` means
            the document's full page count.
        is_ocr: Use the "ocr" parse method instead of "auto".
        layout_mode: Layout model name forwarded to the parser.
        formula_enable: Forwarded to the parser.
        table_enable: Forwarded to the parser.
        language: Language code forwarded to the parser.
        output_dir: Directory under which parser output is written.

    Returns:
        The generated Markdown content as a string, with local images
        embedded as base64 data URIs.
    """
    # Convert to PDF if needed
    pdf_path = to_pdf(file_path)
    # to_pdf returns its argument unchanged for inputs that are already PDF,
    # so a different path means a temporary file was created.
    is_temporary_pdf = pdf_path != file_path

    try:
        # If no end_page, read total from PyMuPDF
        if end_pages is None:
            with pymupdf.open(pdf_path) as doc:
                end_pages = len(doc)

        local_md_dir, file_name = parse_pdf(
            doc_path=pdf_path,
            output_dir=output_dir,
            end_page_id=end_pages - 1,
            is_ocr=is_ocr,
            layout_mode=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            language=language,
        )
    finally:
        # The converted PDF is only an intermediate; do not leave it next to
        # the caller's input file.
        if is_temporary_pdf:
            try:
                os.remove(pdf_path)
            except OSError as exc:
                logger.warning("Could not remove temporary PDF {}: {}", pdf_path, exc)

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, "r", encoding="utf-8") as f:
        original_md_content = f.read()

    md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
    return md_content_with_embeds