MinerU

Paused

File size: 3,940 Bytes

#!/usr/bin/env python3
import os
import time
import base64
import json
import re
import uuid
from pathlib import Path
from loguru import logger

from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.tools.common import do_parse, prepare_env
import pymupdf


def read_fn(path):
    """Read the file at *path* through a ``FileBasedDataReader``.

    The reader is created on the file's parent directory and is then asked
    for the file by its base name; whatever ``read`` returns is passed back
    to the caller unchanged.
    """
    parent_dir, base_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(base_name)


def parse_pdf(
    doc_path,
    output_dir,
    end_page_id,
    is_ocr,
    layout_mode,
    formula_enable,
    table_enable,
    language,
):
    """Run MinerU on a single PDF to produce Markdown plus images.

    The output directory is created if needed. The run is named after the
    document's stem with the current Unix timestamp appended.

    Args:
        doc_path: Path of the PDF to parse.
        output_dir: Directory handed to ``prepare_env`` / ``do_parse``.
        end_page_id: Last page to process (zero-based), forwarded to
            ``do_parse``.
        is_ocr: Truthy selects the ``"ocr"`` parse method, otherwise
            ``"auto"`` is used.
        layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
        formula_enable: Forwarded to ``do_parse``.
        table_enable: Forwarded to ``do_parse``.
        language: Forwarded to ``do_parse`` as ``lang``.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the Markdown directory reported
        by ``prepare_env`` and the generated run name.

    Raises:
        Exception: Any error raised while reading or parsing is logged with
            its traceback and then re-raised unchanged.
    """
    os.makedirs(output_dir, exist_ok=True)
    try:
        stem = Path(doc_path).stem
        timestamp = int(time.time())
        file_name = f"{stem}_{timestamp}"
        pdf_bytes = read_fn(doc_path)

        if is_ocr:
            parse_method = "ocr"
        else:
            parse_method = "auto"

        # Only the Markdown directory is needed by callers; the image
        # directory is returned by prepare_env but not used here.
        _image_dir, md_dir = prepare_env(output_dir, file_name, parse_method)

        parse_options = {
            "end_page_id": end_page_id,  # zero-based indexing
            "layout_model": layout_mode,
            "formula_enable": formula_enable,
            "table_enable": table_enable,
            "lang": language,
            "f_dump_orig_pdf": False,
        }
        do_parse(
            output_dir,
            file_name,
            pdf_bytes,
            [],
            parse_method,
            False,
            **parse_options,
        )
    except Exception as err:
        logger.exception(err)
        raise
    return md_dir, file_name


def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def replace_image_with_base64(markdown_text, image_dir_path):
    """Replace local image references in the Markdown with base64-embedded images.

    Every ``![alt](target)`` occurrence whose target is a local path is
    rewritten to ``![target](data:<mime>;base64,<payload>)``. As before, the
    original alt text is replaced by the target path.

    The MIME type is guessed from the file extension and falls back to
    ``image/jpeg`` when the extension is unknown or not an image type.
    Targets that are already ``data:`` URIs or ``http(s)://`` URLs are left
    untouched, since they do not name a file under *image_dir_path*.

    Args:
        markdown_text: Markdown source to process.
        image_dir_path: Directory against which image targets are resolved.

    Returns:
        The Markdown text with local images inlined as data URIs.

    Raises:
        OSError: If a referenced local image file cannot be read.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added.
    import mimetypes

    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
    non_local_prefixes = ("http://", "https://", "data:")

    def replace(match):
        relative_path = match.group(1)
        if relative_path.lower().startswith(non_local_prefixes):
            # Nothing on disk to embed; keep the reference exactly as written.
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)

        mime_type, _encoding = mimetypes.guess_type(full_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        with open(full_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode("utf-8")
        return f"![{relative_path}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)


def to_pdf(file_path):
    """Return a path to a PDF version of *file_path*.

    The file is opened with PyMuPDF. If it is already a PDF, the input path
    is returned as is. Otherwise the document is converted to PDF and written
    next to the input under a random UUID-based ``.pdf`` name, and that new
    path is returned.
    """
    with pymupdf.open(file_path) as document:
        if not document.is_pdf:
            converted_bytes = document.convert_to_pdf()
            target_dir = os.path.dirname(file_path)
            target_path = os.path.join(target_dir, f"{uuid.uuid4()}.pdf")
            with open(target_path, "wb") as out_file:
                out_file.write(converted_bytes)
            return target_path
    return file_path


def to_markdown(
    file_path,
    end_pages=None,
    is_ocr=False,
    layout_mode="doclayout_yolo",
    formula_enable=True,
    table_enable=True,
    language="en",
    output_dir="./output",
):
    """High-level entry point to parse one document into Markdown.

    The input is converted to PDF first when it is not one already, parsed
    via ``parse_pdf``, and the generated ``.md`` file is read back with its
    local image references replaced by base64 data URIs.

    Args:
        file_path: Path of the document to convert.
        end_pages: Number of pages to process, counted from the first page.
            ``None`` means every page of the document.
        is_ocr: Forwarded to ``parse_pdf``.
        layout_mode: Forwarded to ``parse_pdf``.
        formula_enable: Forwarded to ``parse_pdf``.
        table_enable: Forwarded to ``parse_pdf``.
        language: Forwarded to ``parse_pdf``.
        output_dir: Directory in which the parse output is written.

    Returns:
        The Markdown content as a string, with images embedded as base64
        (the text itself, not a path to the ``.md`` file).

    Raises:
        Exception: Errors from conversion, parsing, or reading the generated
            Markdown propagate to the caller.
    """
    # Convert to PDF if needed; to_pdf returns the input path unchanged when
    # it is already a PDF, so a different path means a temp file was written.
    pdf_path = to_pdf(file_path)
    is_temp_pdf = pdf_path != file_path

    try:
        # Only open the document when the page count is actually needed.
        if end_pages is None:
            with pymupdf.open(pdf_path) as doc:
                end_pages = len(doc)

        local_md_dir, file_name = parse_pdf(
            doc_path=pdf_path,
            output_dir=output_dir,
            end_page_id=end_pages - 1,  # parse_pdf expects a zero-based page id
            is_ocr=is_ocr,
            layout_mode=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            language=language,
        )
    finally:
        # The converted PDF has been fully read by parse_pdf (or parsing
        # failed); either way it is no longer needed, so do not leave it
        # behind next to the user's input file.
        if is_temp_pdf:
            try:
                os.remove(pdf_path)
            except OSError as exc:
                logger.warning(
                    "Could not remove temporary PDF {}: {}", pdf_path, exc
                )

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, "r", encoding="utf-8") as f:
        original_md_content = f.read()

    return replace_image_with_base64(original_md_content, local_md_dir)