|
|
|
import os
|
|
import time
|
|
import base64
|
|
import json
|
|
import re
|
|
import uuid
|
|
from pathlib import Path
|
|
from loguru import logger
|
|
|
|
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
|
from magic_pdf.tools.common import do_parse, prepare_env
|
|
import pymupdf
|
|
|
|
|
|
def read_fn(path):
    """Read the file at ``path`` through a ``FileBasedDataReader``.

    The reader is constructed on the directory part of ``path`` and is then
    asked to read the file-name part.

    Args:
        path: Location of the file to read.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for that file
        (presumably the raw file content -- confirm against magic_pdf).
    """
    parent_dir, leaf_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(leaf_name)
|
|
|
|
|
|
def parse_pdf(
    doc_path,
    output_dir,
    end_page_id,
    is_ocr,
    layout_mode,
    formula_enable,
    table_enable,
    language,
):
    """Run MinerU (``magic_pdf``) on one PDF, producing Markdown + images.

    Args:
        doc_path: Path of the PDF to parse.
        output_dir: Directory for the results; created if it is missing.
        end_page_id: Forwarded to ``do_parse`` as ``end_page_id``.
        is_ocr: Truthy selects the ``"ocr"`` parse method, otherwise
            ``"auto"`` is used.
        layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
        formula_enable: Forwarded to ``do_parse`` as ``formula_enable``.
        table_enable: Forwarded to ``do_parse`` as ``table_enable``.
        language: Forwarded to ``do_parse`` as ``lang``.

    Returns:
        A ``(local_md_dir, file_name)`` tuple: the Markdown directory reported
        by ``prepare_env`` and the timestamped name used for this run.

    Raises:
        Exception: Any failure while reading or parsing is logged through
            ``logger.exception`` and then re-raised unchanged.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # The timestamp suffix keeps repeated runs on the same document from
        # sharing an output name.
        stem = Path(doc_path).stem
        run_name = f"{stem}_{int(time.time())}"
        pdf_bytes = read_fn(doc_path)

        method = "auto"
        if is_ocr:
            method = "ocr"

        # Only the Markdown directory is needed by the caller.
        _image_dir, md_dir = prepare_env(output_dir, run_name, method)

        parse_options = {
            "end_page_id": end_page_id,
            "layout_model": layout_mode,
            "formula_enable": formula_enable,
            "table_enable": table_enable,
            "lang": language,
            "f_dump_orig_pdf": False,
        }
        do_parse(
            output_dir,
            run_name,
            pdf_bytes,
            [],
            method,
            False,
            **parse_options,
        )
    except Exception as e:
        logger.exception(e)
        raise

    return md_dir, run_name
|
|
|
|
|
|
def image_to_base64(image_path):
    """Return the content of ``image_path`` as a base64-encoded string.

    Args:
        image_path: Path of the file to encode; it is read in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")
|
|
|
|
|
|
def replace_image_with_base64(markdown_text, image_dir_path):
    """Replace local image references in Markdown with base64 data URIs.

    Every ``![alt](target)`` reference whose target is a local path is
    rewritten to ``![alt](data:<mime>;base64,<payload>)``, where the payload
    is the content of ``target`` resolved against ``image_dir_path``.
    References that already point at ``http://``, ``https://`` or ``data:``
    targets are returned unchanged, since they are not files on disk.

    Args:
        markdown_text: Markdown source to rewrite.
        image_dir_path: Directory that relative image targets are joined to.

    Returns:
        The Markdown text with local image references embedded inline.

    Raises:
        OSError: If a referenced local image cannot be read.
    """
    # Function-scope import keeps this block self-contained.
    import mimetypes

    # Group 1 is the alt text, group 2 the image target.
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    non_local_prefixes = ("http://", "https://", "data:")

    def replace(match):
        alt_text = match.group(1)
        relative_path = match.group(2)

        # Remote or already-embedded images cannot be opened from disk.
        if relative_path.lower().startswith(non_local_prefixes):
            return match.group(0)

        full_path = os.path.join(image_dir_path, relative_path)
        base64_image = image_to_base64(full_path)

        # Fall back to JPEG when the extension is unknown or not an image.
        mime_type = mimetypes.guess_type(full_path)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        return f"![{alt_text}](data:{mime_type};base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
|
|
|
|
|
|
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    The file is opened with PyMuPDF. If it is already a PDF, the original
    path is returned untouched. Otherwise the document is converted with
    ``convert_to_pdf`` and the result is written next to the input under a
    random ``<uuid4>.pdf`` name.

    Args:
        file_path: Path of the input document.

    Returns:
        ``file_path`` itself for PDF input, or the path of the newly written
        PDF for any other input.
    """
    with pymupdf.open(file_path) as source_doc:
        # Nothing to convert when the input is already a PDF.
        if source_doc.is_pdf:
            return file_path

        converted = source_doc.convert_to_pdf()
        target_name = f"{uuid.uuid4()}.pdf"
        target_path = os.path.join(os.path.dirname(file_path), target_name)
        Path(target_path).write_bytes(converted)
        return target_path
|
|
|
|
|
|
def to_markdown(
    file_path,
    end_pages=None,
    is_ocr=False,
    layout_mode="doclayout_yolo",
    formula_enable=True,
    table_enable=True,
    language="en",
    output_dir="./output",
):
    """Convert one document to Markdown text.

    The input is first passed through ``to_pdf``, then parsed with
    ``parse_pdf``. The generated ``.md`` file is read back and run through
    ``replace_image_with_base64`` before being returned.

    Args:
        file_path: Path of the document to convert.
        end_pages: Number of pages to process; ``None`` means all pages of
            the document. ``end_pages - 1`` is passed on as ``end_page_id``.
        is_ocr: Forwarded to ``parse_pdf``.
        layout_mode: Forwarded to ``parse_pdf``.
        formula_enable: Forwarded to ``parse_pdf``.
        table_enable: Forwarded to ``parse_pdf``.
        language: Forwarded to ``parse_pdf``.
        output_dir: Directory that receives the parser output.

    Returns:
        The Markdown content as a string (not a file path), as produced by
        ``replace_image_with_base64``.

    Raises:
        Exception: Anything raised by the conversion, parsing or file reads
            propagates to the caller.
    """
    pdf_path = to_pdf(file_path)

    try:
        # The document only has to be opened when the page count is needed.
        if end_pages is None:
            with pymupdf.open(pdf_path) as doc:
                end_pages = len(doc)

        local_md_dir, file_name = parse_pdf(
            doc_path=pdf_path,
            output_dir=output_dir,
            end_page_id=end_pages - 1,
            is_ocr=is_ocr,
            layout_mode=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            language=language,
        )
    finally:
        # to_pdf writes a temporary PDF for non-PDF input; remove it so
        # repeated conversions do not pile up files beside the source.
        if pdf_path != file_path:
            try:
                os.remove(pdf_path)
            except OSError as err:
                logger.warning(
                    "Could not remove temporary PDF {}: {}", pdf_path, err
                )

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, "r", encoding="utf-8") as f:
        original_md_content = f.read()

    return replace_image_with_base64(original_md_content, local_md_dir)