|
import base64 |
|
import json |
|
import os |
|
from typing import List |
|
|
|
from marker.pdf.images import render_image |
|
from marker.schema.page import Page |
|
from marker.settings import settings |
|
from PIL import Image |
|
import io |
|
|
|
|
|
def dump_equation_debug_data(doc, images, converted_spans):
    """Dump equation images and their converted spans to a JSON debug file.

    Nothing is written when ``settings.DEBUG_DATA_FOLDER`` is unset, when
    ``settings.DEBUG_LEVEL`` is 0, or when ``images`` is empty.

    Each image is paired positionally with the entry of ``converted_spans``
    at the same index. Pairs whose span is ``None`` are skipped. Every other
    pair becomes a record holding the image (lossless WEBP, base64 encoded)
    together with the span's ``text`` and ``bbox``.

    The records are written to ``<name>_equations.json`` inside
    ``settings.DEBUG_DATA_FOLDER``, where ``<name>`` is the basename of
    ``doc.name`` with everything after its last ``.`` removed.

    Raises:
        AssertionError: if ``images`` and ``converted_spans`` differ in
            length (only while assertions are enabled).
    """
    debug_enabled = settings.DEBUG_DATA_FOLDER and settings.DEBUG_LEVEL != 0
    if not debug_enabled or len(images) == 0:
        return

    assert len(converted_spans) == len(images)

    def encode_as_webp(picture):
        # Serialize in memory so the image can be embedded in the JSON output.
        buffer = io.BytesIO()
        picture.save(buffer, format="WEBP", lossless=True)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    records = [
        {
            "image": encode_as_webp(picture),
            "text": span.text,
            "bbox": span.bbox,
        }
        for picture, span in zip(images, converted_spans)
        if span is not None
    ]

    file_stem = os.path.basename(doc.name).rsplit(".", 1)[0]
    output_path = os.path.join(
        settings.DEBUG_DATA_FOLDER, f"{file_stem}_equations.json"
    )
    with open(output_path, "w+") as out_file:
        json.dump(records, out_file)
|
|
|
|
|
def dump_bbox_debug_data(doc, blocks: List[Page]):
    """Dump per-page block data plus a rendered page image to a JSON debug file.

    Nothing is written unless ``settings.DEBUG_DATA_FOLDER`` is set and
    ``settings.DEBUG_LEVEL`` is at least 2.

    For every entry of ``blocks`` the page at the same index of ``doc`` is
    rendered with ``render_image`` at ``settings.TEXIFY_DPI``. Renders whose
    width or height exceeds 6000 pixels are scaled down, preserving aspect
    ratio, so the longer side fits that limit. The image is then encoded as
    lossless WEBP, base64 encoded, and stored under the ``"image"`` key of
    the dict returned by the entry's ``model_dump()``.

    The resulting list is written to ``<name>_bbox.json`` inside
    ``settings.DEBUG_DATA_FOLDER``, where ``<name>`` is the basename of
    ``doc.name`` with everything after its last ``.`` removed.

    Args:
        doc: Source document; must expose ``name`` and support indexing by
            page number.
        blocks: One entry per page, in page order.
    """
    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
        return

    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")

    # Upper bound, in pixels, for either side of an embedded page image.
    max_dimension = 6000

    debug_data = []
    for idx, page_blocks in enumerate(blocks):
        page = doc[idx]

        # NOTE(review): render_image presumably returns a PIL image -- the
        # code below relies on its .size, .resize and .save members.
        png_image = render_image(page, dpi=settings.TEXIFY_DPI)
        width, height = png_image.size
        if width > max_dimension or height > max_dimension:
            scaling_factor = min(max_dimension / width, max_dimension / height)
            # Clamp to one pixel: truncation can reach 0 on extreme aspect
            # ratios, and resize rejects a zero-sized target.
            new_size = (
                max(1, int(width * scaling_factor)),
                max(1, int(height * scaling_factor)),
            )
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the filter
            # it aliased and exists in both old and new Pillow releases.
            png_image = png_image.resize(new_size, Image.LANCZOS)

        img_bytes = io.BytesIO()
        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")

        page_data = page_blocks.model_dump()
        page_data["image"] = b64_image
        debug_data.append(page_data)

    with open(debug_file, "w") as f:
        json.dump(debug_data, f)
|
|
|
|
|
|
|
|