File size: 2,450 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import base64
import json
import os
from typing import List
from marker.pdf.images import render_image
from marker.schema.page import Page
from marker.settings import settings
from PIL import Image
import io
def dump_equation_debug_data(doc, images, converted_spans):
    """Dump equation images and their converted text to a JSON debug file.

    Nothing is written when ``settings.DEBUG_DATA_FOLDER`` is unset, when
    ``settings.DEBUG_LEVEL`` is 0, or when ``images`` is empty.

    Each image whose span is not ``None`` is encoded as lossless WEBP,
    base64-encoded, and stored with the span's ``text`` and ``bbox``. The
    resulting list is written to ``<doc base name>_equations.json`` inside
    ``settings.DEBUG_DATA_FOLDER``, which is created if it does not exist.

    Args:
        doc: Document object; only ``doc.name`` is read, to derive the
            output file name.
        images: Sized sequence of image objects exposing a PIL-style
            ``save(fp, format=...)`` method (presumably ``PIL.Image``
            instances -- confirm against the caller).
        converted_spans: One entry per image, in the same order. ``None``
            entries are skipped; other entries must expose ``text`` and
            ``bbox`` attributes.

    Raises:
        AssertionError: If ``converted_spans`` and ``images`` differ in
            length.
    """
    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
        return

    if len(images) == 0:
        return

    # We attempted one conversion per image
    assert len(converted_spans) == len(images)

    data_lines = []
    for pil_image, converted_span in zip(images, converted_spans):
        if converted_span is None:
            continue

        # Serialize the image into an in-memory buffer so that it can be
        # embedded in the JSON output as base64 text.
        img_bytes = io.BytesIO()
        pil_image.save(img_bytes, format="WEBP", lossless=True)
        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
        data_lines.append({
            "image": b64_image,
            "text": converted_span.text,
            "bbox": converted_span.bbox,
        })

    # Remove extension from doc name
    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]

    # A missing debug folder should not abort the run with
    # FileNotFoundError, so make sure it exists before writing.
    os.makedirs(settings.DEBUG_DATA_FOLDER, exist_ok=True)
    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
    with open(debug_file, "w", encoding="utf-8") as f:
        json.dump(data_lines, f)
def dump_bbox_debug_data(doc, blocks: List[Page]):
    """Dump per-page block data together with a rendered page image to JSON.

    Nothing is written when ``settings.DEBUG_DATA_FOLDER`` is unset or when
    ``settings.DEBUG_LEVEL`` is below 2.

    For every entry of ``blocks`` the page ``doc[idx]`` at the same index is
    rendered with ``render_image`` at ``settings.TEXIFY_DPI``. The image is
    downscaled, keeping its aspect ratio, so that neither side exceeds
    6000 px. It is then encoded as lossless WEBP, base64-encoded, and stored
    under the ``"image"`` key of ``page_blocks.model_dump()``. The list of
    page dicts is written to ``<doc base name>_bbox.json`` inside
    ``settings.DEBUG_DATA_FOLDER``, which is created if it does not exist.

    Args:
        doc: Document object; ``doc.name`` gives the output file name and
            ``doc[idx]`` must return the page to render for ``blocks[idx]``.
        blocks: One ``Page`` per document page, in page order.
    """
    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
        return

    # Remove extension from doc name
    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]

    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")

    # Upper bound, in pixels, for either side of an embedded page image.
    max_dimension = 6000

    debug_data = []
    for idx, page_blocks in enumerate(blocks):
        page = doc[idx]

        png_image = render_image(page, dpi=settings.TEXIFY_DPI)
        width, height = png_image.size
        if width > max_dimension or height > max_dimension:
            scaling_factor = min(max_dimension / width, max_dimension / height)
            # Clamp to 1 px so an extreme aspect ratio cannot truncate a side
            # to 0, which resize() rejects.
            new_size = (
                max(1, int(width * scaling_factor)),
                max(1, int(height * scaling_factor)),
            )
            # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the
            # same filter and exists in both old and new releases.
            png_image = png_image.resize(new_size, Image.LANCZOS)

        img_bytes = io.BytesIO()
        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")

        page_data = page_blocks.model_dump()
        page_data["image"] = b64_image

        debug_data.append(page_data)

    # A missing debug folder should not abort the run with
    # FileNotFoundError, so make sure it exists before writing.
    os.makedirs(settings.DEBUG_DATA_FOLDER, exist_ok=True)
    with open(debug_file, "w", encoding="utf-8") as f:
        json.dump(debug_data, f)
|