Spaces:
Running
on
T4
Running
on
T4
import base64 | |
import io | |
import re | |
from pathlib import Path | |
from marker.converters.pdf import PdfConverter | |
from marker.models import create_model_dict | |
from marker.output import text_from_rendered | |
from marker.processors.equation import EquationProcessor | |
from marker.settings import settings | |
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA | |
# Marker init
if not ENABLE_FORMULA:
    # Formula recognition is disabled: remove the equation processor from the
    # converter's default pipeline.
    #
    # The filtered result is materialized as a tuple on purpose. A bare
    # generator expression can be consumed only once, so any second iteration
    # of ``default_processors`` (for example by another converter instance)
    # would silently see an empty pipeline.
    PdfConverter.default_processors = tuple(
        processor
        for processor in PdfConverter.default_processors
        if processor != EquationProcessor
    )

# Module-level converter instance used by convert_marker(); it is built once,
# when this module is imported.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        "debug_pdf_images": ENABLE_DEBUG_MODE,
    },
)
def img_to_html(img, img_alt):
    """Render an image as a self-contained HTML ``<img>`` tag.

    The image is serialized in ``settings.OUTPUT_IMAGE_FORMAT`` and embedded
    as a base64 ``data:`` URI, so the resulting tag needs no external file.

    Args:
        img: Image object exposing ``save(fp, format=...)`` (presumably a
            PIL image -- confirm against the caller).
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        The ``<img>`` tag as a string.
    """
    import html  # local import so the block does not rely on new file-level imports

    image_format = settings.OUTPUT_IMAGE_FORMAT
    buffer = io.BytesIO()
    img.save(buffer, format=image_format)
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # The alt text comes from converted document content, so quotes or angle
    # brackets in it must not be able to terminate the attribute or the tag.
    safe_alt = html.escape(str(img_alt), quote=True)

    return (
        f'<img src="data:image/{image_format.lower()}'
        f';base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
    )
def markdown_insert_images(markdown, images):
    """Replace Markdown image references with inline HTML ``<img>`` tags.

    Every ``![alt](path ...)`` reference found in ``markdown`` whose path is
    a key of ``images`` is substituted by the tag that ``img_to_html`` builds
    for the corresponding image. References with an unknown path are left
    as they are.

    Args:
        markdown: Markdown text to process.
        images: Mapping from image path (as written in the Markdown) to the
            image object handed to ``img_to_html``.

    Returns:
        The Markdown text with the known image references replaced.
    """
    image_pattern = re.compile(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )

    result = markdown
    # Matches are collected from the original text; replacements are applied
    # to the progressively rewritten copy.
    for match in image_pattern.finditer(markdown):
        target = match.group("image_path")
        if target not in images:
            continue
        replacement = img_to_html(images[target], match.group("image_title"))
        result = result.replace(match.group(0), replacement)
    return result
def convert_marker(path: str, file_name: str):
    """Convert a document with Marker and inline its images as HTML.

    Args:
        path: Path of the document handed to the module-level
            ``marker_converter``.
        file_name: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the rendered
        Markdown with known image references replaced by inline ``<img>``
        tags. ``debug_image_paths`` is a list of ``Path`` objects for the
        entries of the debug directory whose stem contains ``"pdf_page"``;
        it is empty when no debug directory is reported or it does not exist.
    """
    rendered = marker_converter(path)
    text, _, images = text_from_rendered(rendered)
    text = markdown_insert_images(text, images)

    # ``.get`` yields None when the key is absent, and ``Path(None)`` raises
    # TypeError, so a missing debug path must be handled before building a
    # Path from it.
    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        return text, []

    debug_image_dir = Path(debug_data_path)
    # ``is_dir`` rather than ``exists``: ``iterdir`` fails on a regular file.
    if not debug_image_dir.is_dir():
        return text, []

    debug_image_paths = [
        entry for entry in debug_image_dir.iterdir() if "pdf_page" in entry.stem
    ]
    return text, debug_image_paths