# Spaces: Running on T4
import re
from pathlib import Path

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
# Marker init: the converter is built once at import time and reused by
# every convert_marker() call.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        # convert_marker() reads "debug_data_path" from the render metadata
        # and lists the "pdf_page" files found there; presumably this flag
        # is what makes Marker emit them -- TODO confirm against Marker docs.
        "debug_pdf_images": True,
    },
)
def _natural_key(image_path: Path) -> list:
    """Return a sort key that orders embedded numbers numerically (2 before 10)."""
    # re.split with a capturing group alternates text / digit tokens and
    # always starts with a (possibly empty) text token, so keys built from
    # different names compare str-with-str and int-with-int.
    return [
        int(token) if token.isdigit() else token
        for token in re.split(r"(\d+)", image_path.name)
    ]


def convert_marker(path: str, file_name: str) -> tuple[str, list[Path]]:
    """Convert a document with Marker and collect its page debug images.

    Args:
        path: Path of the input file, passed straight to ``marker_converter``.
        file_name: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A ``(text, debug_image_paths)`` tuple. ``text`` is the first element
        returned by ``text_from_rendered``. ``debug_image_paths`` lists the
        entries of the reported debug directory whose stem contains
        ``"pdf_page"``, in natural (numeric-aware) name order. The list is
        empty when the render metadata reports no debug directory or the
        directory does not exist.
    """
    rendered = marker_converter(path)
    text, _, _ = text_from_rendered(rendered)

    # The metadata key may be absent; Path(None) would raise TypeError.
    debug_data_path = rendered.metadata.get("debug_data_path")
    if not debug_data_path:
        return text, []

    debug_image_dir = Path(debug_data_path)
    if not debug_image_dir.is_dir():
        return text, []

    # iterdir() yields entries in arbitrary order, so sort explicitly.
    debug_image_paths = sorted(
        (
            entry
            for entry in debug_image_dir.iterdir()
            if "pdf_page" in entry.stem
        ),
        key=_natural_key,
    )
    return text, debug_image_paths