Spaces:
Running
on
T4
Running
on
T4
import logging | |
from pathlib import Path | |
import sycamore | |
from sycamore import ExecMode | |
from sycamore.data import Document | |
from sycamore.data.document import DocumentPropertyTypes | |
from sycamore.functions.document import DrawBoxes, split_and_convert_to_image | |
from sycamore.transforms.partition import ArynPartitioner | |
from sycamore.utils.markdown import elements_to_markdown | |
from .settings import ENABLE_DEBUG_MODE | |
# Set the root logger to INFO as soon as this module is imported.
logging.getLogger().setLevel(logging.INFO)

# Base directory for debug page renders; created at import time.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
SYCAMORE_DEBUG_PATH.mkdir(exist_ok=True)

# Keyword arguments for the shared partitioner: partitioning service off,
# table-structure extraction, OCR and image extraction on, device "cpu".
_PARTITIONER_OPTIONS = {
    "use_partitioning_service": False,
    "extract_table_structure": True,
    "use_ocr": True,
    "extract_images": True,
    "device": "cpu",
}

# NOTE: the name is misspelled (sic) but convert_sycamore refers to it by
# this exact spelling, so it is kept as-is.
paritioner = ArynPartitioner(**_PARTITIONER_OPTIONS)

# Single module-wide Sycamore context, initialised in local execution mode.
context = sycamore.init(exec_mode=ExecMode.LOCAL)
def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name for a page-level document.

    The page number is read from ``doc.properties`` under
    ``DocumentPropertyTypes.PAGE_NUMBER`` and formatted as ``page_<n>.png``.
    A missing page-number property raises ``KeyError``.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"
def _page_sort_key(image_path: Path):
    """Sort ``page_<n>.png`` files by numeric page; other names go last, by name."""
    page_token = image_path.stem.rpartition("_")[2]
    if page_token.isdecimal():
        return (0, int(page_token), image_path.name)
    return (1, 0, image_path.name)


def convert_sycamore(path: str, file_name: str):
    """Partition a PDF with Sycamore and convert it to Markdown.

    Args:
        path: Location of the PDF, passed to ``context.read.binary``.
        file_name: Name of the sub-directory under ``SYCAMORE_DEBUG_PATH``
            that receives the debug page images for this file.

    Returns:
        A ``(md, image_paths)`` tuple. ``md`` is the result of
        ``elements_to_markdown`` for the first partitioned document.
        ``image_paths`` lists the debug page images as strings in page
        order; it is empty unless ``ENABLE_DEBUG_MODE`` is truthy.

    Raises:
        IndexError: If the docset yields no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    # Only the first document of the docset is converted.
    # NOTE(review): assumes `path` points at a single PDF - confirm with callers.
    doc = docset.take_all()[0]
    md = elements_to_markdown(doc.elements)

    if not ENABLE_DEBUG_MODE:
        # No debug artefacts requested: leave the file system untouched.
        return md, []

    debug_path = SYCAMORE_DEBUG_PATH / file_name
    # parents=True so a base directory removed after import is recreated.
    debug_path.mkdir(parents=True, exist_ok=True)

    # Drop page renders left by an earlier run with the same file_name;
    # otherwise the glob below would report pages that are not part of this
    # document (e.g. when the previous PDF had more pages).
    for stale_image in debug_path.glob("page_*.png"):
        if stale_image.is_file():
            stale_image.unlink()

    # NOTE(review): the docset was already consumed by take_all() above, so
    # this presumably runs the partitioning pipeline a second time - confirm
    # and consider materializing the docset if the cost matters.
    docset.flat_map(split_and_convert_to_image).map_batch(
        DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
    ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

    # glob() yields entries in arbitrary order; sort numerically so that
    # page_10 follows page_9 rather than page_1.
    image_paths = [
        str(image_file)
        for image_file in sorted(debug_path.glob("*.png"), key=_page_sort_key)
    ]
    return md, image_paths