# Spaces: Running on T4
import functools
import re
from pathlib import Path

from matplotlib import font_manager
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pdf_image.analysis import bbox_visualisation

from .settings import ENABLE_DEBUG_MODE
# Root directory handed to partition_pdf for its analysis output; the debug
# images are later looked up beneath it.
UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
def convert_elements_to_markdown(elements):
    """Render partitioned document elements as a single Markdown string.

    Every element contributes exactly one entry, chosen by its ``category``:

    * ``Title``: a level-1 heading surrounded by blank lines.
    * ``ListItem``: a ``-`` bullet.
    * ``Table``: the HTML from ``metadata.text_as_html``, falling back to
      the plain element text when no HTML is present.
    * ``UncategorizedText``: an empty entry (the text is dropped).
    * ``Image``: an inline ``data:`` URI image built from
      ``metadata.image_base64``; an empty entry when there is no payload.
    * anything else: the element text unchanged.

    Args:
        elements: Iterable of objects exposing ``category``, ``text`` and
            ``metadata`` attributes.

    Returns:
        The entries joined with single newlines.
    """
    lines = []
    for e in elements:
        category = e.category
        if category == "Title":
            line = f"\n# {e.text}\n"
        elif category == "ListItem":
            line = f"- {e.text}"
        elif category == "Table":
            # Without this fallback a missing HTML rendering would show up
            # in the output as the literal text "None".
            html = getattr(e.metadata, "text_as_html", None)
            line = f"\n{html if html else e.text}\n"
        elif category == "UncategorizedText":
            line = ""
        elif category == "Image":
            # base64 image, embedded as a data URI
            payload = getattr(e.metadata, "image_base64", None)
            if payload:
                mime = getattr(e.metadata, "image_mime_type", None) or "image/jpeg"
                # Collapse whitespace so multi-line text cannot split the
                # image markup across lines.
                alt = " ".join((e.text or "").split())
                line = f"![{alt}](data:{mime};base64,{payload})"
            else:
                line = ""
        else:
            line = e.text
        lines.append(line)
    return "\n".join(lines)
@functools.lru_cache(maxsize=1)
def get_font():
    """Return the path of a system font file, favouring well-known fonts.

    The candidates come from ``font_manager.findSystemFonts()``. The first
    candidate whose path contains ``"Arial.ttf"`` is returned; failing that,
    the first whose path contains ``"DejaVuSans.ttf"``; failing that, the
    first candidate in the list.

    The result is memoised so the system font lookup runs only once per
    process. A raised exception is not cached, so a later call retries.

    Returns:
        The selected entry from the list of system fonts.

    Raises:
        ValueError: If no system fonts are found.
    """
    available_fonts = font_manager.findSystemFonts()
    if not available_fonts:
        raise ValueError("No fonts available")

    # Preference order matters: every candidate is checked against the first
    # name before the second name is considered.
    for font in ("Arial.ttf", "DejaVuSans.ttf"):
        for available_font in available_fonts:
            if font in available_font:
                return available_font

    return available_fonts[0]
# Monkey patch: replace the get_font attribute of unstructured's
# bbox_visualisation module with the get_font defined in this module.
bbox_visualisation.get_font = get_font
def convert_unstructured(path: str, file_name: str):
    """Partition a PDF with unstructured and render the result as Markdown.

    Args:
        path: Location of the PDF, passed to ``partition_pdf`` as
            ``filename``.
        file_name: Name of the directory under
            ``UNSTRUCTURED_DEBUG_PATH / "analysis"`` whose ``bboxes``
            sub-directory is searched for debug images.

    Returns:
        A ``(text, debug_image_paths)`` tuple: the Markdown produced by
        ``convert_elements_to_markdown`` and a list of the paths in the
        debug image directory whose stem contains ``"od_model"``. The paths
        are in natural (digit-aware) name order; the list is empty when the
        directory does not exist.
    """
    elements = partition_pdf(
        filename=path,
        # mandatory to use ``hi_res`` strategy
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        analysis=ENABLE_DEBUG_MODE,
        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
    )
    text = convert_elements_to_markdown(elements)

    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
    if not debug_image_dir.is_dir():
        return text, []

    def natural_order(image_path):
        # re.split with a capturing group alternates text and digit runs, so
        # odd positions are always digits. Comparing those as integers makes
        # a name containing 10 sort after one containing 2.
        tokens = re.split(r"(\d+)", image_path.name)
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    # iterdir() yields entries in arbitrary order; sort so callers always
    # receive the images in a stable sequence.
    debug_image_paths = sorted(
        (entry for entry in debug_image_dir.iterdir() if "od_model" in entry.stem),
        key=natural_order,
    )
    return text, debug_image_paths