Spaces:
Running
on
T4
Running
on
T4
from pathlib import Path | |
from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector | |
from gmft.pdf_bindings import PyPDFium2Document | |
from .settings import ENABLE_DEBUG_MODE | |
# Module-level detector instance; ingest_pdf() runs it on every page.
detector = TableDetector()

# Formatter configuration: start from the defaults and switch on two options.
config = AutoFormatConfig()
config.enable_multi_header = True  # multi-headers
config.semantic_spanning_cells = True  # [Experimental] better spanning cells
formatter = AutoTableFormatter(config)

# Root directory for debug images; created at import time if it is missing.
GMFT_DEBUG_PATH = Path("/tmp/gmft")
GMFT_DEBUG_PATH.mkdir(exist_ok=True)
def ingest_pdf(pdf_path) -> list[CroppedTable]:
    """Detect tables on every page of the PDF at ``pdf_path``.

    The file is opened as a ``PyPDFium2Document`` and the module-level
    ``detector`` is run on each page in iteration order.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PyPDFium2Document``.

    Returns:
        One flat list with the detections of all pages, in page order.
    """
    # NOTE(review): the document is never closed in this function.
    # Presumably the returned tables still need it open for later
    # formatting -- confirm against the gmft documentation before adding
    # a close() here.
    document = PyPDFium2Document(pdf_path)

    detected = []
    for pdf_page in document:
        detected.extend(detector.extract(pdf_page))
    return detected
def convert_gmft(path: str, file_name: str) -> tuple[str, list[Path]]:
    """Extract the tables of a PDF and render them as HTML.

    Args:
        path: Location of the PDF, forwarded to ``ingest_pdf``.
        file_name: Name of the sub-directory of ``GMFT_DEBUG_PATH`` that
            receives this document's debug images.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` holds the HTML
        of every formatted table, separated by blank lines (an empty string
        when no table yields a DataFrame). ``debug_image_paths`` lists the
        PNG files written; it stays empty unless ``ENABLE_DEBUG_MODE`` is
        truthy.
    """
    tables = ingest_pdf(path)
    formatted_tables = []
    debug_image_paths = []

    debug_path = GMFT_DEBUG_PATH / file_name
    # parents=True keeps this working when the root debug directory has been
    # removed since import, or when file_name contains a sub-directory.
    debug_path.mkdir(parents=True, exist_ok=True)

    for idx, table in enumerate(tables):
        ft = formatter.extract(table, dpi=72 * 2)  # 144 dpi
        df = ft.df()
        if df is None:
            # No DataFrame for this detection: nothing to render.
            continue

        # Missing cells are rendered as empty strings; the index is omitted.
        formatted_tables.append(df.fillna("").to_html(index=False))

        # NOTE(review): the source's indentation was lost; the debug image is
        # assumed to be saved only for tables that produced a DataFrame --
        # confirm against the original file.
        if ENABLE_DEBUG_MODE:
            image_path = debug_path / f"table_{idx}.png"
            ft.image().save(image_path)
            debug_image_paths.append(image_path)

    return "\n\n".join(formatted_tables), debug_image_paths