Spaces:
Running
on
T4
Running
on
T4
from pathlib import Path | |
import cv2 | |
from img2table.document import PDF | |
from img2table.ocr import SuryaOCR | |
from .settings import ENABLE_DEBUG_MODE | |
# One OCR engine is built at import time and shared by every conversion call.
ocr = SuryaOCR(langs=["en"])

# Root directory for the annotated debug images; created at import time.
IMG2TABLE_DEBUG_PATH = Path("/tmp/img2table")
IMG2TABLE_DEBUG_PATH.mkdir(exist_ok=True)
def _save_debug_images(doc, pages, file_name: str) -> list:
    """Outline every detected cell on its page image and save the pages as PNGs.

    Args:
        doc: Document whose ``images`` and ``pages`` attributes are read.
        pages: Mapping from page key to the tables found on that page, as
            returned by ``doc.extract_tables``.
        file_name: Sub-directory of ``IMG2TABLE_DEBUG_PATH`` that receives
            the images.

    Returns:
        Paths of the PNG files that were actually written, in page order.
    """
    debug_path = IMG2TABLE_DEBUG_PATH / file_name
    # parents=True: file_name may contain sub-directories, and the base
    # directory may have been removed since it was created at import time.
    debug_path.mkdir(parents=True, exist_ok=True)

    images = doc.images
    written_paths = []
    # Iterate doc.pages when it is set; otherwise fall back to one entry per
    # image, keyed by its position.
    for idx, page_number in enumerate(doc.pages or range(len(images))):
        page_image = images[idx]
        for table in pages[page_number]:
            for row in table.content.values():
                for cell in row:
                    # Drawn in place on the array held in doc.images.
                    # (0, 0, 255) is red in the BGR order cv2.imwrite expects
                    # for 3-channel images; line thickness is 2 px.
                    cv2.rectangle(
                        page_image,
                        (cell.bbox.x1, cell.bbox.y1),
                        (cell.bbox.x2, cell.bbox.y2),
                        (0, 0, 255),
                        2,
                    )

        image_path = debug_path / f"page_{idx}.png"
        # cv2.imwrite reports failure through its return value instead of
        # raising, so only hand back paths whose file really exists.
        if cv2.imwrite(str(image_path), page_image):
            written_paths.append(image_path)
    return written_paths


def convert_img2table(path: str, file_name: str):
    """Extract the tables of a PDF and return them as HTML text.

    Args:
        path: Location of the PDF, passed straight to ``PDF``.
        file_name: Name of the debug sub-directory used when
            ``ENABLE_DEBUG_MODE`` is on.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` holds, for each
        table, its title (empty when there is none) followed by its HTML,
        with entries separated by blank lines. ``debug_image_paths`` lists
        the annotated page images that were written; it is empty when debug
        mode is off.
    """
    doc = PDF(path)
    pages = doc.extract_tables(
        ocr=ocr,
        implicit_rows=False,
        implicit_columns=False,
        borderless_tables=True,
        min_confidence=50,
    )

    debug_image_paths = (
        _save_debug_images(doc, pages, file_name) if ENABLE_DEBUG_MODE else []
    )

    content = "\n\n".join(
        (table.title or "") + "\n\n" + table.html
        for tables in pages.values()
        for table in tables
    )
    return content, debug_image_paths