File size: 888 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
from typing import List
from pypdfium2 import PdfDocument
from surya.detection import batch_text_detection
from marker.pdf.images import render_image
from marker.schema.page import Page
from marker.settings import settings
def get_batch_size():
    """Return the base batch size used for text detection.

    An explicit ``settings.DETECTOR_BATCH_SIZE`` always wins. When it is not
    set, the value is picked according to ``settings.TORCH_DEVICE_MODEL``;
    at present the CUDA case and every other case both resolve to 4.
    """
    configured = settings.DETECTOR_BATCH_SIZE
    if configured is not None:
        return configured

    # The device check is kept as its own branch even though both outcomes
    # currently share the same default.
    on_cuda = settings.TORCH_DEVICE_MODEL == "cuda"
    return 4 if on_cuda else 4
def surya_detection(doc: PdfDocument, pages: List[Page], det_model, batch_multiplier=1):
    """Run text detection over the document and store results on the pages.

    Each page of ``doc`` (up to the shorter of ``len(pages)`` and
    ``len(doc)``) is rendered at ``settings.SURYA_DETECTOR_DPI``, the
    rendered images are passed to ``batch_text_detection``, and every
    prediction is assigned to the ``text_lines`` attribute of the page at
    the same position in ``pages``.

    Args:
        doc: The open PDF document to render pages from.
        pages: Page objects that receive the detection results.
        det_model: Detection model; its ``processor`` attribute is passed
            along to ``batch_text_detection``.
        batch_multiplier: Factor applied to ``get_batch_size()`` to obtain
            the batch size handed to ``batch_text_detection``.

    Returns:
        None. ``pages`` is updated in place.
    """
    processor = det_model.processor

    # Only render pages that exist in both the document and the page list.
    page_count = min(len(pages), len(doc))
    rendered = []
    for page_index in range(page_count):
        rendered.append(render_image(doc[page_index], dpi=settings.SURYA_DETECTOR_DPI))

    effective_batch_size = get_batch_size() * batch_multiplier
    detections = batch_text_detection(
        rendered,
        det_model,
        processor,
        batch_size=effective_batch_size,
    )

    # zip stops at the shorter sequence, so pages without a prediction are
    # left unchanged.
    for target_page, detection in zip(pages, detections):
        target_page.text_lines = detection
|