File size: 5,742 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import io
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from typing import Dict, List, Optional, Tuple

import pypdfium2 as pdfium
from surya.ocr import run_recognition

from marker.models import setup_recognition_model
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
from marker.ocr.lang import langs_to_ids
from marker.pdf.images import render_image
from marker.schema.page import Page
from marker.schema.block import Block, Line, Span
from marker.settings import settings
from marker.pdf.extract_text import get_text_blocks


def get_batch_size():
    """Return the batch size to use for text recognition.

    An explicit ``settings.RECOGNITION_BATCH_SIZE`` always takes precedence.
    Otherwise the value is chosen from ``settings.TORCH_DEVICE_MODEL``; at
    present every device resolves to 32.
    """
    override = settings.RECOGNITION_BATCH_SIZE
    if override is not None:
        return override

    # "cuda" and "mps" are still checked separately from the fallback,
    # mirroring the per-device structure even though the values coincide.
    if settings.TORCH_DEVICE_MODEL in ("cuda", "mps"):
        return 32
    return 32
def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1) -> Tuple[List[Page], Dict]:
    """Run OCR on the pages that need it and report what happened.

    Pages are selected with ``should_ocr_page``; the engine is taken from
    ``settings.OCR_ENGINE``. A page whose OCR output is non-empty and is not
    flagged by ``detect_bad_ocr`` replaces the original entry in ``pages``;
    otherwise the original page is kept and counted as a failure.

    Args:
        doc: Source document, forwarded to the selected OCR backend.
        pages: Pages of the document. Mutated in place for pages whose OCR
            succeeded.
        langs: Languages forwarded to the OCR backend.
        rec_model: Recognition model used by the "surya" engine. When it is
            None, a model is set up on demand from ``langs``.
        batch_multiplier: Forwarded to ``surya_recognition``.

    Returns:
        A ``(pages, stats)`` tuple. ``stats`` has the keys ``ocr_pages``,
        ``ocr_failed``, ``ocr_success`` and ``ocr_engine``. The engine is
        reported as "none" (with all counters 0) when no page needs OCR or
        when no engine is configured.

    Raises:
        ValueError: If ``settings.OCR_ENGINE`` is set to an unknown engine.
    """
    no_text = no_text_found(pages)
    ocr_idxs = [pnum for pnum, page in enumerate(pages) if should_ocr_page(page, no_text)]

    # Nothing to do: either no page needs OCR or OCR is disabled.
    if not ocr_idxs or settings.OCR_ENGINE is None:
        return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}

    ocr_method = settings.OCR_ENGINE
    if ocr_method == "surya":
        # Load model just in time if we're not OCRing everything
        del_rec_model = rec_model is None
        if del_rec_model:
            rec_model = setup_recognition_model(langs_to_ids(langs))

        new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)

        if del_rec_model:
            # Drop the only reference to the model that was loaded here.
            del rec_model
    elif ocr_method == "ocrmypdf":
        new_pages = tesseract_recognition(doc, ocr_idxs, langs)
    else:
        raise ValueError(f"Unknown OCR method {ocr_method}")

    ocr_success = 0
    ocr_failed = 0
    for orig_idx, page in zip(ocr_idxs, new_pages):
        # Read the text once; it is needed for both checks.
        text = page.prelim_text
        if detect_bad_ocr(text) or not text:
            ocr_failed += 1
        else:
            ocr_success += 1
            # Only successful OCR output replaces the original page.
            pages[orig_idx] = page

    return pages, {"ocr_pages": len(ocr_idxs), "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page], batch_multiplier=1) -> List[Optional[Page]]:
    """Recognize text on the selected pages with surya and rebuild them.

    Each selected page is rendered to an image and passed to
    ``run_recognition`` together with the polygons of the page's existing
    ``text_lines.bboxes``. Every recognized line becomes one ``Block``
    holding a single ``Line`` with a single ``Span``.

    Args:
        doc: Document indexable by page number; pages are rendered from it.
        page_idxs: Indices (into both ``doc`` and ``pages``) of pages to OCR.
        langs: Languages applied to every selected page.
        rec_model: Recognition model; its ``processor`` attribute is used too.
        pages: All pages of the document; supplies the text line boxes.
        batch_multiplier: Multiplies the value of ``get_batch_size()``.

    Returns:
        One new ``Page`` per entry of ``page_idxs``, in the same order, with
        ``ocr_method`` set to "surya".
    """
    if not page_idxs:
        return []

    # Index pages directly so that images, polygons and pages stay aligned
    # with page_idxs even if the indices are unsorted or repeated, and to
    # avoid a linear membership scan for every page.
    selected_pages = [pages[pnum] for pnum in page_idxs]
    images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
    surya_langs = [langs] * len(page_idxs)
    polygons = [[box.polygon for box in page.text_lines.bboxes] for page in selected_pages]

    results = run_recognition(
        images,
        surya_langs,
        rec_model,
        rec_model.processor,
        polygons=polygons,
        batch_size=get_batch_size() * batch_multiplier,
    )

    new_pages = []
    for page_idx, result, old_page in zip(page_idxs, results, selected_pages):
        blocks = []
        for i, line in enumerate(result.text_lines):
            # OCR output carries no font information, so the span gets
            # empty/zero font fields.
            span = Span(
                text=line.text,
                bbox=line.bbox,
                span_id=f"{page_idx}_{i}",
                font="",
                font_weight=0,
                font_size=0,
            )
            blocks.append(Block(
                bbox=line.bbox,
                pnum=page_idx,
                lines=[Line(bbox=line.bbox, spans=[span])],
            ))

        new_pages.append(Page(
            blocks=blocks,
            pnum=page_idx,
            bbox=result.image_bbox,
            rotation=0,
            # Keep the text line boxes of the page being replaced.
            text_lines=old_page.text_lines,
            ocr_method="surya",
        ))
    return new_pages
def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
    """OCR the selected pages via ``_tesseract_recognition`` in a thread pool.

    Each page is first exported as its own single-page PDF; the PDFs are then
    processed by ``settings.OCR_PARALLEL_WORKERS`` worker threads. Results
    are returned in the order of ``page_idxs``.
    """
    single_page_pdfs = generate_single_page_pdfs(doc, page_idxs)
    # Every worker call receives the same language list.
    lang_args = repeat(langs, len(single_page_pdfs))
    with ThreadPoolExecutor(max_workers=settings.OCR_PARALLEL_WORKERS) as pool:
        ocr_pages = list(pool.map(_tesseract_recognition, single_page_pdfs, lang_args))
    return ocr_pages
def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
    """Export each requested page of ``doc`` as its own in-memory PDF.

    Args:
        doc: Source pdfium document to copy pages from.
        page_idxs: Indices of the pages to export.

    Returns:
        One ``io.BytesIO`` per index, in the same order, each rewound to the
        start and containing a one-page PDF.

    Raises:
        AssertionError: If importing a page does not yield exactly one page.
    """
    pdf_pages = []
    for page_idx in page_idxs:
        blank_doc = pdfium.PdfDocument.new()
        try:
            blank_doc.import_pages(doc, pages=[page_idx])
            # Raised explicitly rather than via `assert` so that the check
            # still runs under `python -O`; the exception type is unchanged.
            if len(blank_doc) != 1:
                raise AssertionError("Failed to import page")
            in_pdf = io.BytesIO()
            blank_doc.save(in_pdf)
        finally:
            # Release the temporary document now instead of waiting for
            # garbage collection.
            blank_doc.close()
        in_pdf.seek(0)
        pdf_pages.append(in_pdf)
    return pdf_pages
def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
    """OCR a single-page PDF with ocrmypdf and return the extracted page.

    OCR is forced on the input, the result is written to an in-memory PDF,
    and that PDF's first page is read back with ``get_text_blocks``. Only the
    first entry of ``langs`` is passed to ocrmypdf.

    Args:
        in_pdf: The single-page PDF to process.
        langs: Languages; ``langs[0]`` is used.

    Returns:
        The first page of the OCRed PDF with ``ocr_method`` set to
        "tesseract".
    """
    # Imported lazily, so the module can be imported without ocrmypdf.
    import ocrmypdf

    ocr_options = dict(
        language=langs[0],
        output_type="pdf",
        redo_ocr=None,
        force_ocr=True,
        progress_bar=False,
        optimize=False,
        fast_web_view=1e6,
        skip_big=15,  # skip images larger than 15 megapixels
        tesseract_timeout=settings.TESSERACT_TIMEOUT,
        tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
    )
    out_pdf = io.BytesIO()
    ocrmypdf.ocr(in_pdf, out_pdf, **ocr_options)

    ocr_doc = pdfium.PdfDocument(out_pdf.getvalue())
    page_list, _ = get_text_blocks(ocr_doc, max_pages=1)
    ocr_page = page_list[0]
    ocr_page.ocr_method = "tesseract"
    return ocr_page
|