# File size: 2,468 Bytes
# c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import re
from typing import List

from marker.ocr.utils import alphanum_ratio
from marker.schema.bbox import rescale_bbox, box_intersection_pct
from marker.schema.page import Page
from marker.settings import settings


def should_ocr_page(page: Page, no_text: bool):
    """Decide whether a single page should be sent through OCR.

    A page is selected when any of the following holds:

    * ``no_text`` is truthy (the caller reports the full doc has no text),
    * the page's ``prelim_text`` is non-empty and ``detect_bad_ocr`` flags it,
    * ``detected_line_coverage`` returned ``False`` for the page.

    When none of these hold, the value of ``settings.OCR_ALL_PAGES`` is
    returned, so that setting acts as a global override.
    """
    # Coverage is always computed first, regardless of the other checks.
    lines_covered = detected_line_coverage(page)

    # Empty text is not run through the bad-OCR heuristic here.
    text_is_garbled = len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)

    if no_text or text_is_garbled or lines_covered is False:
        return True

    # No page-level reason to OCR; fall back to the global setting.
    return settings.OCR_ALL_PAGES


def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
    """Heuristically flag text that looks like the result of a failed OCR pass.

    The checks run in order and the first one that fires returns ``True``:

    1. ``text`` is empty.
    2. The number of whitespace *runs* divided by (non-whitespace characters
       + whitespace runs) exceeds ``space_threshold``.
    3. The number of newline *runs* divided by (newline runs + characters
       left after removing newline runs) exceeds ``newline_threshold``.
    4. ``alphanum_ratio(text)`` is below ``alphanum_threshold``.
    5. The count of characters found in ``settings.INVALID_CHARS`` exceeds
       ``max(4.0, 3% of len(text))``.

    Returns ``False`` when none of the checks fire.
    """
    text_length = len(text)
    if text_length == 0:
        # No text at all is treated as an OCR failure.
        return True

    # Note: these are counts of contiguous runs, not individual characters.
    whitespace_runs = len(re.findall(r'\s+', text))
    visible_chars = len(re.sub(r'\s+', '', text))
    space_share = whitespace_runs / (visible_chars + whitespace_runs)
    if space_share > space_threshold:
        return True

    newline_runs = len(re.findall(r'\n+', text))
    remaining_chars = len(re.sub(r'\n+', '', text))
    newline_share = newline_runs / (newline_runs + remaining_chars)
    if newline_share > newline_threshold:
        return True

    # Too few alphanumeric characters: treat as garbled text.
    if alphanum_ratio(text) < alphanum_threshold:
        return True

    # A small absolute allowance (4) or 3% of the text, whichever is larger.
    bad_char_count = sum(1 for ch in text if ch in settings.INVALID_CHARS)
    return bad_char_count > max(4.0, text_length * .03)


def no_text_found(pages: List[Page]):
    """Return ``True`` when no page carries any non-whitespace preliminary text.

    Each page's ``prelim_text`` is checked on its own, so the scan stops at
    the first page that has real content instead of concatenating the text
    of the whole document first. An empty ``pages`` list yields ``True``.
    """
    # The joined text is blank exactly when every page's text is blank.
    return all(len(page.prelim_text.strip()) == 0 for page in pages)


def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
    """Check whether enough detected text lines are matched by extracted lines.

    Every bbox in ``page.text_lines.bboxes`` is rescaled with ``rescale_bbox``
    from ``page.text_lines.image_bbox`` to ``page.bbox``. Its
    ``box_intersection_pct`` against every line bbox in ``page.blocks`` is
    then summed. A detected line counts as found when that sum exceeds
    ``intersect_thresh``.

    Returns ``False`` when there are no detected lines; otherwise returns
    whether the fraction of found lines exceeds ``detection_thresh``.
    """
    detections = page.text_lines.bboxes
    detection_count = len(detections)
    if detection_count == 0:
        # With nothing detected there is no ratio to compute.
        return False

    # Bounding boxes of all extracted lines, in block order then line order.
    extracted_boxes = [line.bbox for block in page.blocks for line in block.lines]

    covered = 0
    for detection in detections:
        # Rescale the detected bbox to match the dimensions of the original page.
        scaled_box = rescale_bbox(page.text_lines.image_bbox, page.bbox, detection.bbox)

        overlap = 0
        for extracted_box in extracted_boxes:
            overlap += box_intersection_pct(scaled_box, extracted_box)

        if overlap > intersect_thresh:
            covered += 1

    return covered / detection_count > detection_thresh