File size: 1,618 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from collections import Counter
from typing import List, Optional, Dict, Any

from marker.schema.bbox import BboxElement
from marker.schema.block import Block, Span
from surya.schema import TextDetectionResult, LayoutResult, OrderResult


class Page(BboxElement):
    """A single page: its blocks plus optional per-page analysis results.

    The helper methods below flatten the ``blocks -> lines -> spans``
    hierarchy so callers can gather page-wide lines, spans, font sizes,
    line heights and preliminary text.
    """

    blocks: List[Block]
    pnum: int
    rotation: Optional[int] = None  # Rotation degrees of the page
    text_lines: Optional[TextDetectionResult] = None
    layout: Optional[LayoutResult] = None
    order: Optional[OrderResult] = None
    ocr_method: Optional[str] = None  # One of "surya" or "tesseract"
    char_blocks: Optional[List[Dict]] = None  # Blocks with character-level data from pdftext
    images: Optional[List[Any]] = None  # Images to save along with the page, need Any to avoid pydantic error

    def get_nonblank_lines(self):
        """Return every line whose ``prelim_text`` is non-empty after stripping."""
        kept = []
        for line in self.get_all_lines():
            if line.prelim_text.strip():
                kept.append(line)
        return kept

    def get_all_lines(self):
        """Return the lines of all blocks as one flat list, in block order."""
        collected = []
        for block in self.blocks:
            collected.extend(block.lines)
        return collected

    def get_nonblank_spans(self) -> List[Span]:
        """Return every span on the page whose text is non-empty after stripping."""
        return [
            span
            for block in self.blocks
            for line in block.lines
            for span in line.spans
            if span.text.strip()
        ]

    def get_font_sizes(self):
        """Return the ``font_size`` of each non-blank span, one entry per span."""
        return [span.font_size for span in self.get_nonblank_spans()]

    def get_line_heights(self):
        """Return ``bbox[3] - bbox[1]`` for each non-blank line."""
        heights = []
        for line in self.get_nonblank_lines():
            box = line.bbox
            heights.append(box[3] - box[1])
        return heights

    @property
    def prelim_text(self):
        """Preliminary text of all blocks, joined with newlines in block order."""
        return "\n".join(block.prelim_text for block in self.blocks)