File size: 1,618 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
from collections import Counter
from typing import List, Optional, Dict, Any
from marker.schema.bbox import BboxElement
from marker.schema.block import Block, Span
from surya.schema import TextDetectionResult, LayoutResult, OrderResult
class Page(BboxElement):
    """One page of a document: its blocks plus per-page processing results.

    The helper methods flatten the ``blocks -> lines -> spans`` nesting so
    callers can work with page-wide collections.
    """

    blocks: List[Block]
    pnum: int
    # Rotation degrees of the page
    rotation: Optional[int] = None
    text_lines: Optional[TextDetectionResult] = None
    layout: Optional[LayoutResult] = None
    order: Optional[OrderResult] = None
    # One of "surya" or "tesseract"
    ocr_method: Optional[str] = None
    # Blocks with character-level data from pdftext
    char_blocks: Optional[List[Dict]] = None
    # Images to save along with the page, need Any to avoid pydantic error
    images: Optional[List[Any]] = None

    def get_nonblank_lines(self):
        """Return the page's lines whose ``prelim_text`` is not whitespace-only."""
        return [
            line for line in self.get_all_lines() if line.prelim_text.strip()
        ]

    def get_all_lines(self):
        """Return the lines of every block as one flat list, in block order."""
        collected = []
        for block in self.blocks:
            collected.extend(block.lines)
        return collected

    def get_nonblank_spans(self) -> List[Span]:
        """Return every span on the page whose ``text`` is not whitespace-only."""
        kept = []
        for block in self.blocks:
            for line in block.lines:
                for span in line.spans:
                    if span.text.strip():
                        kept.append(span)
        return kept

    def get_font_sizes(self):
        """Return the ``font_size`` of each non-blank span, in page order."""
        return [span.font_size for span in self.get_nonblank_spans()]

    def get_line_heights(self):
        """Return ``bbox[3] - bbox[1]`` for each non-blank line on the page."""
        line_heights = []
        for line in self.get_nonblank_lines():
            line_heights.append(line.bbox[3] - line.bbox[1])
        return line_heights

    @property
    def prelim_text(self):
        """Join the ``prelim_text`` of every block, separated by newlines."""
        block_texts = [block.prelim_text for block in self.blocks]
        return "\n".join(block_texts)
|