|
from collections import Counter |
|
from typing import List, Optional, Dict, Any |
|
|
|
from marker.schema.bbox import BboxElement |
|
from marker.schema.block import Block, Span |
|
from surya.schema import TextDetectionResult, LayoutResult, OrderResult |
|
|
|
|
|
class Page(BboxElement):
    """A page made of blocks, plus optional per-page results and metadata.

    The accessor methods flatten the ``blocks -> lines -> spans`` hierarchy
    so callers can work with flat lists of lines, spans, font sizes and
    line heights.
    """

    blocks: List[Block]
    pnum: int  # presumably the page number/index; confirm 0- vs 1-based with callers
    rotation: Optional[int] = None  # presumably in degrees; TODO confirm
    text_lines: Optional[TextDetectionResult] = None
    layout: Optional[LayoutResult] = None
    order: Optional[OrderResult] = None
    ocr_method: Optional[str] = None  # NOTE(review): allowed values are not visible here
    char_blocks: Optional[List[Dict]] = None
    images: Optional[List[Any]] = None

    def get_nonblank_lines(self) -> List[Any]:
        """Return the lines whose ``prelim_text`` is not empty or whitespace-only."""
        return [line for line in self.get_all_lines() if line.prelim_text.strip()]

    def get_all_lines(self) -> List[Any]:
        """Return the lines of every block as one flat list, in block order."""
        return [line for block in self.blocks for line in block.lines]

    def get_nonblank_spans(self) -> List[Span]:
        """Return the spans, across all lines, whose ``text`` is not blank.

        A span is kept when ``text.strip()`` is non-empty. The filter is
        applied per span, so spans are not dropped just because other spans
        on the same line are blank.
        """
        # Reuse get_all_lines() so the block -> line flattening lives in one place.
        return [
            span
            for line in self.get_all_lines()
            for span in line.spans
            if span.text.strip()
        ]

    def get_font_sizes(self) -> List[Any]:
        """Return the ``font_size`` of each non-blank span, one entry per span."""
        return [span.font_size for span in self.get_nonblank_spans()]

    def get_line_heights(self) -> List[Any]:
        """Return ``bbox[3] - bbox[1]`` for each non-blank line.

        Presumably ``bbox`` is ``[x0, y0, x1, y1]``, making this the vertical
        extent of the line; confirm against ``BboxElement``.
        """
        return [line.bbox[3] - line.bbox[1] for line in self.get_nonblank_lines()]

    @property
    def prelim_text(self) -> str:
        """Return the ``prelim_text`` of all blocks joined with newlines."""
        return "\n".join([block.prelim_text for block in self.blocks])
|
|