import os
from typing import List, Optional, Dict, Tuple

import pypdfium2 as pdfium
import pypdfium2.internal as pdfium_i

from marker.pdf.utils import font_flags_decomposer
from marker.settings import settings
from marker.schema.block import Span, Line, Block
from marker.schema.page import Page
from pdftext.extraction import dictionary_output

# Exported at import time from the project settings
# (presumably consumed by tesseract-based OCR elsewhere -- TODO confirm).
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX


def pdftext_format_to_blocks(page, pnum: int) -> Page:
    """Convert one page of pdftext dictionary output into a marker ``Page``.

    Args:
        page: Mapping for a single page. The keys read here are ``"blocks"``,
            ``"bbox"``, ``"rotation"`` and ``"page"``; each block provides
            ``"lines"`` and ``"bbox"``; each line provides ``"spans"`` and
            ``"bbox"``; each span provides ``"text"``, ``"bbox"`` and a
            ``"font"`` mapping with ``"name"``, ``"flags"``, ``"weight"`` and
            ``"size"``.
        pnum: Page number used to build span ids (``"{pnum}_{n}"``) and stored
            on every ``Block``.

    Returns:
        A ``Page`` whose bbox is normalised to ``[0, 0, width, height]``
        (width/height swapped for 90/270 degree rotations), containing only
        blocks that have at least one line with non-negative area. The raw
        ``page["blocks"]`` are passed through unchanged as ``char_blocks``.
    """
    page_blocks = []
    span_id = 0  # Running counter, unique across the whole page
    for block in page["blocks"]:
        block_lines = []
        for line in block["lines"]:
            spans = []
            for span in line["spans"]:
                # Remove trailing newlines and carriage returns (tesseract),
                # but keep trailing spaces.
                span_text = span["text"].rstrip("\r\n")
                # Remove hyphenated line breaks remaining inside the span
                span_text = span_text.replace("-\n", "")

                font = span["font"]
                span_obj = Span(
                    text=span_text,
                    bbox=span["bbox"],
                    span_id=f"{pnum}_{span_id}",
                    # Add font flags to end of font name
                    font=f"{font['name']}_{font_flags_decomposer(font['flags'])}",
                    font_weight=font["weight"],
                    font_size=font["size"],
                )
                spans.append(span_obj)
                span_id += 1

            line_obj = Line(
                spans=spans,
                bbox=line["bbox"],
            )
            # Only select valid lines, i.e. those whose area is not negative
            if line_obj.area >= 0:
                block_lines.append(line_obj)

        block_obj = Block(
            lines=block_lines,
            bbox=block["bbox"],
            pnum=pnum,
        )
        # Only select blocks with lines
        if block_lines:
            page_blocks.append(block_obj)

    raw_bbox = page["bbox"]
    page_width = abs(raw_bbox[2] - raw_bbox[0])
    page_height = abs(raw_bbox[3] - raw_bbox[1])
    rotation = page["rotation"]

    # Flip width and height if rotated
    if rotation in (90, 270):
        page_width, page_height = page_height, page_width

    return Page(
        blocks=page_blocks,
        pnum=page["page"],
        bbox=[0, 0, page_width, page_height],
        rotation=rotation,
        char_blocks=page["blocks"],
    )


def get_text_blocks(doc, max_pages: Optional[int] = None) -> Tuple[List[Page], List[Dict]]:
    """Extract marker pages and the table of contents from a document.

    Args:
        doc: Document object supporting ``len()`` and ``get_toc()``; it is
            also handed to pdftext's ``dictionary_output``.
        max_pages: If truthy, only the first ``min(max_pages, len(doc))``
            pages are extracted. ``None`` or ``0`` means all pages.

    Returns:
        A ``(pages, toc)`` tuple: one ``Page`` per extracted page, and the
        list of table-of-contents entries produced by ``get_toc``.
    """
    toc = get_toc(doc)

    n_pages = len(doc)
    if max_pages:
        page_range = range(min(max_pages, n_pages))
    else:
        page_range = range(n_pages)

    char_blocks = dictionary_output(doc, page_range=page_range, keep_chars=True)
    marker_blocks = [
        pdftext_format_to_blocks(page, pnum)
        for pnum, page in enumerate(char_blocks)
    ]
    return marker_blocks, toc


def naive_get_text(doc) -> str:
    """Return the text of every page of ``doc``, each followed by a newline.

    Text is taken from ``get_text_bounded()`` called without arguments on
    each page's text page; no layout analysis is applied.
    """
    page_texts = []
    for page_idx in range(len(doc)):
        page = doc.get_page(page_idx)
        text_page = page.get_textpage()
        page_texts.append(text_page.get_text_bounded())
    # Join once instead of growing a string with += on every page.
    return "".join(f"{text}\n" for text in page_texts)


def get_toc(doc, max_depth=15):
    """Return the document outline as a list of plain dictionaries.

    Args:
        doc: Document object exposing ``get_toc(max_depth=...)``.
        max_depth: Forwarded unchanged to ``doc.get_toc``.

    Returns:
        One dict per outline item with the keys ``title``, ``level``,
        ``is_closed``, ``n_kids``, ``page_index``, ``view_mode`` and
        ``view_pos``. ``view_mode`` is looked up in
        ``pdfium_i.ViewmodeToStr`` and is ``None`` when the mode is not in
        that mapping.
    """
    return [
        {
            "title": item.title,
            "level": item.level,
            "is_closed": item.is_closed,
            "n_kids": item.n_kids,
            "page_index": item.page_index,
            "view_mode": pdfium_i.ViewmodeToStr.get(item.view_mode),
            "view_pos": item.view_pos,
        }
        for item in doc.get_toc(max_depth=max_depth)
    ]


def get_length_of_text(fname: str) -> int:
    """Return the number of characters of text in the PDF at ``fname``.

    The text is collected with ``naive_get_text`` and stripped of leading
    and trailing whitespace before it is measured.
    """
    doc = pdfium.PdfDocument(fname)
    try:
        return len(naive_get_text(doc).strip())
    finally:
        # Release the document handle instead of leaving it to the
        # garbage collector.
        doc.close()