import os import json import fitz import pdfplumber import pandas as pd import streamlit as st from tempfile import NamedTemporaryFile from PIL import Image import io def extract_text_images( pdf_path: str, output_folder: str, minimum_font_size: int, extract_text: bool = True, extract_images: bool = True, mode: str = 'headerwise', header_font_sizes: list[float] = None, tolerance: float = 0.01, ) -> dict: """ Extracts text and/or images from a PDF and organizes them either by headers or by pages. Params ------- pdf_path: str Path to the input PDF file. output_folder: str Path to the output folder where extracted data will be saved. extract_text: bool Whether to extract text. extract_images: bool Whether to extract images. mode: str Extraction mode, either 'headerwise' or 'pagewise'. header_font_sizes: list[float] List of font sizes to be considered as headers. tolerance: float Tolerance for font size comparison. Returns ------- dict Dictionary containing extracted text and/or image data. """ if not os.path.exists(output_folder): os.makedirs(output_folder) extraction_data = [] current_header = None current_header_content = [] def add_current_header_content() -> None: """ Adds the current header and its content to the extraction data. """ nonlocal current_header, current_header_content if current_header: extraction_data.append({ 'header': current_header, 'content': current_header_content }) current_header_content = [] current_header = None def is_header_font_size(font_size: float) -> bool: """ Checks if a given font size matches any of the header font sizes. """ return any( abs(font_size - header_font_size) <= tolerance for header_font_size in header_font_sizes ) pdf_document = fitz.open(pdf_path) for page_number in range(pdf_document.page_count): page = pdf_document.load_page(page_number) elements = [] if extract_text: # Extract text blocks with their positions and font sizes text_blocks = page.get_text("dict")["blocks"] lines = {} # Group text blocks by their vertical position (top) to form lines for block in text_blocks: if block["type"] == 0: # Text block for line in block["lines"]: for span in line["spans"]: font_size = span["size"] top = span["bbox"][1] # Skip text blocks with font size less than 10 if font_size < minimum_font_size: continue if top not in lines: lines[top] = [] lines[top].append(span) # Process each line to check if it's a header for top in sorted(lines.keys()): line = lines[top] line_text = " ".join([span['text'] for span in line]) line_font_size = line[0]['size'] elements.append({ 'type': 'text', 'font_size': line_font_size, 'page': page_number + 1, 'content': line_text, 'x0': line[0]['bbox'][0], 'top': top }) if extract_images: # Extract images using PyMuPDF image_list = page.get_images(full=True) for img_index, img in enumerate(image_list): xref = img[0] base_image = pdf_document.extract_image(xref) image_bytes = base_image["image"] image_filename = os.path.join( output_folder, f"page_{page_number + 1}_img_{img_index + 1}.png" ) with open(image_filename, "wb") as img_file: img_file.write(image_bytes) # Get the position of the image img_rect = page.get_image_bbox(img) elements.append({ 'type': 'image', 'page': page_number + 1, 'path': image_filename, 'x0': img_rect.x0, 'top': img_rect.y0 }) # Sort elements by their vertical position (top) first, # and then by horizontal position (x0) elements.sort(key=lambda e: (e['top'], e['x0'])) if mode == 'headerwise': # Process elements to extract headers and content for element in elements: if element['type'] == 'text' and \ is_header_font_size(element['font_size']): # If a new header is found, # finalize the current header content add_current_header_content() current_header = element['content'] elif element['type'] == 'text': if current_header_content and \ current_header_content[-1]['type'] == 'text': current_header_content[-1]['content'] \ += " " + element['content'] else: current_header_content.append({ 'type': 'text', 'content': element['content'] }) elif element['type'] == 'image': current_header_content.append({ 'type': 'image', 'path': element['path'] }) elif mode == 'pagewise': page_content = [] for element in elements: if element['type'] == 'text': if page_content and \ page_content[-1]['type'] == 'text': page_content[-1]['content'] \ += " " + element['content'] else: page_content.append({ 'type': 'text', 'content': element['content'] }) elif element['type'] == 'image': page_content.append({ 'type': 'image', 'path': element['path'] }) extraction_data.append({ 'page': page_number + 1, 'content': page_content }) # After the loop, finalize any remaining header content if mode == 'headerwise': add_current_header_content() pdf_document.close() return extraction_data def get_word_font_sizes(pdf_path, target_words): word_font_sizes = {word: [] for word in target_words} with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: words = page.extract_words(extra_attrs=['fontname', 'size']) for word in words: text = word['text'].strip() if text in target_words: word_font_sizes[text].append(word['size']) return word_font_sizes def preview_pdf(pdf_path, num_pages=1): pdf_document = fitz.open(pdf_path) preview_images = [] for page_number in range(min(num_pages, pdf_document.page_count)): page = pdf_document.load_page(page_number) pix = page.get_pixmap() img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) preview_images.append(img) pdf_document.close() return preview_images # Streamlit UI import io def main(): # setting page config st.set_page_config( page_title="Object counting", page_icon="🧊", layout="wide", initial_sidebar_state="expanded", menu_items={ 'Get Help': 'https://www.extremelycoolapp.com/help', 'Report a bug': "https://www.extremelycoolapp.com/bug", } ) st.markdown("