import streamlit as st
import os
import json
import fitz
from io import BytesIO
from PIL import Image
import pandas as pd
import zipfile
import tempfile


def extract_text_images(
    pdf_path: str,
    output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by page.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    extraction_data = []
    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        elements = []

        if extraction_type in ('text', 'both'):
            text_blocks = page.get_text("dict")["blocks"]
            lines = {}
            for block in text_blocks:
                if block["type"] == 0:  # text block
                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span["size"]
                            top = span["bbox"][1]
                            if font_size < minimum_font_size:
                                continue
                            if top not in lines:
                                lines[top] = []
                            lines[top].append(span)

            # Rebuild lines in reading order (top to bottom)
            for top in sorted(lines.keys()):
                line = lines[top]
                line_text = " ".join([span['text'] for span in line])
                elements.append({
                    'type': 'text',
                    'font_size': line[0]['size'],
                    'page': page_number + 1,
                    'content': line_text,
                    'x0': line[0]['bbox'][0],
                    'top': top,
                })

        if extraction_type in ('images', 'both'):
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                # Use the format reported by PyMuPDF instead of assuming PNG
                image_ext = base_image["ext"]
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_number + 1}_img_{img_index + 1}.{image_ext}"
                )
                with open(image_filename, "wb") as img_file:
                    img_file.write(image_bytes)
                img_rect = page.get_image_bbox(img)
                elements.append({
                    'type': 'image',
                    'page': page_number + 1,
                    'path': image_filename,
                    'x0': img_rect.x0,
                    'top': img_rect.y0
                })

        # Sort all page elements by vertical, then horizontal position
        elements.sort(key=lambda e: (e['top'], e['x0']))

        page_content = []
        for element in elements:
            if element['type'] == 'text':
                # Merge consecutive text elements into a single block
                if page_content and page_content[-1]['type'] == 'text':
                    page_content[-1]['content'] += " " + element['content']
                else:
                    page_content.append({
                        'type': 'text',
                        'content': element['content']
                    })
            elif element['type'] == 'image':
                page_content.append({
                    'type': 'image',
                    'path': element['path']
                })

        extraction_data.append({
            'page': page_number + 1,
            'content': page_content
        })

    pdf_document.close()
    return extraction_data


def convert_to_xlsx(data: list) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.
    """
    rows = []
    for item in data:
        page_number = item['page']
        content_list = item['content']
        for content in content_list:
            if content['type'] == 'text':
                rows.append({
                    'Page': page_number,
                    'Content': content['content']
                })
            elif content['type'] == 'image':
                rows.append({
                    'Page': page_number,
                    'Content': f"[Image: {content['path']}]"
                })

    df = pd.DataFrame(rows)
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')
    output.seek(0)
    return output


def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Creates a ZIP file containing both images and JSON data.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zip_file:
        # Add JSON file
        json_data = json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8')
        zip_file.writestr("extraction_data.json", json_data)

        # Add images
        for item in extraction_data:
            for content in item['content']:
                if content['type'] == 'image':
                    image_path = content['path']
                    image_name = os.path.basename(image_path)
                    zip_file.write(image_path, image_name)

    zip_buffer.seek(0)
    return zip_buffer


def main():
    st.markdown("