import json
import os
import tempfile
import zipfile
from io import BytesIO

import fitz
import pandas as pd
import streamlit as st
from PIL import Image


def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
    """Return one text element per distinct span top-coordinate on ``page``."""
    lines = {}
    for block in page.get_text("dict")["blocks"]:
        # Only blocks of type 0 are read for "lines"; other block types are skipped.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                # Spans sharing the same top y-coordinate are treated as one line.
                lines.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(lines):
        spans = lines[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_number,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_number: int, output_folder: str) -> list:
    """Write every image referenced by ``page`` to disk and describe each one."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True), start=1):
        base_image = pdf_document.extract_image(img[0])
        # The bytes are written unmodified, so name the file after the format
        # reported by the extractor instead of always claiming PNG.
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder, f"page_{page_number}_img_{img_index}.{extension}"
        )
        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])
        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_number,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0,
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Collapse position-sorted elements into page content entries.

    Consecutive text elements are joined with a single space; each image
    becomes its own entry.
    """
    page_content = []
    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({'type': 'text', 'content': element['content']})
        elif element['type'] == 'image':
            page_content.append({'type': 'image', 'path': element['path']})
    return page_content


def extract_text_images(
    pdf_path: str,
    output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory that receives the extracted image files;
            created if it does not exist.
        minimum_font_size: Spans whose font size is below this value are
            ignored.
        extraction_type: 'text', 'images' or 'both'.

    Returns:
        A list with one ``{'page': <1-based number>, 'content': [...]}`` dict
        per page. Content entries are either ``{'type': 'text', 'content': str}``
        or ``{'type': 'image', 'path': str}``, ordered by top coordinate and
        then by left coordinate.
    """
    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(pdf_document.page_count):
            page = pdf_document.load_page(page_index)
            page_number = page_index + 1
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_number, minimum_font_size)
                )
            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(pdf_document, page, page_number, output_folder)
                )

            elements.sort(key=lambda e: (e['top'], e['x0']))
            extraction_data.append({
                'page': page_number,
                'content': _merge_page_content(elements),
            })

    return extraction_data


def convert_to_xlsx(data: list) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Args:
        data: Page records as returned by ``extract_text_images``.

    Returns:
        An in-memory workbook, rewound to position 0, holding a single
        'Extraction' sheet with 'Page' and 'Content' columns. Image entries
        appear as ``[Image: <path>]``.
    """
    rows = []
    for item in data:
        page_number = item['page']
        for content in item['content']:
            if content['type'] == 'text':
                rows.append({'Page': page_number, 'Content': content['content']})
            elif content['type'] == 'image':
                rows.append({
                    'Page': page_number,
                    'Content': f"[Image: {content['path']}]"
                })

    # Explicit columns keep the header row present when nothing was extracted.
    df = pd.DataFrame(rows, columns=['Page', 'Content'])
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')
    output.seek(0)
    return output


def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Creates a ZIP file containing both images and JSON data.

    Args:
        output_folder: Accepted for interface compatibility; the image
            locations are taken from ``extraction_data`` instead.
        extraction_data: Page records as returned by ``extract_text_images``.

    Returns:
        A ``BytesIO`` rewound to position 0 containing 'extraction_data.json'
        plus every referenced image stored under its base file name.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zip_file:
        # Add JSON file
        json_data = json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8')
        zip_file.writestr("extraction_data.json", json_data)

        # Add images
        for item in extraction_data:
            for content in item['content']:
                if content['type'] == 'image':
                    image_path = content['path']
                    zip_file.write(image_path, os.path.basename(image_path))
    zip_buffer.seek(0)
    return zip_buffer


def main():
    """Render the Streamlit page: upload, preview, extraction and downloads."""
    st.markdown("\n\nPDF DATA SNACHER:PAGEWISE\n\n", unsafe_allow_html=True)
    st.markdown(
        "\n\nExtract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images\n\n",
        unsafe_allow_html=True
    )
    st.sidebar.markdown('', unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:", min_value=1, max_value=5, value=1
        )
        # getvalue() returns the whole upload regardless of the current stream
        # position, and the context manager releases the preview document.
        with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as pdf_document:
            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.sidebar.image(
                    image, caption=f"Page {page_num + 1} Preview", use_column_width=True
                )

    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:", ("text", "images", "both")
    )

    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:", min_value=1, value=2
    )

    # NOTE(review): this value is only checked for being non-empty; extraction
    # below writes into a temporary directory, not into this folder. Confirm
    # whether the input should be honoured or removed.
    output_folder = st.text_input("Output folder path:")

    if st.button("Start Extraction"):
        if pdf_file is not None and output_folder:
            with tempfile.TemporaryDirectory() as temp_dir:
                # basename() keeps the uploaded name from escaping temp_dir.
                temp_pdf_path = os.path.join(temp_dir, os.path.basename(pdf_file.name))
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path, temp_dir, minimum_font_size, extraction_type
                )
                st.json(extraction_data)

                if extraction_type in ('images', 'both'):
                    # Must run while temp_dir still exists: the ZIP reads the
                    # image files from disk.
                    zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
                    st.download_button(
                        label="Download ZIP",
                        data=zip_data,
                        file_name='extraction_data.zip',
                        mime='application/zip'
                    )

                xlsx_data = convert_to_xlsx(extraction_data)
                col1, col2 = st.columns(2)
                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json'
                    )
                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                    )
        else:
            st.error("Please upload a PDF file and provide an output folder path.")

    st.markdown(
        """ """,
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()