import os
import shutil
import time
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import pdfplumber


# Function to extract content from a single PDF
def extract_pdf_content(file_path):
    """Extract the text, tables and embedded images of one PDF.

    Text and images are read with PyMuPDF; tables are read with pdfplumber.
    Every embedded image is written to the ``temp_images`` directory
    (created on demand, relative to the current working directory).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A 3-tuple ``(text, tables, image_paths)``:
        ``text`` is one string in which each page is introduced by a
        ``--- Page N ---`` line; ``tables`` is a list of ``pandas.DataFrame``
        objects, one per extracted table, in page order; ``image_paths`` is
        the list of image files written to disk.
    """
    # Ensure images directory exists (no check-then-create race)
    images_dir = "temp_images"
    os.makedirs(images_dir, exist_ok=True)

    # Store extracted content
    all_text = []
    all_tables = []
    images_list = []
    image_paths = []

    pdf_file = fitz.open(file_path)
    try:
        page_count = len(pdf_file)

        # Open the pdfplumber handle once per file rather than once per page.
        with pdfplumber.open(file_path) as plumber_pdf:
            for page_num in range(page_count):
                page_content = pdf_file[page_num]

                # Extract text
                text = page_content.get_text("text")
                all_text.append(f"--- Page {page_num + 1} ---\n{text}")

                # Extract tables using pdfplumber
                tables = plumber_pdf.pages[page_num].extract_tables()
                all_tables.extend(pd.DataFrame(table) for table in tables)

                # Collect image references; they are saved after the page loop
                images_list.extend(page_content.get_images(full=True))

        # Save extracted images
        for i, image in enumerate(images_list, start=1):
            xref = image[0]
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            # Timestamp plus running index keeps names distinct across calls
            image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}"
            with open(image_name, "wb") as image_file:
                image_file.write(image_bytes)
            image_paths.append(image_name)
    finally:
        # Close the PDF file even when extraction fails part-way through
        pdf_file.close()

    return "\n".join(all_text), all_tables, image_paths


# Function to handle multiple PDFs
def process_multiple_pdfs(files, progress=gr.Progress()):
    """Run ``extract_pdf_content`` over every uploaded file and merge the results.

    Args:
        files: The value of the multi-file upload component. Each entry is
            expected to expose its path as ``.name``; plain path strings are
            accepted as well. ``None`` or an empty list yields empty outputs.
        progress: Gradio progress tracker, updated once per file.

    Returns:
        A 3-tuple ``(text, table_html, image_paths)`` matching the three
        output components: the concatenated text of all files (each file
        introduced by a ``=== File: <name> ===`` line), one HTML string
        containing every extracted table, and the list of saved image paths.
    """
    # Nothing uploaded: return empty outputs instead of failing on len(None).
    if not files:
        return "", "", []

    aggregated_text = []
    aggregated_tables = []
    aggregated_images = []

    total_files = len(files)
    for idx, file in enumerate(files):
        # Get the temporary file path; fall back to the entry itself when it
        # is already a path string.
        file_path = getattr(file, "name", file)
        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")

        text, tables, images = extract_pdf_content(file_path)
        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
        aggregated_tables.extend(tables)
        aggregated_images.extend(images)

    # Convert tables to HTML with advanced styling.
    # NOTE(review): the markup around the "Table N" label was lost in the
    # source as received (the literal was split across lines and did not
    # parse); <h3> is an assumption -- confirm the intended heading tag.
    table_html = "".join(
        f"<h3>Table {idx + 1}</h3>"
        + table.to_html(
            index=False,
            border=1,
            classes="table table-striped table-bordered",
        )
        for idx, table in enumerate(aggregated_tables)
    )

    # Return outputs
    return "\n".join(aggregated_text), table_html, aggregated_images


# Custom CSS for advanced styling
custom_css = """
.gradio-container {
    max-width: 1200px;
    margin: auto;
}
.table {
    width: 100%;
    margin-bottom: 1rem;
    color: #212529;
}
.table-striped tbody tr:nth-of-type(odd) {
    background-color: rgba(0, 0, 0, 0.05);
}
.table-bordered {
    border: 1px solid #dee2e6;
}
.table-bordered th,
.table-bordered td {
    border: 1px solid #dee2e6;
}
.gallery {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
}
.gallery img {
    max-width: 100%;
    height: auto;
    border-radius: 5px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.scrollable {
    max-height: 400px; /* Fixed height for vertical scrolling */
    max-width: 100%; /* Ensure the width is constrained */
    overflow-y: auto; /* Enable vertical scrolling */
    overflow-x: auto; /* Enable horizontal scrolling */
    white-space: pre-wrap; /* Preserve whitespace and wrap text */
    word-wrap: break-word; /* Break long words if necessary */
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 5px;
}
.row {
    display: flex;
    gap: 20px;
    margin-bottom: 20px;
}
.column {
    flex: 1;
}
.center {
    text-align: center;
    margin: auto;
    width: 80%;
}
"""

# Create Gradio Interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Advanced PDF Content Extractor")

    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF Files",
            file_types=[".pdf"],
            file_count="multiple",
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Text")
            text_output = gr.Textbox(
                label="Text",
                lines=15,
                interactive=False,
                elem_classes="scrollable",  # Apply scrollable class
            )
        with gr.Column():
            gr.Markdown("### Extracted Images")
            image_gallery = gr.Gallery(
                label="Images",
                columns=4,
                height="auto",
                elem_classes="scrollable",
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Tables")
            table_output = gr.HTML(
                label="Tables",
                elem_classes="scrollable center",
            )

    # Main function call: re-run extraction whenever the upload changes.
    # The listener is registered inside the Blocks context.
    pdf_input.change(
        fn=process_multiple_pdfs,
        inputs=pdf_input,
        outputs=[text_output, table_output, image_gallery],
    )

# Launch the Gradio app
demo.launch()