# Scraped file-viewer header removed (reported file size: 5,449 bytes; commits 7eaea00 / e41d1ae; line-number gutter 1-183).
import fitz # PyMuPDF
import os
import pandas as pd
import pdfplumber
import gradio as gr
import time
from pathlib import Path
import shutil
# Function to extract content from a single PDF
def extract_pdf_content(file_path):
    """Extract text, tables and embedded images from a single PDF.

    Text and image data are read with PyMuPDF (``fitz``); tables are read
    with pdfplumber. Every image reference found on a page is written to the
    ``temp_images`` directory, which is created if it does not exist.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A tuple ``(text, tables, image_paths)``:

        * ``text`` -- the text of all pages joined with newlines, each page
          preceded by a ``--- Page N ---`` header (1-based).
        * ``tables`` -- a list with one ``pandas.DataFrame`` per table found.
        * ``image_paths`` -- paths of the image files written to disk.
    """
    images_dir = "temp_images"
    os.makedirs(images_dir, exist_ok=True)

    all_text = []
    all_tables = []
    images_list = []
    image_paths = []

    # Open each backend exactly once (the pdfplumber document used to be
    # re-opened for every page). The context managers close both documents
    # even if extraction raises part-way through.
    with fitz.open(file_path) as pdf_file, pdfplumber.open(file_path) as plumber_pdf:
        for page_num, page_content in enumerate(pdf_file):
            # Plain text of the page, tagged with a 1-based page header.
            text = page_content.get_text("text")
            all_text.append(f"--- Page {page_num + 1} ---\n{text}")

            # Tables come from pdfplumber's view of the same page index.
            tables = plumber_pdf.pages[page_num].extract_tables()
            all_tables.extend(pd.DataFrame(table) for table in tables)

            # Collect image references; the bytes are extracted below.
            images_list.extend(page_content.get_images(full=True))

        # Must run while the fitz document is still open.
        for i, image in enumerate(images_list, start=1):
            xref = image[0]  # first element of the image entry is its xref
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            # Timestamp plus running index keeps names distinct across calls.
            image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}"
            with open(image_name, "wb") as image_file:
                image_file.write(image_bytes)
            image_paths.append(image_name)

    return "\n".join(all_text), all_tables, image_paths
# Function to handle multiple PDFs
def process_multiple_pdfs(files, progress=gr.Progress()):
    """Extract content from several uploaded PDFs and merge the results.

    Args:
        files: Uploaded files. Each entry is either an object exposing the
            temporary file path as ``.name`` or a plain path string. May be
            ``None`` or empty, in which case empty outputs are returned.
        progress: Gradio progress tracker, called once per file with the
            fraction of files already processed.

    Returns:
        A tuple ``(text, table_html, image_paths)``:

        * ``text`` -- extracted text of all files joined with newlines, each
          file preceded by a ``=== File: <name> ===`` header.
        * ``table_html`` -- every extracted table rendered as HTML, each
          preceded by a ``<h3>Table N</h3>`` heading.
        * ``image_paths`` -- paths of all extracted image files.
    """
    # Nothing uploaded: return empty outputs instead of failing on
    # len(None) below.
    if not files:
        return "", "", []

    aggregated_text = []
    aggregated_tables = []
    aggregated_images = []
    total_files = len(files)

    for idx, file in enumerate(files):
        # Fall back to the entry itself when it is already a path string.
        file_path = getattr(file, "name", file)
        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")
        text, tables, images = extract_pdf_content(file_path)
        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
        aggregated_tables.extend(tables)
        aggregated_images.extend(images)

    # Render every table as HTML; the class names match the rules in
    # custom_css. Built with join rather than repeated string +=.
    table_html = "".join(
        f"<h3>Table {number}</h3>"
        + table.to_html(index=False, border=1, classes="table table-striped table-bordered")
        for number, table in enumerate(aggregated_tables, start=1)
    )

    return "\n".join(aggregated_text), table_html, aggregated_images
# Custom CSS passed to gr.Blocks(css=...) below.
# - .table / .table-striped / .table-bordered style the HTML tables, whose
#   class names are set in process_multiple_pdfs via DataFrame.to_html.
# - .scrollable and .center are attached to components through elem_classes.
# - .gallery, .row and .column are not referenced by name elsewhere in this
#   file; presumably they target Gradio-generated markup -- TODO confirm.
custom_css = """
.gradio-container {
max-width: 1200px;
margin: auto;
}
.table {
width: 100%;
margin-bottom: 1rem;
color: #212529;
}
.table-striped tbody tr:nth-of-type(odd) {
background-color: rgba(0, 0, 0, 0.05);
}
.table-bordered {
border: 1px solid #dee2e6;
}
.table-bordered th,
.table-bordered td {
border: 1px solid #dee2e6;
}
.gallery {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
.gallery img {
max-width: 100%;
height: auto;
border-radius: 5px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.scrollable {
max-height: 400px; /* Fixed height for vertical scrolling */
max-width: 100%; /* Ensure the width is constrained */
overflow-y: auto; /* Enable vertical scrolling */
overflow-x: auto; /* Enable horizontal scrolling */
white-space: pre-wrap; /* Preserve whitespace and wrap text */
word-wrap: break-word; /* Break long words if necessary */
border: 1px solid #ddd;
padding: 10px;
border-radius: 5px;
}
.row {
display: flex;
gap: 20px;
margin-bottom: 20px;
}
.column {
flex: 1;
}
.center {
text-align: center;
margin: auto;
width: 80%;
}
"""
# Create Gradio Interface
# Layout: a file-upload row, then a row with extracted text and images side
# by side, then a row with the extracted tables rendered as HTML.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Advanced PDF Content Extractor")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Text")
            text_output = gr.Textbox(
                label="Text",
                lines=15,
                interactive=False,
                elem_classes="scrollable",  # Apply scrollable class
            )
        with gr.Column():
            gr.Markdown("### Extracted Images")
            image_gallery = gr.Gallery(
                label="Images",
                columns=4,
                height="auto",
                elem_classes="scrollable",
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Tables")
            table_output = gr.HTML(
                label="Tables",
                elem_classes="scrollable center",
            )

    # Re-run extraction whenever the uploaded file selection changes. The
    # outputs are listed in the order process_multiple_pdfs returns them:
    # text, table HTML, image paths.
    pdf_input.change(
        fn=process_multiple_pdfs,
        inputs=pdf_input,
        outputs=[text_output, table_output, image_gallery],
    )

# Launch the Gradio app only when run as a script, so importing this module
# (e.g. to reuse the extraction functions) does not start a server.
if __name__ == "__main__":
    demo.launch()