import os
import shutil
import time
from contextlib import closing
from pathlib import Path

import fitz
import gradio as gr
import pandas as pd
import pdfplumber


def extract_pdf_content(file_path):
    """Extract text, tables and embedded images from a single PDF.

    Text and image references are read with PyMuPDF (``fitz``); tables are
    extracted with ``pdfplumber``. Every embedded image is written to the
    ``temp_images`` directory, which is created if it does not exist.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(text, tables, image_paths)`` tuple:

        * ``text`` -- the text of all pages joined by newlines, each page
          preceded by a ``--- Page N ---`` header (1-based).
        * ``tables`` -- a list with one ``pandas.DataFrame`` per extracted
          table, in page order.
        * ``image_paths`` -- paths of the image files written to disk.
    """
    images_dir = "temp_images"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(images_dir, exist_ok=True)

    all_text = []
    all_tables = []
    images_list = []
    image_paths = []

    # Open each reader exactly once (re-opening pdfplumber for every page
    # re-parses the whole file) and guarantee both are closed on error.
    with closing(fitz.open(file_path)) as pdf_file, pdfplumber.open(file_path) as pdf:
        for page_num in range(len(pdf_file)):
            page_content = pdf_file[page_num]

            text = page_content.get_text("text")
            all_text.append(f"--- Page {page_num + 1} ---\n{text}")

            all_tables.extend(
                pd.DataFrame(table)
                for table in pdf.pages[page_num].extract_tables()
            )

            images_list.extend(page_content.get_images(full=True))

        # Images must be extracted while the document is still open.
        for i, image in enumerate(images_list, start=1):
            xref = image[0]  # first field of each image entry is used as the xref
            base_image = pdf_file.extract_image(xref)
            image_ext = base_image["ext"]
            # Timestamp plus running index keeps names distinct across calls.
            image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}"

            with open(image_name, "wb") as image_file:
                image_file.write(base_image["image"])
            image_paths.append(image_name)

    return "\n".join(all_text), all_tables, image_paths


def process_multiple_pdfs(files, progress=gr.Progress()):
    """Extract and aggregate the content of several uploaded PDFs.

    Args:
        files: The uploaded files as delivered by the ``gr.File`` input.
            Each item may be an object exposing the path as ``.name`` or a
            plain path string. ``None`` or an empty list yields empty outputs.
        progress: Gradio progress tracker; called once per file with the
            completed fraction and a description.

    Returns:
        A ``(text, table_html, image_paths)`` tuple:

        * ``text`` -- extracted text of all files joined by newlines, each
          file preceded by a ``=== File: <name> ===`` header.
        * ``table_html`` -- every extracted table rendered as HTML under a
          ``<h3>Table N</h3>`` heading, numbered across all files.
        * ``image_paths`` -- paths of all extracted image files.
    """
    # NOTE(review): the change event is expected to fire with None when the
    # upload is cleared; len(None) would raise, so return empty outputs.
    if not files:
        return "", "", []

    aggregated_text = []
    aggregated_tables = []
    aggregated_images = []

    total_files = len(files)
    for idx, file in enumerate(files):
        # Accept both file wrappers (path in ``.name``) and plain path strings.
        file_path = getattr(file, "name", file)
        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")
        text, tables, images = extract_pdf_content(file_path)
        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
        aggregated_tables.extend(tables)
        aggregated_images.extend(images)

    # Build the HTML in one join instead of repeated string concatenation.
    table_html = "".join(
        f"<h3>Table {idx + 1}</h3>"
        + table.to_html(index=False, border=1, classes="table table-striped table-bordered")
        for idx, table in enumerate(aggregated_tables)
    )

    return "\n".join(aggregated_text), table_html, aggregated_images


# Stylesheet injected into the Gradio page: page width, table styling for the
# HTML produced from extracted tables, gallery layout and scrollable panels.
custom_css = """
.gradio-container {
    max-width: 1200px;
    margin: auto;
}
.table {
    width: 100%;
    margin-bottom: 1rem;
    color: #212529;
}
.table-striped tbody tr:nth-of-type(odd) {
    background-color: rgba(0, 0, 0, 0.05);
}
.table-bordered {
    border: 1px solid #dee2e6;
}
.table-bordered th,
.table-bordered td {
    border: 1px solid #dee2e6;
}
.gallery {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
}
.gallery img {
    max-width: 100%;
    height: auto;
    border-radius: 5px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.scrollable {
    max-height: 400px; /* Fixed height for vertical scrolling */
    max-width: 100%; /* Ensure the width is constrained */
    overflow-y: auto; /* Enable vertical scrolling */
    overflow-x: auto; /* Enable horizontal scrolling */
    white-space: pre-wrap; /* Preserve whitespace and wrap text */
    word-wrap: break-word; /* Break long words if necessary */
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 5px;
}
.row {
    display: flex;
    gap: 20px;
    margin-bottom: 20px;
}
.column {
    flex: 1;
}
.center {
    text-align: center;
    margin: auto;
    width: 80%;
}
"""


# Page layout: an upload row, a row with text and image panels side by side,
# and a final row holding the rendered tables.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Advanced PDF Content Extractor")

    # Upload widget accepting several PDF files at once.
    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF Files",
            file_types=[".pdf"],
            file_count="multiple",
        )

    with gr.Row():
        # Left column: extracted text, read-only.
        with gr.Column():
            gr.Markdown("### Extracted Text")
            text_output = gr.Textbox(label="Text", lines=15, interactive=False, elem_classes="scrollable")
        # Right column: gallery of the extracted images.
        with gr.Column():
            gr.Markdown("### Extracted Images")
            image_gallery = gr.Gallery(label="Images", columns=4, height="auto", elem_classes="scrollable")

    # Single-column row with the tables rendered as HTML.
    with gr.Row(), gr.Column():
        gr.Markdown("### Extracted Tables")
        table_output = gr.HTML(label="Tables", elem_classes="scrollable center")

    # Re-run the extraction whenever the set of uploaded files changes; the
    # outputs follow the order of the tuple returned by the handler.
    pdf_input.change(
        fn=process_multiple_pdfs,
        inputs=pdf_input,
        outputs=[text_output, table_output, image_gallery],
    )

# Start the Gradio server.
demo.launch()