Spaces:

Gopikanth123
/

pdf_content_extraction

Running

App Files Community

Gopikanth123 committed on 3 days ago

Commit

e41d1ae

verified ·

1 Parent(s): d95f096

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -26

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 import shutil
-# Function to extract content from PDF
 def extract_pdf_content(file_path):
     # Open the PDF
     pdf_file = fitz.open(file_path)
@@ -15,9 +15,8 @@ def extract_pdf_content(file_path):
     # Ensure images directory exists
     images_dir = "temp_images"
-    if os.path.exists(images_dir):
-        shutil.rmtree(images_dir)  # Clean up previous images
-    os.makedirs(images_dir)
     # Store extracted content
     all_text = []
@@ -50,7 +49,7 @@ def extract_pdf_content(file_path):
             base_image = pdf_file.extract_image(xref)
             image_bytes = base_image["image"]
             image_ext = base_image["ext"]
-            image_name = f"{images_dir}/image_{i}.{image_ext}"
             image_paths.append(image_name)
             with open(image_name, "wb") as image_file:
@@ -61,27 +60,29 @@ def extract_pdf_content(file_path):
     return "\n".join(all_text), all_tables, image_paths
-# Gradio Interface
-def display_pdf_content(file_path, progress=gr.Progress()):
-    # Extract content with progress updates
-    progress(0, desc="Starting extraction...")
-    time.sleep(1)
-    progress(0.25, desc="Extracting text...")
-    text, tables, images = extract_pdf_content(file_path)
-    progress(0.5, desc="Extracting tables...")
-    time.sleep(1)
-    progress(0.75, desc="Extracting images...")
-    time.sleep(1)
-    progress(1.0, desc="Extraction complete!")
     # Convert tables to HTML with advanced styling
     table_html = ""
-    for idx, table in enumerate(tables):
         table_html += f"<h3>Table {idx + 1}</h3>"
         table_html += table.to_html(index=False, border=1, classes="table table-striped table-bordered")
     # Return outputs
-    return text, table_html, images
 # Custom CSS for advanced styling
 custom_css = """
@@ -116,8 +117,12 @@ custom_css = """
         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
     }
     .scrollable {
-        max-height: 400px;
-        overflow-y: auto;
         border: 1px solid #ddd;
         padding: 10px;
         border-radius: 5px;
@@ -141,22 +146,35 @@ custom_css = """
 with gr.Blocks(css=custom_css) as demo:
     gr.Markdown("# Advanced PDF Content Extractor")
     with gr.Row():
-        pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Extracted Text")
-            text_output = gr.Textbox(label="Text", lines=15, interactive=False, elem_classes="scrollable")
         with gr.Column():
             gr.Markdown("### Extracted Images")
-            image_gallery = gr.Gallery(label="Images", columns=4, height="auto", elem_classes="scrollable")
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Extracted Tables")
-            table_output = gr.HTML(label="Tables", elem_classes="scrollable center")
     # Main function call
     pdf_input.change(
-        fn=display_pdf_content,
         inputs=pdf_input,
         outputs=[text_output, table_output, image_gallery]
     )

 from pathlib import Path
 import shutil
+# Function to extract content from a single PDF
 def extract_pdf_content(file_path):
     # Open the PDF
     pdf_file = fitz.open(file_path)
     # Ensure images directory exists
     images_dir = "temp_images"
+    if not os.path.exists(images_dir):
+        os.makedirs(images_dir)
     # Store extracted content
     all_text = []
             base_image = pdf_file.extract_image(xref)
             image_bytes = base_image["image"]
             image_ext = base_image["ext"]
+            image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}"  # Unique name for each image
             image_paths.append(image_name)
             with open(image_name, "wb") as image_file:
     return "\n".join(all_text), all_tables, image_paths
+# Function to handle multiple PDFs
+def process_multiple_pdfs(files, progress=gr.Progress()):
+    aggregated_text = []
+    aggregated_tables = []
+    aggregated_images = []
+    total_files = len(files)
+    for idx, file in enumerate(files):
+        file_path = file.name  # Get the temporary file path
+        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")
+        text, tables, images = extract_pdf_content(file_path)
+        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
+        aggregated_tables.extend(tables)
+        aggregated_images.extend(images)
     # Convert tables to HTML with advanced styling
     table_html = ""
+    for idx, table in enumerate(aggregated_tables):
         table_html += f"<h3>Table {idx + 1}</h3>"
         table_html += table.to_html(index=False, border=1, classes="table table-striped table-bordered")
     # Return outputs
+    return "\n".join(aggregated_text), table_html, aggregated_images
 # Custom CSS for advanced styling
 custom_css = """
         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
     }
     .scrollable {
+        max-height: 400px; /* Fixed height for vertical scrolling */
+        max-width: 100%; /* Ensure the width is constrained */
+        overflow-y: auto; /* Enable vertical scrolling */
+        overflow-x: auto; /* Enable horizontal scrolling */
+        white-space: pre-wrap; /* Preserve whitespace and wrap text */
+        word-wrap: break-word; /* Break long words if necessary */
         border: 1px solid #ddd;
         padding: 10px;
         border-radius: 5px;
 with gr.Blocks(css=custom_css) as demo:
     gr.Markdown("# Advanced PDF Content Extractor")
     with gr.Row():
+        pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Extracted Text")
+            text_output = gr.Textbox(
+                label="Text",
+                lines=15,
+                interactive=False,
+                elem_classes="scrollable"  # Apply scrollable class
+            )
         with gr.Column():
             gr.Markdown("### Extracted Images")
+            image_gallery = gr.Gallery(
+                label="Images",
+                columns=4,
+                height="auto",
+                elem_classes="scrollable"
+            )
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Extracted Tables")
+            table_output = gr.HTML(
+                label="Tables",
+                elem_classes="scrollable center"
+            )
     # Main function call
     pdf_input.change(
+        fn=process_multiple_pdfs,
         inputs=pdf_input,
         outputs=[text_output, table_output, image_gallery]
     )