Gopikanth123 committed on
Commit
e41d1ae
·
verified ·
1 Parent(s): d95f096

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -26
app.py CHANGED
@@ -7,7 +7,7 @@ import time
7
  from pathlib import Path
8
  import shutil
9
 
10
- # Function to extract content from PDF
11
  def extract_pdf_content(file_path):
12
  # Open the PDF
13
  pdf_file = fitz.open(file_path)
@@ -15,9 +15,8 @@ def extract_pdf_content(file_path):
15
 
16
  # Ensure images directory exists
17
  images_dir = "temp_images"
18
- if os.path.exists(images_dir):
19
- shutil.rmtree(images_dir) # Clean up previous images
20
- os.makedirs(images_dir)
21
 
22
  # Store extracted content
23
  all_text = []
@@ -50,7 +49,7 @@ def extract_pdf_content(file_path):
50
  base_image = pdf_file.extract_image(xref)
51
  image_bytes = base_image["image"]
52
  image_ext = base_image["ext"]
53
- image_name = f"{images_dir}/image_{i}.{image_ext}"
54
  image_paths.append(image_name)
55
 
56
  with open(image_name, "wb") as image_file:
@@ -61,27 +60,29 @@ def extract_pdf_content(file_path):
61
 
62
  return "\n".join(all_text), all_tables, image_paths
63
 
64
- # Gradio Interface
65
- def display_pdf_content(file_path, progress=gr.Progress()):
66
- # Extract content with progress updates
67
- progress(0, desc="Starting extraction...")
68
- time.sleep(1)
69
- progress(0.25, desc="Extracting text...")
70
- text, tables, images = extract_pdf_content(file_path)
71
- progress(0.5, desc="Extracting tables...")
72
- time.sleep(1)
73
- progress(0.75, desc="Extracting images...")
74
- time.sleep(1)
75
- progress(1.0, desc="Extraction complete!")
 
 
76
 
77
  # Convert tables to HTML with advanced styling
78
  table_html = ""
79
- for idx, table in enumerate(tables):
80
  table_html += f"<h3>Table {idx + 1}</h3>"
81
  table_html += table.to_html(index=False, border=1, classes="table table-striped table-bordered")
82
 
83
  # Return outputs
84
- return text, table_html, images
85
 
86
  # Custom CSS for advanced styling
87
  custom_css = """
@@ -116,8 +117,12 @@ custom_css = """
116
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
117
  }
118
  .scrollable {
119
- max-height: 400px;
120
- overflow-y: auto;
 
 
 
 
121
  border: 1px solid #ddd;
122
  padding: 10px;
123
  border-radius: 5px;
@@ -141,22 +146,35 @@ custom_css = """
141
  with gr.Blocks(css=custom_css) as demo:
142
  gr.Markdown("# Advanced PDF Content Extractor")
143
  with gr.Row():
144
- pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
145
  with gr.Row():
146
  with gr.Column():
147
  gr.Markdown("### Extracted Text")
148
- text_output = gr.Textbox(label="Text", lines=15, interactive=False, elem_classes="scrollable")
 
 
 
 
 
149
  with gr.Column():
150
  gr.Markdown("### Extracted Images")
151
- image_gallery = gr.Gallery(label="Images", columns=4, height="auto", elem_classes="scrollable")
 
 
 
 
 
152
  with gr.Row():
153
  with gr.Column():
154
  gr.Markdown("### Extracted Tables")
155
- table_output = gr.HTML(label="Tables", elem_classes="scrollable center")
 
 
 
156
 
157
  # Main function call
158
  pdf_input.change(
159
- fn=display_pdf_content,
160
  inputs=pdf_input,
161
  outputs=[text_output, table_output, image_gallery]
162
  )
 
7
  from pathlib import Path
8
  import shutil
9
 
10
+ # Function to extract content from a single PDF
11
  def extract_pdf_content(file_path):
12
  # Open the PDF
13
  pdf_file = fitz.open(file_path)
 
15
 
16
  # Ensure images directory exists
17
  images_dir = "temp_images"
18
+ if not os.path.exists(images_dir):
19
+ os.makedirs(images_dir)
 
20
 
21
  # Store extracted content
22
  all_text = []
 
49
  base_image = pdf_file.extract_image(xref)
50
  image_bytes = base_image["image"]
51
  image_ext = base_image["ext"]
52
+ image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}" # Unique name for each image
53
  image_paths.append(image_name)
54
 
55
  with open(image_name, "wb") as image_file:
 
60
 
61
  return "\n".join(all_text), all_tables, image_paths
62
 
63
+ # Function to handle multiple PDFs
64
def process_multiple_pdfs(files, progress=gr.Progress()):
    """Extract text, tables and images from every uploaded PDF and merge them.

    Args:
        files: The value delivered by the ``gr.File`` input configured with
            ``file_count="multiple"``. May be ``None`` or empty when the
            upload widget is cleared. Each entry is either an object exposing
            the temporary path as ``.name`` or, presumably in some Gradio
            versions, a plain path string -- both are accepted.
        progress: Gradio progress tracker, called once per file and once more
            on completion.

    Returns:
        A 3-tuple ``(text, table_html, image_paths)``:
        the per-file texts joined with newlines, each preceded by a
        ``=== File: <name> ===`` header; one HTML string holding every
        extracted table under a numbered ``<h3>`` heading; and the flat list
        of image paths returned by ``extract_pdf_content``.
    """
    # The change event also fires when the selection is cleared; without this
    # guard ``len(files)`` would raise TypeError on None.
    if not files:
        return "", "", []

    aggregated_text = []
    aggregated_tables = []
    aggregated_images = []

    total_files = len(files)
    for idx, file in enumerate(files):
        # Accept both tempfile-like objects (path in ``.name``) and plain
        # path strings.
        file_path = getattr(file, "name", file)
        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")
        text, tables, images = extract_pdf_content(file_path)
        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
        aggregated_tables.extend(tables)
        aggregated_images.extend(images)

    # The loop reports idx / total, which never reaches 1.0 on its own.
    progress(1.0, desc="Extraction complete!")

    # Convert tables to HTML with advanced styling; tables are numbered
    # across all files, not per file.
    table_html = "".join(
        f"<h3>Table {idx + 1}</h3>"
        + table.to_html(index=False, border=1, classes="table table-striped table-bordered")
        for idx, table in enumerate(aggregated_tables)
    )

    return "\n".join(aggregated_text), table_html, aggregated_images
86
 
87
  # Custom CSS for advanced styling
88
  custom_css = """
 
117
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
118
  }
119
  .scrollable {
120
+ max-height: 400px; /* Fixed height for vertical scrolling */
121
+ max-width: 100%; /* Ensure the width is constrained */
122
+ overflow-y: auto; /* Enable vertical scrolling */
123
+ overflow-x: auto; /* Enable horizontal scrolling */
124
+ white-space: pre-wrap; /* Preserve whitespace and wrap text */
125
+ word-wrap: break-word; /* Break long words if necessary */
126
  border: 1px solid #ddd;
127
  padding: 10px;
128
  border-radius: 5px;
 
146
  with gr.Blocks(css=custom_css) as demo:
147
  gr.Markdown("# Advanced PDF Content Extractor")
148
  with gr.Row():
149
+ pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
150
  with gr.Row():
151
  with gr.Column():
152
  gr.Markdown("### Extracted Text")
153
+ text_output = gr.Textbox(
154
+ label="Text",
155
+ lines=15,
156
+ interactive=False,
157
+ elem_classes="scrollable" # Apply scrollable class
158
+ )
159
  with gr.Column():
160
  gr.Markdown("### Extracted Images")
161
+ image_gallery = gr.Gallery(
162
+ label="Images",
163
+ columns=4,
164
+ height="auto",
165
+ elem_classes="scrollable"
166
+ )
167
  with gr.Row():
168
  with gr.Column():
169
  gr.Markdown("### Extracted Tables")
170
+ table_output = gr.HTML(
171
+ label="Tables",
172
+ elem_classes="scrollable center"
173
+ )
174
 
175
  # Main function call
176
  pdf_input.change(
177
+ fn=process_multiple_pdfs,
178
  inputs=pdf_input,
179
  outputs=[text_output, table_output, image_gallery]
180
  )