ChinmayBH committed on
Commit
96fadd5
·
verified ·
1 Parent(s): bcae609

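Updated app.py: extract into a temporary directory (tempfile.TemporaryDirectory) instead of a user-supplied output folder path; removed inline comments, the convert_to_xlsx docstring, and the sidebar-header CSS block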

Browse files
Files changed (1) hide show
  1. app.py +36 -87
app.py CHANGED
@@ -5,6 +5,7 @@ import fitz
5
  from io import BytesIO
6
  from PIL import Image
7
  import pandas as pd
 
8
 
9
  def extract_text_images(
10
  pdf_path: str, output_folder: str,
@@ -42,19 +43,16 @@ def extract_text_images(
42
  elements = []
43
 
44
  if extraction_type in ('text', 'both'):
45
- # Extract text blocks with their positions and font sizes
46
  text_blocks = page.get_text("dict")["blocks"]
47
  lines = {}
48
 
49
- # Group text blocks by their vertical position (top) to form lines
50
  for block in text_blocks:
51
- if block["type"] == 0: # Text block
52
  for line in block["lines"]:
53
  for span in line["spans"]:
54
  font_size = span["size"]
55
  top = span["bbox"][1]
56
 
57
- # Skip text blocks with font size less than the minimum
58
  if font_size < minimum_font_size:
59
  continue
60
 
@@ -62,7 +60,6 @@ def extract_text_images(
62
  lines[top] = []
63
  lines[top].append(span)
64
 
65
- # Process each line
66
  for top in sorted(lines.keys()):
67
  line = lines[top]
68
  line_text = " ".join([span['text'] for span in line])
@@ -77,7 +74,6 @@ def extract_text_images(
77
  })
78
 
79
  if extraction_type in ('images', 'both'):
80
- # Extract images using PyMuPDF
81
  image_list = page.get_images(full=True)
82
 
83
  for img_index, img in enumerate(image_list):
@@ -92,7 +88,6 @@ def extract_text_images(
92
  with open(image_filename, "wb") as img_file:
93
  img_file.write(image_bytes)
94
 
95
- # Get the position of the image
96
  img_rect = page.get_image_bbox(img)
97
  elements.append({
98
  'type': 'image',
@@ -102,10 +97,8 @@ def extract_text_images(
102
  'top': img_rect.y0
103
  })
104
 
105
- # Sort elements by their vertical position (top) first, and then by horizontal position (x0)
106
  elements.sort(key=lambda e: (e['top'], e['x0']))
107
 
108
- # Process elements to extract content pagewise
109
  page_content = []
110
  for element in elements:
111
  if element['type'] == 'text':
@@ -132,19 +125,6 @@ def extract_text_images(
132
  return extraction_data
133
 
134
  def convert_to_xlsx(data: dict) -> BytesIO:
135
- """
136
- Converts the extracted data to an XLSX file.
137
-
138
- Params
139
- -------
140
- data: dict
141
- The extracted data organized by pages.
142
-
143
- Returns
144
- -------
145
- BytesIO
146
- The XLSX file in memory.
147
- """
148
  rows = []
149
 
150
  for item in data:
@@ -172,39 +152,20 @@ def convert_to_xlsx(data: dict) -> BytesIO:
172
  output.seek(0)
173
  return output
174
 
175
-
176
  def main():
177
  st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
178
  st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
179
 
180
- # Sidebar for PDF preview
181
- st.markdown(
182
- """
183
- <style>
184
- .sidebar-header {
185
- text-align: center;
186
- color: blue;
187
- padding: 5px 0;
188
- font-size:30px;
189
- font-weight: bold;
190
-
191
- }
192
- </style>
193
- """,
194
- unsafe_allow_html=True)
195
-
196
  st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
197
- # File uploader
198
  pdf_file = st.file_uploader("Upload PDF", type="pdf")
199
 
200
  if pdf_file is not None:
201
- # Slider to select number of pages to preview
202
  num_pages_to_preview = st.sidebar.slider(
203
  "Select number of pages to preview:",
204
  min_value=1, max_value=5, value=1
205
  )
206
 
207
- # Display PDF preview for selected number of pages
208
  pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
209
  for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
210
  page = pdf_document.load_page(page_num)
@@ -212,66 +173,55 @@ def main():
212
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
213
  st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
214
 
215
- # Extraction type selector
216
  st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
217
  extraction_type = st.selectbox(
218
  "Choose extraction type:",
219
  ("text", "images", "both")
220
  )
221
 
222
- # Minimum font size input
223
  st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
224
  minimum_font_size = st.number_input(
225
  "Minimum font size to extract:",
226
  min_value=1, value=2
227
  )
228
 
229
- # Output folder path input (full path provided by the user)
230
- output_folder = st.text_input(
231
- "Output folder path:"
232
- )
233
-
234
  if st.button("Start Extraction"):
235
- if pdf_file is not None and output_folder:
236
- # Save uploaded PDF to a temporary location
237
- temp_pdf_path = os.path.join(output_folder, pdf_file.name)
238
- with open(temp_pdf_path, "wb") as f:
239
- f.write(pdf_file.getvalue())
240
-
241
- # Call the extraction function
242
- extraction_data = extract_text_images(
243
- temp_pdf_path,
244
- output_folder,
245
- minimum_font_size,
246
- extraction_type
247
- )
248
-
249
- # Display extracted JSON data
250
- st.json(extraction_data)
251
-
252
- # Convert data to XLSX
253
- xlsx_data = convert_to_xlsx(extraction_data)
254
-
255
- col1, col2 = st.columns(2)
256
-
257
- with col1:
258
- st.download_button(
259
- label="Download JSON",
260
- data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
261
- file_name='extraction_data.json',
262
- mime='application/json')
263
-
264
- with col2:
265
- st.download_button(
266
- label="Download XLSX",
267
- data=xlsx_data,
268
- file_name='extraction_data.xlsx',
269
- mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
270
 
271
  else:
272
- st.error("Please upload a PDF file and provide an output folder path.")
273
 
274
- # Footer (Fixed Position)
275
  st.markdown(
276
  """
277
  <style>
@@ -296,6 +246,5 @@ def main():
296
  unsafe_allow_html=True
297
  )
298
 
299
-
300
  if __name__ == "__main__":
301
  main()
 
5
  from io import BytesIO
6
  from PIL import Image
7
  import pandas as pd
8
+ import tempfile
9
 
10
  def extract_text_images(
11
  pdf_path: str, output_folder: str,
 
43
  elements = []
44
 
45
  if extraction_type in ('text', 'both'):
 
46
  text_blocks = page.get_text("dict")["blocks"]
47
  lines = {}
48
 
 
49
  for block in text_blocks:
50
+ if block["type"] == 0:
51
  for line in block["lines"]:
52
  for span in line["spans"]:
53
  font_size = span["size"]
54
  top = span["bbox"][1]
55
 
 
56
  if font_size < minimum_font_size:
57
  continue
58
 
 
60
  lines[top] = []
61
  lines[top].append(span)
62
 
 
63
  for top in sorted(lines.keys()):
64
  line = lines[top]
65
  line_text = " ".join([span['text'] for span in line])
 
74
  })
75
 
76
  if extraction_type in ('images', 'both'):
 
77
  image_list = page.get_images(full=True)
78
 
79
  for img_index, img in enumerate(image_list):
 
88
  with open(image_filename, "wb") as img_file:
89
  img_file.write(image_bytes)
90
 
 
91
  img_rect = page.get_image_bbox(img)
92
  elements.append({
93
  'type': 'image',
 
97
  'top': img_rect.y0
98
  })
99
 
 
100
  elements.sort(key=lambda e: (e['top'], e['x0']))
101
 
 
102
  page_content = []
103
  for element in elements:
104
  if element['type'] == 'text':
 
125
  return extraction_data
126
 
127
  def convert_to_xlsx(data: dict) -> BytesIO:
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  rows = []
129
 
130
  for item in data:
 
152
  output.seek(0)
153
  return output
154
 
 
155
  def main():
156
  st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
157
  st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
160
+
161
  pdf_file = st.file_uploader("Upload PDF", type="pdf")
162
 
163
  if pdf_file is not None:
 
164
  num_pages_to_preview = st.sidebar.slider(
165
  "Select number of pages to preview:",
166
  min_value=1, max_value=5, value=1
167
  )
168
 
 
169
  pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
170
  for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
171
  page = pdf_document.load_page(page_num)
 
173
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
174
  st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
175
 
 
176
  st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
177
  extraction_type = st.selectbox(
178
  "Choose extraction type:",
179
  ("text", "images", "both")
180
  )
181
 
 
182
  st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
183
  minimum_font_size = st.number_input(
184
  "Minimum font size to extract:",
185
  min_value=1, value=2
186
  )
187
 
 
 
 
 
 
188
  if st.button("Start Extraction"):
189
+ if pdf_file is not None:
190
+ with tempfile.TemporaryDirectory() as output_folder:
191
+ temp_pdf_path = os.path.join(output_folder, pdf_file.name)
192
+ with open(temp_pdf_path, "wb") as f:
193
+ f.write(pdf_file.getvalue())
194
+
195
+ extraction_data = extract_text_images(
196
+ temp_pdf_path,
197
+ output_folder,
198
+ minimum_font_size,
199
+ extraction_type
200
+ )
201
+
202
+ st.json(extraction_data)
203
+
204
+ xlsx_data = convert_to_xlsx(extraction_data)
205
+
206
+ col1, col2 = st.columns(2)
207
+
208
+ with col1:
209
+ st.download_button(
210
+ label="Download JSON",
211
+ data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
212
+ file_name='extraction_data.json',
213
+ mime='application/json')
214
+
215
+ with col2:
216
+ st.download_button(
217
+ label="Download XLSX",
218
+ data=xlsx_data,
219
+ file_name='extraction_data.xlsx',
220
+ mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
 
 
 
221
 
222
  else:
223
+ st.error("Please upload a PDF file.")
224
 
 
225
  st.markdown(
226
  """
227
  <style>
 
246
  unsafe_allow_html=True
247
  )
248
 
 
249
  if __name__ == "__main__":
250
  main()