ChinmayBH committed on
Commit
c343a33
1 Parent(s): 4516170

updated app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -28
app.py CHANGED
@@ -5,6 +5,7 @@ import fitz
5
  from io import BytesIO
6
  from PIL import Image
7
  import pandas as pd
 
8
  import tempfile
9
 
10
  def extract_text_images(
@@ -14,28 +15,11 @@ def extract_text_images(
14
  ) -> dict:
15
  """
16
  Extracts text and/or images from a PDF and organizes them by pages.
17
-
18
- Params
19
- -------
20
- pdf_path: str
21
- Path to the input PDF file.
22
- output_folder: str
23
- Path to the output folder where extracted data will be saved.
24
- minimum_font_size: int
25
- Minimum font size below which the text will be ignored.
26
- extraction_type: str
27
- Type of extraction, either 'text', 'images', or 'both'.
28
-
29
- Returns
30
- -------
31
- dict
32
- The extracted data organized by pages.
33
  """
34
  if not os.path.exists(output_folder):
35
  os.makedirs(output_folder)
36
 
37
  extraction_data = []
38
-
39
  pdf_document = fitz.open(pdf_path)
40
 
41
  for page_number in range(pdf_document.page_count):
@@ -125,6 +109,9 @@ def extract_text_images(
125
  return extraction_data
126
 
127
  def convert_to_xlsx(data: dict) -> BytesIO:
 
 
 
128
  rows = []
129
 
130
  for item in data:
@@ -152,12 +139,33 @@ def convert_to_xlsx(data: dict) -> BytesIO:
152
  output.seek(0)
153
  return output
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def main():
156
  st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
157
  st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
158
 
159
  st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
160
-
161
  pdf_file = st.file_uploader("Upload PDF", type="pdf")
162
 
163
  if pdf_file is not None:
@@ -185,42 +193,52 @@ def main():
185
  min_value=1, value=2
186
  )
187
 
 
 
188
  if st.button("Start Extraction"):
189
- if pdf_file is not None:
190
- with tempfile.TemporaryDirectory() as output_folder:
191
- temp_pdf_path = os.path.join(output_folder, pdf_file.name)
192
  with open(temp_pdf_path, "wb") as f:
193
  f.write(pdf_file.getvalue())
194
 
195
  extraction_data = extract_text_images(
196
  temp_pdf_path,
197
- output_folder,
198
  minimum_font_size,
199
  extraction_type
200
  )
201
 
202
  st.json(extraction_data)
203
 
 
 
 
 
 
 
 
 
 
204
  xlsx_data = convert_to_xlsx(extraction_data)
205
 
206
  col1, col2 = st.columns(2)
207
-
208
  with col1:
209
  st.download_button(
210
  label="Download JSON",
211
  data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
212
  file_name='extraction_data.json',
213
- mime='application/json')
214
-
215
  with col2:
216
  st.download_button(
217
  label="Download XLSX",
218
  data=xlsx_data,
219
  file_name='extraction_data.xlsx',
220
- mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
221
-
222
  else:
223
- st.error("Please upload a PDF file.")
224
 
225
  st.markdown(
226
  """
 
5
  from io import BytesIO
6
  from PIL import Image
7
  import pandas as pd
8
+ import zipfile
9
  import tempfile
10
 
11
  def extract_text_images(
 
15
  ) -> dict:
16
  """
17
  Extracts text and/or images from a PDF and organizes them by pages.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  """
19
  if not os.path.exists(output_folder):
20
  os.makedirs(output_folder)
21
 
22
  extraction_data = []
 
23
  pdf_document = fitz.open(pdf_path)
24
 
25
  for page_number in range(pdf_document.page_count):
 
109
  return extraction_data
110
 
111
  def convert_to_xlsx(data: dict) -> BytesIO:
112
+ """
113
+ Converts the extracted data to an XLSX file.
114
+ """
115
  rows = []
116
 
117
  for item in data:
 
139
  output.seek(0)
140
  return output
141
 
142
def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Creates an in-memory ZIP file containing the JSON data and the images.

    Params
    -------
    output_folder: str
        Not read by this function; kept so existing callers that pass it
        keep working.
    extraction_data: list
        Items are dicts whose 'content' entry is a list of dicts. Entries
        with 'type' == 'image' must provide the image file location under
        'path'. Items without 'content' and entries without 'type' are
        ignored.

    Returns
    -------
    BytesIO
        Buffer positioned at offset 0 holding the ZIP archive. It contains
        'extraction_data.json' plus every referenced image stored flat
        under its base name.

    Raises
    ------
    KeyError
        If an image entry has no 'path'.
    FileNotFoundError
        If a referenced image file does not exist.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zip_file:
        # Add JSON file
        json_data = json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8')
        zip_file.writestr("extraction_data.json", json_data)

        # Images are stored flat, so track what has been written already:
        # the same file must not be added twice, and two different files
        # sharing a base name must not end up under one member name.
        added_paths = set()
        used_names = {"extraction_data.json"}

        # Add images
        for item in extraction_data:
            for content in item.get('content', []):
                if content.get('type') != 'image':
                    continue

                image_path = content['path']
                if image_path in added_paths:
                    continue
                added_paths.add(image_path)

                base_name = os.path.basename(image_path)
                stem, extension = os.path.splitext(base_name)
                image_name = base_name
                suffix = 1
                # Disambiguate colliding base names: img.png, img_1.png, ...
                while image_name in used_names:
                    image_name = f"{stem}_{suffix}{extension}"
                    suffix += 1
                used_names.add(image_name)

                zip_file.write(image_path, image_name)

    # Rewind so the caller can read the archive from the start.
    zip_buffer.seek(0)
    return zip_buffer
162
+
163
  def main():
164
  st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
165
  st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
166
 
167
  st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
168
+
169
  pdf_file = st.file_uploader("Upload PDF", type="pdf")
170
 
171
  if pdf_file is not None:
 
193
  min_value=1, value=2
194
  )
195
 
196
+ output_folder = st.text_input("Output folder path:")
197
+
198
  if st.button("Start Extraction"):
199
+ if pdf_file is not None and output_folder:
200
+ with tempfile.TemporaryDirectory() as temp_dir:
201
+ temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
202
  with open(temp_pdf_path, "wb") as f:
203
  f.write(pdf_file.getvalue())
204
 
205
  extraction_data = extract_text_images(
206
  temp_pdf_path,
207
+ temp_dir,
208
  minimum_font_size,
209
  extraction_type
210
  )
211
 
212
  st.json(extraction_data)
213
 
214
+ if extraction_type == 'images' or extraction_type == 'both':
215
+ zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
216
+ st.download_button(
217
+ label="Download ZIP",
218
+ data=zip_data,
219
+ file_name='extraction_data.zip',
220
+ mime='application/zip'
221
+ )
222
+
223
  xlsx_data = convert_to_xlsx(extraction_data)
224
 
225
  col1, col2 = st.columns(2)
 
226
  with col1:
227
  st.download_button(
228
  label="Download JSON",
229
  data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
230
  file_name='extraction_data.json',
231
+ mime='application/json'
232
+ )
233
  with col2:
234
  st.download_button(
235
  label="Download XLSX",
236
  data=xlsx_data,
237
  file_name='extraction_data.xlsx',
238
+ mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
239
+ )
240
  else:
241
+ st.error("Please upload a PDF file and provide an output folder path.")
242
 
243
  st.markdown(
244
  """