ChinmayBH committed on
Commit
8146073
·
verified ·
1 Parent(s): 503c3e1

Added pagewise only part to code

Browse files

separated two things, page wise and header wise

Files changed (1) hide show
  1. app.py +309 -376
app.py CHANGED
@@ -1,376 +1,309 @@
1
- import os
2
- import json
3
- import fitz
4
- import pdfplumber
5
- import pandas as pd
6
- import streamlit as st
7
- from tempfile import NamedTemporaryFile
8
- from PIL import Image
9
- import io
10
-
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] = None,
    tolerance: float = 0.01,
) -> dict:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted data will be saved.
    minimum_font_size: int
        Spans with a font size smaller than this are skipped entirely.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'.
    header_font_sizes: list[float]
        List of font sizes to be considered as headers.
        NOTE(review): only read in 'headerwise' mode; must not be None there.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    dict
        Dictionary containing extracted text and/or image data.
        NOTE(review): despite the annotation, this actually returns a *list*
        of dicts (one per header or per page).
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_font_sizes
        )

    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        elements = []

        if extract_text:
            # Extract text blocks with their positions and font sizes
            text_blocks = page.get_text("dict")["blocks"]
            lines = {}

            # Group text spans by their vertical position (top) to form lines.
            # NOTE(review): grouping uses *exact* float equality of the top
            # coordinate; spans that differ by a fraction of a point land in
            # separate lines.
            for block in text_blocks:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span["size"]
                            top = span["bbox"][1]

                            # Skip spans smaller than the configured minimum font size
                            if font_size < minimum_font_size:
                                continue

                            if top not in lines:
                                lines[top] = []
                            lines[top].append(span)

            # Process each line; the first span's size stands in for the
            # whole line when deciding whether it is a header.
            for top in sorted(lines.keys()):
                line = lines[top]
                line_text = " ".join([span['text'] for span in line])
                line_font_size = line[0]['size']

                elements.append({
                    'type': 'text',
                    'font_size': line_font_size,
                    'page': page_number + 1,
                    'content': line_text,
                    'x0': line[0]['bbox'][0],
                    'top': top
                })

        if extract_images:
            # Extract images using PyMuPDF
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_number + 1}_img_{img_index + 1}.png"
                )

                with open(image_filename, "wb") as img_file:
                    img_file.write(image_bytes)

                # Get the position of the image so it can be interleaved with
                # the text in reading order.
                img_rect = page.get_image_bbox(img)
                elements.append({
                    'type': 'image',
                    'page': page_number + 1,
                    'path': image_filename,
                    'x0': img_rect.x0,
                    'top': img_rect.y0
                })

        # Sort elements by their vertical position (top) first,
        # and then by horizontal position (x0)
        elements.sort(key=lambda e: (e['top'], e['x0']))

        if mode == 'headerwise':
            # Process elements to extract headers and content
            for element in elements:
                if element['type'] == 'text' and \
                        is_header_font_size(element['font_size']):
                    # If a new header is found,
                    # finalize the current header content
                    add_current_header_content()
                    current_header = element['content']
                elif element['type'] == 'text':
                    # Merge consecutive text elements into one running block.
                    if current_header_content and \
                            current_header_content[-1]['type'] == 'text':
                        current_header_content[-1]['content'] \
                            += " " + element['content']
                    else:
                        current_header_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    current_header_content.append({
                        'type': 'image',
                        'path': element['path']
                    })

        elif mode == 'pagewise':
            page_content = []
            for element in elements:
                if element['type'] == 'text':
                    if page_content and \
                            page_content[-1]['type'] == 'text':
                        page_content[-1]['content'] \
                            += " " + element['content']
                    else:
                        page_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    page_content.append({
                        'type': 'image',
                        'path': element['path']
                    })
            extraction_data.append({
                'page': page_number + 1,
                'content': page_content
            })

    # After the loop, finalize any remaining header content
    if mode == 'headerwise':
        add_current_header_content()

    pdf_document.close()

    return extraction_data
201
-
202
def get_word_font_sizes(pdf_path, target_words):
    """Collect every font size at which each target word appears in the PDF.

    Returns a dict mapping each word in ``target_words`` to the list of
    sizes observed for exact (whitespace-stripped) occurrences of that word.
    """
    sizes_by_word = {word: [] for word in target_words}

    with pdfplumber.open(pdf_path) as pdf:
        for pdf_page in pdf.pages:
            tokens = pdf_page.extract_words(extra_attrs=['fontname', 'size'])
            for token in tokens:
                stripped = token['text'].strip()
                if stripped in target_words:
                    sizes_by_word[stripped].append(token['size'])

    return sizes_by_word
213
-
214
def preview_pdf(pdf_path, num_pages=1):
    """Render the first ``num_pages`` pages of a PDF as PIL images.

    The page count is clamped to the document length, so asking for more
    pages than exist is safe.
    """
    document = fitz.open(pdf_path)
    pages_to_render = min(num_pages, document.page_count)

    rendered = []
    for index in range(pages_to_render):
        pixmap = document.load_page(index).get_pixmap()
        rendered.append(
            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        )

    document.close()
    return rendered
226
-
227
- # Streamlit UI
228
-
229
- import io
230
-
231
def main():
    """Streamlit entry point: upload a PDF, configure extraction settings in
    the sidebar, run header-wise or page-wise extraction, and offer the
    results as JSON / XLSX downloads."""
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload pdf </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill the values at right in data extraction settings </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in desired format </h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf:
        # Save the uploaded PDF to a temporary file so path-based libraries
        # (fitz, pdfplumber) can open it.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
            temp_pdf_path = temp_pdf.name

        # Collapsible PDF Preview
        with st.expander("PDF Preview", expanded=True):
            num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
            preview_images = preview_pdf(temp_pdf_path, num_pages)

            for img in preview_images:
                # NOTE(review): preview_images.index(img) is O(n) per image and
                # wrong for duplicate images; enumerate() would be correct.
                st.image(img, caption=f"Page {preview_images.index(img) + 1}", use_column_width=True)

        st.sidebar.title("DATA EXTRACTION SETTINGS")
        st.sidebar.write("How you want to extract data?")

        extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])
        # Font Size Detection: lets the user discover which font sizes their
        # header words use, to feed into the header-size field below.
        st.sidebar.title("FONT SIZE DETECTION")
        st.sidebar.warning("[Only in case of headerwise extraction] if you dont know the font size for your headers or text then copy paste any of those words below")
        target_words_input = st.sidebar.text_input(
            "Target words (comma-separated)", "")
        target_words = [word.strip() for word in target_words_input.split(",")]

        if st.sidebar.button("Get Font Sizes"):
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

        # st.sidebar.warning("Fill below required details")
        header_font_sizes = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
        # st.sidebar.info("Header sizes are only required in case of headerwise extraction")
        header_font_sizes = [float(size.strip()) for size in header_font_sizes.split(",")]
        st.sidebar.title("OUTPUT FOLDER PATH")
        # NOTE(review): os.path.dirname("Extracted_Data") returns "" — the
        # default shown to the user is empty; this likely meant
        # os.path.join(os.getcwd(), "Extracted_Data").
        output_folder = st.sidebar.text_input(" ", value=os.path.join(os.path.dirname ("Extracted_Data")))
        st.sidebar.info("what do you want to include in data extraction?")
        extract_text = st.sidebar.checkbox("Extract Text", value=True)
        extract_images = st.sidebar.checkbox("Extract Images", value=True)

        minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

        if st.sidebar.button("Start Extraction"):
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)

            extracted_data = extract_text_images(
                temp_pdf_path,
                output_folder,
                minimum_font_size=minimum_font_size,
                extract_text=extract_text,
                extract_images=extract_images,
                mode=extraction_mode,
                header_font_sizes=header_font_sizes
            )

            # Display extracted data as JSON
            st.json(extracted_data)

            # Convert extracted data to a pandas DataFrame.
            # Rows carry either a 'Header' or a 'Page' column depending on the
            # extraction mode that produced the data.
            def extract_to_dataframe(data):
                rows = []
                for item in data:
                    if 'header' in item:
                        header = item['header']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Header': header, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Header': header, 'Content': f"Image: {content_item['path']}"})
                    elif 'page' in item:
                        page_num = item['page']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Page': page_num, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Page': page_num, 'Content': f"Image: {content_item['path']}"})
                return pd.DataFrame(rows)

            df = extract_to_dataframe(extracted_data)

            # Save DataFrame to an in-memory BytesIO buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Extracted Data')
            buffer.seek(0)

            # Preview the first 5 lines of the XLSX data
            st.subheader("Preview of Extracted Data (First 5 Lines)")
            preview_df = pd.read_excel(buffer, sheet_name='Extracted Data')
            st.dataframe(preview_df.head())

            # Provide download options
            st.download_button(
                label="Download JSON",
                data=json.dumps(extracted_data, ensure_ascii=False),
                file_name='extracted_data.json',
                mime='application/json'
            )

            st.download_button(
                label="Download XLSX",
                data=buffer,
                file_name='extracted_data.xlsx',
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )

            st.success("Extraction complete. Data displayed as JSON.")

if __name__ == "__main__":
    main()
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import fitz
5
+ from io import BytesIO
6
+ from PIL import Image
7
+ import pandas as pd
8
+
9
+
10
+
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
    minimum_font_size: int
        Minimum font size below which the text will be ignored.
    extraction_type: str
        Type of extraction, either 'text', 'images', or 'both'.

    Returns
    -------
    list
        One dict per page: {'page': <1-based number>, 'content': [...]},
        where each content item is {'type': 'text', 'content': str} or
        {'type': 'image', 'path': str}.  (The original annotation said
        ``dict``, but a list has always been returned.)

    Raises
    -------
    ValueError
        If ``extraction_type`` is not one of 'text', 'images', or 'both'.
    """
    # Fail fast on a bad mode instead of silently returning empty pages.
    if extraction_type not in ('text', 'images', 'both'):
        raise ValueError(f"Invalid extraction_type: {extraction_type!r}")

    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_number, minimum_font_size))

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(pdf_document, page, page_number,
                                            output_folder))

            # Reading order: top-to-bottom, then left-to-right within a line.
            elements.sort(key=lambda e: (e['top'], e['x0']))

            extraction_data.append({
                'page': page_number + 1,
                'content': _merge_page_content(elements)
            })
    finally:
        # Close the document even if a page fails mid-extraction
        # (the original leaked the handle on error).
        pdf_document.close()

    return extraction_data


def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
    """Gather text lines from *page*, grouping spans that share the same
    vertical (top) coordinate into a single line element."""
    lines = {}
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 == text block in PyMuPDF's dict output
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                # Ignore text smaller than the configured minimum font size.
                if span["size"] < minimum_font_size:
                    continue
                lines.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(lines):
        spans = lines[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_number + 1,
            'content': " ".join(s['text'] for s in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_number: int,
                            output_folder: str) -> list:
    """Save each image on *page* into *output_folder* and record its position."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        image_bytes = pdf_document.extract_image(xref)["image"]
        image_filename = os.path.join(
            output_folder,
            f"page_{page_number + 1}_img_{img_index + 1}.png"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(image_bytes)

        # get_image_bbox can raise for images that are referenced by the page
        # but not actually displayed on it; fall back to the top-left corner
        # so a single odd image doesn't abort the whole extraction.
        try:
            img_rect = page.get_image_bbox(img)
            x0, top = img_rect.x0, img_rect.y0
        except ValueError:
            x0, top = 0.0, 0.0

        elements.append({
            'type': 'image',
            'page': page_number + 1,
            'path': image_filename,
            'x0': x0,
            'top': top
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Flatten position-sorted elements into page content, merging adjacent
    text elements into a single running text block."""
    page_content = []
    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content']
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path']
            })
    return page_content
135
+
136
def convert_to_xlsx(data: dict) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Params
    -------
    data: dict
        The extracted data organized by pages.

    Returns
    -------
    BytesIO
        The XLSX file in memory, positioned at the start of the stream.
    """
    # Flatten the per-page structure into one row per content item.
    records = []
    for page_entry in data:
        for piece in page_entry['content']:
            if piece['type'] == 'text':
                records.append({
                    'Page': page_entry['page'],
                    'Content': piece['content']
                })
            elif piece['type'] == 'image':
                records.append({
                    'Page': page_entry['page'],
                    'Content': f"[Image: {piece['path']}]"
                })

    workbook_buffer = BytesIO()
    with pd.ExcelWriter(workbook_buffer, engine='xlsxwriter') as writer:
        pd.DataFrame(records).to_excel(writer, index=False, sheet_name='Extraction')

    # Rewind so callers can read the workbook from the beginning.
    workbook_buffer.seek(0)
    return workbook_buffer
176
+
177
+
178
def main():
    """Streamlit entry point: upload a PDF, preview pages in the sidebar,
    run pagewise extraction, and offer the results as JSON / XLSX downloads."""
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNATCHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    # Sidebar styling for the preview header
    st.markdown(
        """
        <style>
        .sidebar-header {
            text-align: center;
            color: blue;
            padding: 5px 0;
            font-size:30px;
            font-weight: bold;

        }
        </style>
        """,
        unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
    # File uploader
    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        # Slider to select number of pages to preview
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # Render the first N pages in the sidebar.  .read() consumes the
        # upload stream, but the save below uses .getvalue(), which is
        # independent of the stream position.
        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)

        # Extraction type selector
        st.info("You can select **only text** or **only images** or **text and images both** to extract from pdf")
        extraction_type = st.selectbox(
            "Choose extraction type:",
            ("text", "images", "both")
        )

        # Minimum font size input
        st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
        minimum_font_size = st.number_input(
            "Minimum font size to extract:",
            min_value=1, value=2
        )

        # Output folder path input
        output_folder = st.text_input(
            "Output folder path:",
            os.path.join(os.getcwd(), "Extracted_Data")
        )

        if st.button("Start Extraction"):
            if pdf_file is not None:
                # BUGFIX: the folder must exist before the uploaded PDF can be
                # written into it; previously only extract_text_images created
                # it, which was too late and crashed the open() below on a
                # fresh run.
                os.makedirs(output_folder, exist_ok=True)

                # Save uploaded PDF to a temporary location
                temp_pdf_path = os.path.join(output_folder, pdf_file.name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                # Call the extraction function
                extraction_data = extract_text_images(
                    temp_pdf_path,
                    output_folder,
                    minimum_font_size,
                    extraction_type
                )

                # Display extracted JSON data
                st.json(extraction_data)

                # Convert data to XLSX (removed the dead pd.read_excel preview:
                # it did unused work and advanced the buffer position).
                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)

                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json')

                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

            else:
                st.error("Please upload a PDF file.")

    # Footer (Fixed Position)
    st.markdown(
        """
        <style>
        .footer {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            background-color: #F0F0F0;
            font-family:cursive;
            text-align: right;
            padding: 5px 0;
            font-size:20px;
            font-weight: bold;
            color: #FF0000;
        }
        </style>
        <div class="footer">
            CREATED BY: CHINMAY BHALERAO
        </div>
        """,
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()