ChinmayBH committed on
Commit
4516170
1 Parent(s): a6cd894

updated app.py

Browse files
Files changed (1) hide show
  1. app.py +250 -0
app.py CHANGED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import fitz
5
+ from io import BytesIO
6
+ from PIL import Image
7
+ import pandas as pd
8
+ import tempfile
9
+
10
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
        Created if it does not exist.
    minimum_font_size: int
        Minimum font size below which the text will be ignored.
    extraction_type: str
        Type of extraction, either 'text', 'images', or 'both'. Any other
        value yields pages with empty content.

    Returns
    -------
    list
        One dict per page, in page order, shaped as
        ``{'page': <1-based page number>, 'content': [...]}``. Each content
        entry is either ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}``, ordered top-to-bottom then
        left-to-right; consecutive text entries are merged into one.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            display_page = page_number + 1
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, display_page, minimum_font_size)
                )

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, display_page, output_folder
                    )
                )

            # Reading order: by vertical position first, then horizontal.
            elements.sort(key=lambda e: (e['top'], e['x0']))

            extraction_data.append({
                'page': display_page,
                'content': _merge_page_content(elements)
            })
    finally:
        # Release the document even when extraction fails part-way through.
        pdf_document.close()

    return extraction_data


def _collect_text_elements(page, display_page: int, minimum_font_size: int) -> list:
    """Group the page's text spans by their top coordinate into line elements."""
    lines = {}

    for block in page.get_text("dict")["blocks"]:
        # Only type-0 blocks are processed; they are the ones carrying "lines".
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                # Spans sharing the exact same top coordinate form one line.
                lines.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(lines):
        spans = lines[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': display_page,
            'content': " ".join([span['text'] for span in spans]),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, display_page: int, output_folder: str) -> list:
    """Write each image referenced by the page to disk and describe its position."""
    elements = []

    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        base_image = pdf_document.extract_image(xref)
        # The extracted bytes are in the image's embedded format, so the file
        # suffix must follow the reported extension rather than assume PNG.
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{display_page}_img_{img_index + 1}.{extension}"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': display_page,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Reduce sorted elements to output entries, joining adjacent text runs."""
    page_content = []

    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                # Text uninterrupted by an image is folded into one entry.
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content']
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path']
            })
    return page_content
126
+
127
def convert_to_xlsx(data: list) -> BytesIO:
    """
    Serializes page-wise extraction data into an in-memory XLSX workbook.

    Params
    -------
    data: list
        Page records shaped as ``{'page': int, 'content': [...]}``, where each
        content entry is ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}``. Entries of any other type are
        skipped.

    Returns
    -------
    BytesIO
        Buffer positioned at offset 0 holding a workbook with one sheet named
        'Extraction' and the columns 'Page' and 'Content'. Image entries are
        written as ``[Image: <path>]``.
    """
    rows = []

    for item in data:
        page_number = item['page']

        for content in item['content']:
            if content['type'] == 'text':
                cell = content['content']
            elif content['type'] == 'image':
                cell = f"[Image: {content['path']}]"
            else:
                continue
            rows.append({'Page': page_number, 'Content': cell})

    # Fixing the columns keeps the header row present even when nothing was
    # extracted (an empty row list would otherwise produce a blank sheet).
    df = pd.DataFrame(rows, columns=['Page', 'Content'])

    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')

    # Rewind so callers can read the workbook from the start.
    output.seek(0)
    return output
154
+
155
def main():
    """
    Renders the Streamlit page: PDF upload, sidebar page previews, extraction
    options, and JSON/XLSX download buttons for the extraction result.
    """
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # getvalue() returns the whole upload regardless of the current
        # stream position, unlike read(), which yields nothing once the
        # buffer has already been consumed.
        pdf_document = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
        try:
            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
        finally:
            # The preview document is not needed after rendering; close it
            # instead of leaving it open.
            pdf_document.close()

    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:",
        ("text", "images", "both")
    )

    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:",
        min_value=1, value=2
    )

    if st.button("Start Extraction"):
        if pdf_file is not None:
            with tempfile.TemporaryDirectory() as output_folder:
                # The upload's name comes from the client; keep only its final
                # component so it cannot point outside the temp directory.
                safe_name = os.path.basename(pdf_file.name) or "uploaded.pdf"
                temp_pdf_path = os.path.join(output_folder, safe_name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path,
                    output_folder,
                    minimum_font_size,
                    extraction_type
                )

                st.json(extraction_data)

                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)

                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json')

                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        else:
            st.error("Please upload a PDF file.")

    # Fixed footer banner pinned to the bottom of the page.
    st.markdown(
        """
<style>
.footer {
position: fixed;
bottom: 0;
left: 0;
width: 100%;
background-color: #F0F0F0;
font-family:cursive;
text-align: right;
padding: 5px 0;
font-size:20px;
font-weight: bold;
color: #FF0000;
}
</style>
<div class="footer">
CREATED BY: CHINMAY BHALERAO
</div>
""",
        unsafe_allow_html=True
    )
248
+
249
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()