updated app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ import fitz
|
|
5 |
from io import BytesIO
|
6 |
from PIL import Image
|
7 |
import pandas as pd
|
|
|
8 |
import tempfile
|
9 |
|
10 |
def extract_text_images(
|
@@ -14,28 +15,11 @@ def extract_text_images(
|
|
14 |
) -> dict:
|
15 |
"""
|
16 |
Extracts text and/or images from a PDF and organizes them by pages.
|
17 |
-
|
18 |
-
Params
|
19 |
-
-------
|
20 |
-
pdf_path: str
|
21 |
-
Path to the input PDF file.
|
22 |
-
output_folder: str
|
23 |
-
Path to the output folder where extracted data will be saved.
|
24 |
-
minimum_font_size: int
|
25 |
-
Minimum font size below which the text will be ignored.
|
26 |
-
extraction_type: str
|
27 |
-
Type of extraction, either 'text', 'images', or 'both'.
|
28 |
-
|
29 |
-
Returns
|
30 |
-
-------
|
31 |
-
dict
|
32 |
-
The extracted data organized by pages.
|
33 |
"""
|
34 |
if not os.path.exists(output_folder):
|
35 |
os.makedirs(output_folder)
|
36 |
|
37 |
extraction_data = []
|
38 |
-
|
39 |
pdf_document = fitz.open(pdf_path)
|
40 |
|
41 |
for page_number in range(pdf_document.page_count):
|
@@ -125,6 +109,9 @@ def extract_text_images(
|
|
125 |
return extraction_data
|
126 |
|
127 |
def convert_to_xlsx(data: dict) -> BytesIO:
|
|
|
|
|
|
|
128 |
rows = []
|
129 |
|
130 |
for item in data:
|
@@ -152,12 +139,33 @@ def convert_to_xlsx(data: dict) -> BytesIO:
|
|
152 |
output.seek(0)
|
153 |
return output
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
def main():
|
156 |
st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
|
157 |
st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
|
158 |
|
159 |
st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
|
160 |
-
|
161 |
pdf_file = st.file_uploader("Upload PDF", type="pdf")
|
162 |
|
163 |
if pdf_file is not None:
|
@@ -185,42 +193,52 @@ def main():
|
|
185 |
min_value=1, value=2
|
186 |
)
|
187 |
|
|
|
|
|
188 |
if st.button("Start Extraction"):
|
189 |
-
if pdf_file is not None:
|
190 |
-
with tempfile.TemporaryDirectory() as
|
191 |
-
temp_pdf_path = os.path.join(
|
192 |
with open(temp_pdf_path, "wb") as f:
|
193 |
f.write(pdf_file.getvalue())
|
194 |
|
195 |
extraction_data = extract_text_images(
|
196 |
temp_pdf_path,
|
197 |
-
|
198 |
minimum_font_size,
|
199 |
extraction_type
|
200 |
)
|
201 |
|
202 |
st.json(extraction_data)
|
203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
xlsx_data = convert_to_xlsx(extraction_data)
|
205 |
|
206 |
col1, col2 = st.columns(2)
|
207 |
-
|
208 |
with col1:
|
209 |
st.download_button(
|
210 |
label="Download JSON",
|
211 |
data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
|
212 |
file_name='extraction_data.json',
|
213 |
-
mime='application/json'
|
214 |
-
|
215 |
with col2:
|
216 |
st.download_button(
|
217 |
label="Download XLSX",
|
218 |
data=xlsx_data,
|
219 |
file_name='extraction_data.xlsx',
|
220 |
-
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
221 |
-
|
222 |
else:
|
223 |
-
st.error("Please upload a PDF file.")
|
224 |
|
225 |
st.markdown(
|
226 |
"""
|
|
|
5 |
from io import BytesIO
|
6 |
from PIL import Image
|
7 |
import pandas as pd
|
8 |
+
import zipfile
|
9 |
import tempfile
|
10 |
|
11 |
def extract_text_images(
|
|
|
15 |
) -> dict:
|
16 |
"""
|
17 |
Extracts text and/or images from a PDF and organizes them by pages.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
"""
|
19 |
if not os.path.exists(output_folder):
|
20 |
os.makedirs(output_folder)
|
21 |
|
22 |
extraction_data = []
|
|
|
23 |
pdf_document = fitz.open(pdf_path)
|
24 |
|
25 |
for page_number in range(pdf_document.page_count):
|
|
|
109 |
return extraction_data
|
110 |
|
111 |
def convert_to_xlsx(data: dict) -> BytesIO:
|
112 |
+
"""
|
113 |
+
Converts the extracted data to an XLSX file.
|
114 |
+
"""
|
115 |
rows = []
|
116 |
|
117 |
for item in data:
|
|
|
139 |
output.seek(0)
|
140 |
return output
|
141 |
|
142 |
+
def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Creates a ZIP file containing both images and JSON data.

    Params
    -------
    output_folder: str
        Folder the images were extracted into. Image paths that do not
        resolve as given are looked up inside this folder.
    extraction_data: list
        Extracted entries. Each entry is a dict whose optional 'content'
        list holds dicts; those with type 'image' carry the image file
        location under 'path'.

    Returns
    -------
    BytesIO
        In-memory ZIP archive, rewound to position 0. It holds
        'extraction_data.json' plus one entry per distinct image file.

    Raises
    ------
    FileNotFoundError
        If a referenced image cannot be found as given or inside
        output_folder.
    """
    zip_buffer = BytesIO()
    # Deflate the archive: the JSON payload is plain text and shrinks a lot.
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        # Add JSON file
        json_data = json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8')
        zip_file.writestr("extraction_data.json", json_data)

        # Archive names already taken; the JSON name is reserved so an image
        # called 'extraction_data.json' cannot shadow it.
        used_names = {"extraction_data.json"}
        # Source files already stored, so a repeated reference is written once.
        added_sources = set()

        # Add images
        for item in extraction_data:
            # Entries without a 'content' list simply contribute no images.
            for content in item.get('content') or []:
                if content.get('type') != 'image':
                    continue
                image_path = content.get('path')
                if not image_path:
                    continue

                source = _resolve_image_path(image_path, output_folder)
                source_key = os.path.abspath(source)
                if source_key in added_sources:
                    continue

                image_name = _unique_arcname(os.path.basename(source), used_names)
                # Raises FileNotFoundError when the image is genuinely missing.
                zip_file.write(source, image_name)
                used_names.add(image_name)
                added_sources.add(source_key)

    zip_buffer.seek(0)
    return zip_buffer


def _resolve_image_path(image_path, output_folder):
    """Return image_path as given if it is a file, else its match inside output_folder."""
    if os.path.isfile(image_path) or not output_folder:
        return image_path
    candidates = (
        os.path.join(output_folder, image_path),
        os.path.join(output_folder, os.path.basename(image_path)),
    )
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate
    # Nothing matched: hand back the original so the caller reports that path.
    return image_path


def _unique_arcname(name, used_names):
    """Return name, or name with a numeric suffix, so that it is not in used_names."""
    if name not in used_names:
        return name
    stem, ext = os.path.splitext(name)
    counter = 1
    while "{}_{}{}".format(stem, counter, ext) in used_names:
        counter += 1
    return "{}_{}{}".format(stem, counter, ext)
|
162 |
+
|
163 |
def main():
|
164 |
st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
|
165 |
st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
|
166 |
|
167 |
st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
|
168 |
+
|
169 |
pdf_file = st.file_uploader("Upload PDF", type="pdf")
|
170 |
|
171 |
if pdf_file is not None:
|
|
|
193 |
min_value=1, value=2
|
194 |
)
|
195 |
|
196 |
+
output_folder = st.text_input("Output folder path:")
|
197 |
+
|
198 |
if st.button("Start Extraction"):
|
199 |
+
if pdf_file is not None and output_folder:
|
200 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
201 |
+
temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
|
202 |
with open(temp_pdf_path, "wb") as f:
|
203 |
f.write(pdf_file.getvalue())
|
204 |
|
205 |
extraction_data = extract_text_images(
|
206 |
temp_pdf_path,
|
207 |
+
temp_dir,
|
208 |
minimum_font_size,
|
209 |
extraction_type
|
210 |
)
|
211 |
|
212 |
st.json(extraction_data)
|
213 |
|
214 |
+
if extraction_type == 'images' or extraction_type == 'both':
|
215 |
+
zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
|
216 |
+
st.download_button(
|
217 |
+
label="Download ZIP",
|
218 |
+
data=zip_data,
|
219 |
+
file_name='extraction_data.zip',
|
220 |
+
mime='application/zip'
|
221 |
+
)
|
222 |
+
|
223 |
xlsx_data = convert_to_xlsx(extraction_data)
|
224 |
|
225 |
col1, col2 = st.columns(2)
|
|
|
226 |
with col1:
|
227 |
st.download_button(
|
228 |
label="Download JSON",
|
229 |
data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
|
230 |
file_name='extraction_data.json',
|
231 |
+
mime='application/json'
|
232 |
+
)
|
233 |
with col2:
|
234 |
st.download_button(
|
235 |
label="Download XLSX",
|
236 |
data=xlsx_data,
|
237 |
file_name='extraction_data.xlsx',
|
238 |
+
mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
239 |
+
)
|
240 |
else:
|
241 |
+
st.error("Please upload a PDF file and provide an output folder path.")
|
242 |
|
243 |
st.markdown(
|
244 |
"""
|