import streamlit as st |
import os |
import json |
import fitz |
from io import BytesIO |
from PIL import Image |
import pandas as pd |
import zipfile |
import tempfile |
def _collect_text_elements(page, page_no: int, minimum_font_size: int) -> list:
    """Return one text element per distinct top coordinate found on *page*.

    Spans smaller than *minimum_font_size* are dropped. The remaining spans
    are grouped by the exact ``bbox[1]`` (top) value, so only spans sharing
    the very same top coordinate end up in the same output line.
    """
    spans_by_top = {}
    for block in page.get_text("dict")["blocks"]:
        # Only type-0 blocks are read for "lines" (text blocks in PyMuPDF).
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                spans_by_top.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(spans_by_top):
        spans = spans_by_top[top]
        elements.append({
            'type': 'text',
            # Size and x position of the first span stand in for the line.
            'font_size': spans[0]['size'],
            'page': page_no,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_no: int, output_folder: str) -> list:
    """Write every image listed for *page* into *output_folder* and describe it."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True), start=1):
        base_image = pdf_document.extract_image(img[0])
        # extract_image hands back the bytes in their original encoding, so
        # the file extension must follow the reported "ext" rather than
        # always being ".png". Fall back to "png" if it is missing.
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{page_no}_img_{img_index}.{extension}"
        )
        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])
        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_no,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0,
        })
    return elements


def _merge_elements(elements: list) -> list:
    """Order elements top-to-bottom, left-to-right and fuse adjacent text."""
    page_content = []
    for element in sorted(elements, key=lambda e: (e['top'], e['x0'])):
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                # Consecutive text lines collapse into one text entry.
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content'],
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path'],
            })
    return page_content


def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory that receives the extracted image files;
            created if it does not exist.
        minimum_font_size: Spans with a smaller font size are ignored.
        extraction_type: 'text', 'images' or 'both'. Any other value yields
            pages with empty content.

    Returns:
        A list with one entry per page, each shaped like
        ``{'page': <1-based number>, 'content': [...]}`` where the content
        items are ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}`` in reading order.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    want_text = extraction_type in ('text', 'both')
    want_images = extraction_type in ('images', 'both')

    extraction_data = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_index in range(pdf_document.page_count):
            page = pdf_document.load_page(page_index)
            page_no = page_index + 1
            elements = []
            if want_text:
                elements.extend(
                    _collect_text_elements(page, page_no, minimum_font_size)
                )
            if want_images:
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, page_no, output_folder
                    )
                )
            extraction_data.append({
                'page': page_no,
                'content': _merge_elements(elements),
            })
    finally:
        # Release the document even when a page fails to parse or an image
        # cannot be written.
        pdf_document.close()
    return extraction_data
def convert_to_xlsx(data: list) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Args:
        data: Per-page entries shaped like
            ``{'page': int, 'content': [{'type': 'text', 'content': str} |
            {'type': 'image', 'path': str}, ...]}``.

    Returns:
        An in-memory workbook (rewound to position 0) with a single sheet
        named 'Extraction' holding the columns 'Page' and 'Content'. Image
        items are rendered as ``[Image: <path>]``; items of any other type
        are skipped.
    """
    rows = []
    for item in data:
        page_number = item['page']
        for content in item['content']:
            if content['type'] == 'text':
                cell = content['content']
            elif content['type'] == 'image':
                cell = f"[Image: {content['path']}]"
            else:
                continue
            rows.append({'Page': page_number, 'Content': cell})

    # Pin the columns so the header row is still written when nothing was
    # extracted (an empty row list would otherwise give a header-less sheet).
    df = pd.DataFrame(rows, columns=['Page', 'Content'])

    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')
    output.seek(0)
    return output
def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Creates a ZIP file containing both images and JSON data.

    Args:
        output_folder: Not used. Image locations are taken from the 'path'
            of each image entry; the parameter is kept so existing callers
            keep working.
        extraction_data: Per-page entries shaped like
            ``{'page': int, 'content': [...]}``; every content item with
            ``'type' == 'image'`` must carry a readable file 'path'.

    Returns:
        A BytesIO (rewound to position 0) holding ``extraction_data.json``
        followed by each referenced image stored under its base name.

    Raises:
        OSError: If a referenced image file cannot be read.
    """
    json_name = "extraction_data.json"
    json_data = json.dumps(
        extraction_data, ensure_ascii=False, indent=4
    ).encode('utf-8')

    zip_buffer = BytesIO()
    added_paths = set()
    used_names = {json_name}
    with zipfile.ZipFile(zip_buffer, "w") as zip_file:
        zip_file.writestr(json_name, json_data)
        for item in extraction_data:
            for content in item['content']:
                if content['type'] != 'image':
                    continue
                image_path = content['path']
                # The same file referenced twice must not become two
                # identical archive members.
                if image_path in added_paths:
                    continue
                added_paths.add(image_path)

                # Different files that share a base name get a numeric
                # suffix instead of colliding on one archive name.
                base_name = os.path.basename(image_path)
                stem, extension = os.path.splitext(base_name)
                image_name = base_name
                suffix = 1
                while image_name in used_names:
                    image_name = f"{stem}_{suffix}{extension}"
                    suffix += 1
                used_names.add(image_name)

                zip_file.write(image_path, image_name)
    zip_buffer.seek(0)
    return zip_buffer
def main():
    """Render the Streamlit page.

    Flow: page header, PDF upload with a sidebar preview of the first pages,
    extraction options, then (on "Start Extraction") the extraction itself
    followed by JSON / XLSX / ZIP download buttons, and finally a fixed
    footer. Extraction works inside a temporary directory that is removed
    once the download payloads have been built in memory.
    """
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
    st.sidebar.markdown(
        """
        <div style="background-color: lightgray; padding: 2px; border-radius: 2px; text-align: center;">
        <h2 style="color: blue; margin: 0;">PDF PREVIEW</h2>
        </div>
        """, unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")
    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )
        # getvalue() does not depend on the buffer's read position (read()
        # does) and matches how the extraction step below reads the upload.
        pdf_document = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
        try:
            pages_to_show = min(num_pages_to_preview, pdf_document.page_count)
            for page_num in range(pages_to_show):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
        finally:
            # The preview document is only needed while rendering; close it
            # so it is not left open after every rerun.
            pdf_document.close()

    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:",
        ("text", "images", "both")
    )
    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:",
        min_value=1, value=2
    )
    # NOTE(review): this value is required below but never used; extraction
    # always writes into the temporary directory. Confirm whether the field
    # should be dropped or actually honoured.
    output_folder = st.text_input("Output folder path:")

    if st.button("Start Extraction"):
        if pdf_file is not None and output_folder:
            with tempfile.TemporaryDirectory() as temp_dir:
                # The upload name is client-supplied: keep only its final
                # component so it cannot point outside temp_dir.
                safe_name = os.path.basename(pdf_file.name) or "uploaded.pdf"
                temp_pdf_path = os.path.join(temp_dir, safe_name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path,
                    temp_dir,
                    minimum_font_size,
                    extraction_type
                )
                st.json(extraction_data)

                if extraction_type in ('images', 'both'):
                    # Must happen inside the with-block: the image files
                    # live in temp_dir and vanish when it is cleaned up.
                    zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
                    st.download_button(
                        label="Download ZIP",
                        data=zip_data,
                        file_name='extraction_data.zip',
                        mime='application/zip'
                    )

                xlsx_data = convert_to_xlsx(extraction_data)
                col1, col2 = st.columns(2)
                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json'
                    )
                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                    )
        else:
            st.error("Please upload a PDF file and provide an output folder path.")

    st.markdown(
        """
        <style>
        .footer {
        position: fixed;
        bottom: 0;
        left: 0;
        width: 100%;
        background-color: #F0F0F0;
        font-family:cursive;
        text-align: right;
        padding: 5px 0;
        font-size:20px;
        font-weight: bold;
        color: #FF0000;
        }
        </style>
        <div class="footer">
        </div>
        """,
        unsafe_allow_html=True
    )
# Start the Streamlit page only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()