"""Streamlit app that extracts text and images from an uploaded PDF."""

import io
import json
import os
from tempfile import NamedTemporaryFile

import fitz
import pandas as pd
import pdfplumber
import streamlit as st
from PIL import Image

# Folder offered by default in the UI for extracted images.
DEFAULT_OUTPUT_FOLDER = "Extracted_Data"


def _collect_text_lines(page, page_number: int, minimum_font_size) -> list[dict]:
    """Return one text element per visual line of ``page``."""
    # Spans are grouped by the top coordinate of their bounding box; spans
    # that share the exact same top value are treated as one line.
    lines = {}
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 marks a text block
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                # Skip spans smaller than the caller-supplied minimum.
                if span["size"] < minimum_font_size:
                    continue
                lines.setdefault(span["bbox"][1], []).append(span)

    # The first span of each line decides the line's font size and x offset.
    return [
        {
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_number + 1,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        }
        for top, spans in sorted(lines.items())
    ]


def _collect_images(pdf_document, page, page_number: int,
                    output_folder: str) -> list[dict]:
    """Save every image referenced by ``page`` and return image elements."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True), start=1):
        base_image = pdf_document.extract_image(img[0])
        if not base_image:
            # Defensive: nothing usable was returned for this xref.
            continue
        # Use the extension reported for the extracted bytes so that the
        # file name matches the actual image format (not always PNG).
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{page_number + 1}_img_{img_index}.{extension}"
        )
        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        # Position of the image on the page, used for ordering.
        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_number + 1,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0,
        })
    return elements


def _append_content(content: list, element: dict) -> None:
    """Append ``element`` to ``content``, merging consecutive text items."""
    if element['type'] == 'text':
        if content and content[-1]['type'] == 'text':
            content[-1]['content'] += " " + element['content']
        else:
            content.append({'type': 'text', 'content': element['content']})
    elif element['type'] == 'image':
        content.append({'type': 'image', 'path': element['path']})


def extract_text_images(
    pdf_path: str,
    output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] = None,
    tolerance: float = 0.01,
) -> list[dict]:
    """
    Extracts text and/or images from a PDF and organizes them either by
    headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Folder where extracted images are written. It is created when
        missing; an empty string means the current working directory.
    minimum_font_size: int
        Text spans with a font size below this value are ignored.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'. Any other
        value yields an empty result.
    header_font_sizes: list[float]
        Font sizes to be considered as headers. ``None`` is treated as an
        empty list, i.e. no line is recognised as a header.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    list[dict]
        In 'headerwise' mode, one ``{'header', 'content'}`` dict per
        header; in 'pagewise' mode, one ``{'page', 'content'}`` dict per
        page. ``content`` is a list of text and image items.
    """
    # os.makedirs("") raises, so only create the folder when one is named.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    header_sizes = list(header_font_sizes) if header_font_sizes else []

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        # Content collected before the first header is not flushed here;
        # it stays in the buffer and ends up under the first header found.
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_sizes
        )

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)

            elements = []
            if extract_text:
                elements.extend(
                    _collect_text_lines(page, page_number, minimum_font_size)
                )
            if extract_images:
                elements.extend(
                    _collect_images(
                        pdf_document, page, page_number, output_folder
                    )
                )

            # Sort elements by their vertical position (top) first,
            # and then by horizontal position (x0)
            elements.sort(key=lambda e: (e['top'], e['x0']))

            if mode == 'headerwise':
                for element in elements:
                    if element['type'] == 'text' and \
                            is_header_font_size(element['font_size']):
                        # A new header closes the section in progress.
                        add_current_header_content()
                        current_header = element['content']
                    else:
                        _append_content(current_header_content, element)
            elif mode == 'pagewise':
                page_content = []
                for element in elements:
                    _append_content(page_content, element)
                extraction_data.append({
                    'page': page_number + 1,
                    'content': page_content
                })

        # After the loop, finalize any remaining header content
        if mode == 'headerwise':
            add_current_header_content()
    finally:
        # Release the document even when a page fails to process.
        pdf_document.close()

    return extraction_data


def get_word_font_sizes(pdf_path, target_words):
    """
    Collects the font sizes with which the given words appear in a PDF.

    Params
    -------
    pdf_path: str
        Path to the PDF file.
    target_words: list[str]
        Words to look for; matching is exact after stripping whitespace.

    Returns
    -------
    dict
        Maps every target word to the list of font sizes of its
        occurrences (empty when the word was not found).
    """
    word_font_sizes = {word: [] for word in target_words}
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words(extra_attrs=['fontname', 'size'])
            for word in words:
                text = word['text'].strip()
                if text in word_font_sizes:
                    word_font_sizes[text].append(word['size'])
    return word_font_sizes


def preview_pdf(pdf_path, num_pages=1):
    """
    Renders the first pages of a PDF as PIL images.

    Params
    -------
    pdf_path: str
        Path to the PDF file.
    num_pages: int
        Maximum number of pages to render, starting from the first page.

    Returns
    -------
    list
        One RGB ``PIL.Image`` per rendered page.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        preview_images = []
        for page_number in range(min(num_pages, pdf_document.page_count)):
            pix = pdf_document.load_page(page_number).get_pixmap()
            preview_images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
        return preview_images
    finally:
        pdf_document.close()


def extract_to_dataframe(data):
    """
    Flattens extraction results into a DataFrame with one row per content
    item, labelled by 'Header' or 'Page' depending on the extraction mode.
    """
    rows = []
    for item in data:
        if 'header' in item:
            label, value = 'Header', item['header']
        elif 'page' in item:
            label, value = 'Page', item['page']
        else:
            continue
        for content_item in item['content']:
            if content_item['type'] == 'text':
                cell = content_item['content']
            elif content_item['type'] == 'image':
                cell = f"Image: {content_item['path']}"
            else:
                continue
            rows.append({label: value, 'Content': cell})
    return pd.DataFrame(rows)


def _parse_float_list(text: str) -> list[float]:
    """Parse comma-separated numbers, ignoring blank entries.

    Raises ValueError when an entry is not a number.
    """
    stripped_parts = (part.strip() for part in text.split(","))
    return [float(part) for part in stripped_parts if part]


def _run_app(pdf_path: str) -> None:
    """Render the preview, the sidebar settings and the extraction results."""
    # Collapsible PDF Preview
    with st.expander("PDF Preview", expanded=True):
        num_pages = st.slider("Number of pages to preview",
                              min_value=1, max_value=5, value=1)
        # enumerate gives the caption number directly; looking the image up
        # with list.index() can mislabel pages that compare equal.
        for page_index, img in enumerate(preview_pdf(pdf_path, num_pages),
                                         start=1):
            st.image(img, caption=f"Page {page_index}",
                     use_column_width=True)

    st.sidebar.title("DATA EXTRACTION SETTINGS")
    st.sidebar.write("How you want to extract data?")
    extraction_mode = st.sidebar.radio("Extraction Mode",
                                       ["headerwise", "pagewise"])

    # Font Size Detection
    st.sidebar.title("FONT SIZE DETECTION")
    st.sidebar.warning(
        "[Only in case of headerwise extraction] if you dont know the "
        "font size for your headers or text then copy paste any of those "
        "words below"
    )
    target_words_input = st.sidebar.text_input(
        "Target words (comma-separated)", "")
    # Drop blank entries so an empty input does not search for "".
    target_words = [
        word.strip() for word in target_words_input.split(",")
        if word.strip()
    ]
    if st.sidebar.button("Get Font Sizes"):
        word_font_sizes = get_word_font_sizes(pdf_path, target_words)
        for word, sizes in word_font_sizes.items():
            st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

    header_sizes_input = st.sidebar.text_input(
        "Header Font Sizes (comma-separated)", "0")
    try:
        header_font_sizes = _parse_float_list(header_sizes_input)
    except ValueError:
        # Report bad input instead of crashing the whole app run.
        st.sidebar.error(
            "Header font sizes must be comma-separated numbers.")
        header_font_sizes = None

    st.sidebar.title("OUTPUT FOLDER PATH")
    # The previous default expression evaluated to an empty string, which
    # made os.makedirs() fail; fall back to a real folder name instead.
    output_folder = st.sidebar.text_input(
        " ", value=DEFAULT_OUTPUT_FOLDER).strip() or DEFAULT_OUTPUT_FOLDER

    st.sidebar.info("what do you want to include in data extraction?")
    extract_text = st.sidebar.checkbox("Extract Text", value=True)
    extract_images = st.sidebar.checkbox("Extract Images", value=True)
    minimum_font_size = st.sidebar.number_input("Minimum Font Size",
                                                min_value=1, value=10)

    if not st.sidebar.button("Start Extraction"):
        return

    if extraction_mode == "headerwise" and header_font_sizes is None:
        st.error("Fix the header font sizes before starting a headerwise "
                 "extraction.")
        return

    os.makedirs(output_folder, exist_ok=True)

    extracted_data = extract_text_images(
        pdf_path,
        output_folder,
        minimum_font_size=minimum_font_size,
        extract_text=extract_text,
        extract_images=extract_images,
        mode=extraction_mode,
        header_font_sizes=header_font_sizes
    )

    # Display extracted data as JSON
    st.json(extracted_data)

    # Serialise the tabular view to XLSX in memory
    df = extract_to_dataframe(extracted_data)
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extracted Data')
    # Work from an immutable bytes copy so the preview read below cannot
    # affect what the download button serves.
    xlsx_bytes = buffer.getvalue()

    # Preview the first 5 lines of the XLSX data
    st.subheader("Preview of Extracted Data (First 5 Lines)")
    preview_df = pd.read_excel(io.BytesIO(xlsx_bytes),
                               sheet_name='Extracted Data')
    st.dataframe(preview_df.head())

    # Provide download options
    st.download_button(
        label="Download JSON",
        data=json.dumps(extracted_data, ensure_ascii=False),
        file_name='extracted_data.json',
        mime='application/json'
    )
    st.download_button(
        label="Download XLSX",
        data=xlsx_bytes,
        file_name='extracted_data.xlsx',
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )

    st.success("Extraction complete. Data displayed as JSON.")


# Streamlit UI
def main():
    """Entry point of the Streamlit app."""
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    # NOTE(review): these literals hold only the visible text surrounded by
    # newlines; any HTML markup meant for unsafe_allow_html appears to be
    # missing from the source and is not reconstructed here - confirm.
    st.markdown("\n\nPDF DATA SNACHER\n\n", unsafe_allow_html=True)
    st.markdown(
        "\n\nExtract valuable text and images from PDFs effortlessly and "
        "Convert PDFs into editable text and high-quality images\n\n",
        unsafe_allow_html=True
    )
    st.markdown(
        "\nStep 1: Upload pdf\n",
        unsafe_allow_html=True
    )
    st.markdown(
        "\nStep 2: Fill the values at right in data extraction settings\n",
        unsafe_allow_html=True
    )
    st.markdown(
        "\nStep 3: Download the data in desired format\n",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if not uploaded_pdf:
        return

    # Save the uploaded PDF to a temporary file. getvalue() returns the
    # whole upload regardless of the current read position of the stream.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_pdf.getvalue())
        temp_pdf_path = temp_pdf.name

    try:
        _run_app(temp_pdf_path)
    finally:
        # A new temporary copy is written on every script run, so remove
        # this one to avoid piling up files in the temp directory.
        try:
            os.remove(temp_pdf_path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not break the UI.
            pass


if __name__ == "__main__":
    main()