ChinmayBH committed on
Commit
8146073
·
verified ·
1 Parent(s): 503c3e1

Added pagewise only part to code

Browse files

separated two things, page wise and header wise

Files changed (1) hide show
  1. app.py +309 -376
app.py CHANGED
@@ -1,376 +1,309 @@
1
- import os
2
- import json
3
- import fitz
4
- import pdfplumber
5
- import pandas as pd
6
- import streamlit as st
7
- from tempfile import NamedTemporaryFile
8
- from PIL import Image
9
- import io
10
-
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] = None,
    tolerance: float = 0.01,
) -> dict:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted data will be saved.
    minimum_font_size: int
        Spans with a font size smaller than this are skipped entirely.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'.
    header_font_sizes: list[float]
        List of font sizes to be considered as headers.
        NOTE(review): only read in 'headerwise' mode; must not be None there.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    dict
        Dictionary containing extracted text and/or image data.
        NOTE(review): despite the annotation, this actually returns a *list*
        of dicts (one per header or per page).
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_font_sizes
        )

    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        elements = []

        if extract_text:
            # Extract text blocks with their positions and font sizes
            text_blocks = page.get_text("dict")["blocks"]
            lines = {}

            # Group text spans by their vertical position (top) to form lines.
            # NOTE(review): grouping uses *exact* float equality of the top
            # coordinate; spans that differ by a fraction of a point land in
            # separate lines.
            for block in text_blocks:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span["size"]
                            top = span["bbox"][1]

                            # Skip spans smaller than the configured minimum font size
                            if font_size < minimum_font_size:
                                continue

                            if top not in lines:
                                lines[top] = []
                            lines[top].append(span)

            # Process each line; the first span's size stands in for the
            # whole line when deciding whether it is a header.
            for top in sorted(lines.keys()):
                line = lines[top]
                line_text = " ".join([span['text'] for span in line])
                line_font_size = line[0]['size']

                elements.append({
                    'type': 'text',
                    'font_size': line_font_size,
                    'page': page_number + 1,
                    'content': line_text,
                    'x0': line[0]['bbox'][0],
                    'top': top
                })

        if extract_images:
            # Extract images using PyMuPDF
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_number + 1}_img_{img_index + 1}.png"
                )

                with open(image_filename, "wb") as img_file:
                    img_file.write(image_bytes)

                # Get the position of the image so it can be interleaved with
                # the text in reading order.
                img_rect = page.get_image_bbox(img)
                elements.append({
                    'type': 'image',
                    'page': page_number + 1,
                    'path': image_filename,
                    'x0': img_rect.x0,
                    'top': img_rect.y0
                })

        # Sort elements by their vertical position (top) first,
        # and then by horizontal position (x0)
        elements.sort(key=lambda e: (e['top'], e['x0']))

        if mode == 'headerwise':
            # Process elements to extract headers and content
            for element in elements:
                if element['type'] == 'text' and \
                        is_header_font_size(element['font_size']):
                    # If a new header is found,
                    # finalize the current header content
                    add_current_header_content()
                    current_header = element['content']
                elif element['type'] == 'text':
                    # Merge consecutive text elements into one running block.
                    if current_header_content and \
                            current_header_content[-1]['type'] == 'text':
                        current_header_content[-1]['content'] \
                            += " " + element['content']
                    else:
                        current_header_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    current_header_content.append({
                        'type': 'image',
                        'path': element['path']
                    })

        elif mode == 'pagewise':
            page_content = []
            for element in elements:
                if element['type'] == 'text':
                    if page_content and \
                            page_content[-1]['type'] == 'text':
                        page_content[-1]['content'] \
                            += " " + element['content']
                    else:
                        page_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    page_content.append({
                        'type': 'image',
                        'path': element['path']
                    })
            extraction_data.append({
                'page': page_number + 1,
                'content': page_content
            })

    # After the loop, finalize any remaining header content
    if mode == 'headerwise':
        add_current_header_content()

    pdf_document.close()

    return extraction_data
201
-
202
def get_word_font_sizes(pdf_path, target_words):
    """Collect every font size at which each target word appears in the PDF.

    Returns a dict mapping each word in ``target_words`` to the list of
    sizes observed for exact (whitespace-stripped) occurrences of that word.
    """
    sizes_by_word = {word: [] for word in target_words}

    with pdfplumber.open(pdf_path) as pdf:
        for pdf_page in pdf.pages:
            tokens = pdf_page.extract_words(extra_attrs=['fontname', 'size'])
            for token in tokens:
                stripped = token['text'].strip()
                if stripped in target_words:
                    sizes_by_word[stripped].append(token['size'])

    return sizes_by_word
213
-
214
def preview_pdf(pdf_path, num_pages=1):
    """Render the first ``num_pages`` pages of a PDF as PIL images.

    The page count is clamped to the document length, so asking for more
    pages than exist is safe.
    """
    document = fitz.open(pdf_path)
    pages_to_render = min(num_pages, document.page_count)

    rendered = []
    for index in range(pages_to_render):
        pixmap = document.load_page(index).get_pixmap()
        rendered.append(
            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        )

    document.close()
    return rendered
226
-
227
- # Streamlit UI
228
-
229
- import io
230
-
231
def main():
    """Streamlit entry point: upload a PDF, configure extraction settings in
    the sidebar, run header-wise or page-wise extraction, and offer the
    results as JSON / XLSX downloads."""
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload pdf </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill the values at right in data extraction settings </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in desired format </h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf:
        # Save the uploaded PDF to a temporary file so path-based libraries
        # (fitz, pdfplumber) can open it.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
            temp_pdf_path = temp_pdf.name

        # Collapsible PDF Preview
        with st.expander("PDF Preview", expanded=True):
            num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
            preview_images = preview_pdf(temp_pdf_path, num_pages)

            for img in preview_images:
                # NOTE(review): preview_images.index(img) is O(n) per image and
                # wrong for duplicate images; enumerate() would be correct.
                st.image(img, caption=f"Page {preview_images.index(img) + 1}", use_column_width=True)

        st.sidebar.title("DATA EXTRACTION SETTINGS")
        st.sidebar.write("How you want to extract data?")

        extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])
        # Font Size Detection: lets the user discover which font sizes their
        # header words use, to feed into the header-size field below.
        st.sidebar.title("FONT SIZE DETECTION")
        st.sidebar.warning("[Only in case of headerwise extraction] if you dont know the font size for your headers or text then copy paste any of those words below")
        target_words_input = st.sidebar.text_input(
            "Target words (comma-separated)", "")
        target_words = [word.strip() for word in target_words_input.split(",")]

        if st.sidebar.button("Get Font Sizes"):
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

        # st.sidebar.warning("Fill below required details")
        header_font_sizes = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
        # st.sidebar.info("Header sizes are only required in case of headerwise extraction")
        header_font_sizes = [float(size.strip()) for size in header_font_sizes.split(",")]
        st.sidebar.title("OUTPUT FOLDER PATH")
        # NOTE(review): os.path.dirname("Extracted_Data") returns "" — the
        # default shown to the user is empty; this likely meant
        # os.path.join(os.getcwd(), "Extracted_Data").
        output_folder = st.sidebar.text_input(" ", value=os.path.join(os.path.dirname ("Extracted_Data")))
        st.sidebar.info("what do you want to include in data extraction?")
        extract_text = st.sidebar.checkbox("Extract Text", value=True)
        extract_images = st.sidebar.checkbox("Extract Images", value=True)

        minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

        if st.sidebar.button("Start Extraction"):
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)

            extracted_data = extract_text_images(
                temp_pdf_path,
                output_folder,
                minimum_font_size=minimum_font_size,
                extract_text=extract_text,
                extract_images=extract_images,
                mode=extraction_mode,
                header_font_sizes=header_font_sizes
            )

            # Display extracted data as JSON
            st.json(extracted_data)

            # Convert extracted data to a pandas DataFrame.
            # Rows carry either a 'Header' or a 'Page' column depending on the
            # extraction mode that produced the data.
            def extract_to_dataframe(data):
                rows = []
                for item in data:
                    if 'header' in item:
                        header = item['header']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Header': header, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Header': header, 'Content': f"Image: {content_item['path']}"})
                    elif 'page' in item:
                        page_num = item['page']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Page': page_num, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Page': page_num, 'Content': f"Image: {content_item['path']}"})
                return pd.DataFrame(rows)

            df = extract_to_dataframe(extracted_data)

            # Save DataFrame to an in-memory BytesIO buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Extracted Data')
            buffer.seek(0)

            # Preview the first 5 lines of the XLSX data
            st.subheader("Preview of Extracted Data (First 5 Lines)")
            preview_df = pd.read_excel(buffer, sheet_name='Extracted Data')
            st.dataframe(preview_df.head())

            # Provide download options
            st.download_button(
                label="Download JSON",
                data=json.dumps(extracted_data, ensure_ascii=False),
                file_name='extracted_data.json',
                mime='application/json'
            )

            st.download_button(
                label="Download XLSX",
                data=buffer,
                file_name='extracted_data.xlsx',
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )

            st.success("Extraction complete. Data displayed as JSON.")

if __name__ == "__main__":
    main()
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import fitz
5
+ from io import BytesIO
6
+ from PIL import Image
7
+ import pandas as pd
8
+
9
+
10
+
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extraction_type: str = 'both'
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
    minimum_font_size: int
        Minimum font size below which the text will be ignored.
    extraction_type: str
        Type of extraction, either 'text', 'images', or 'both'.

    Returns
    -------
    list
        One dict per page: {'page': <1-based number>, 'content': [...]},
        where each content item is {'type': 'text', 'content': str} or
        {'type': 'image', 'path': str}.  (The original annotation said
        ``dict``, but a list has always been returned.)

    Raises
    -------
    ValueError
        If ``extraction_type`` is not one of 'text', 'images', or 'both'.
    """
    # Fail fast on a bad mode instead of silently returning empty pages.
    if extraction_type not in ('text', 'images', 'both'):
        raise ValueError(f"Invalid extraction_type: {extraction_type!r}")

    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_number, minimum_font_size))

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(pdf_document, page, page_number,
                                            output_folder))

            # Reading order: top-to-bottom, then left-to-right within a line.
            elements.sort(key=lambda e: (e['top'], e['x0']))

            extraction_data.append({
                'page': page_number + 1,
                'content': _merge_page_content(elements)
            })
    finally:
        # Close the document even if a page fails mid-extraction
        # (the original leaked the handle on error).
        pdf_document.close()

    return extraction_data


def _collect_text_elements(page, page_number: int, minimum_font_size: int) -> list:
    """Gather text lines from *page*, grouping spans that share the same
    vertical (top) coordinate into a single line element."""
    lines = {}
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 == text block in PyMuPDF's dict output
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                # Ignore text smaller than the configured minimum font size.
                if span["size"] < minimum_font_size:
                    continue
                lines.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(lines):
        spans = lines[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_number + 1,
            'content': " ".join(s['text'] for s in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_number: int,
                            output_folder: str) -> list:
    """Save each image on *page* into *output_folder* and record its position."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        image_bytes = pdf_document.extract_image(xref)["image"]
        image_filename = os.path.join(
            output_folder,
            f"page_{page_number + 1}_img_{img_index + 1}.png"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(image_bytes)

        # get_image_bbox can raise for images that are referenced by the page
        # but not actually displayed on it; fall back to the top-left corner
        # so a single odd image doesn't abort the whole extraction.
        try:
            img_rect = page.get_image_bbox(img)
            x0, top = img_rect.x0, img_rect.y0
        except ValueError:
            x0, top = 0.0, 0.0

        elements.append({
            'type': 'image',
            'page': page_number + 1,
            'path': image_filename,
            'x0': x0,
            'top': top
        })
    return elements


def _merge_page_content(elements: list) -> list:
    """Flatten position-sorted elements into page content, merging adjacent
    text elements into a single running text block."""
    page_content = []
    for element in elements:
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content']
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path']
            })
    return page_content
135
+
136
def convert_to_xlsx(data: dict) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Params
    -------
    data: dict
        The extracted data organized by pages.

    Returns
    -------
    BytesIO
        The XLSX file in memory, positioned at the start of the stream.
    """
    # Flatten the per-page structure into one row per content item.
    records = []
    for page_entry in data:
        for piece in page_entry['content']:
            if piece['type'] == 'text':
                records.append({
                    'Page': page_entry['page'],
                    'Content': piece['content']
                })
            elif piece['type'] == 'image':
                records.append({
                    'Page': page_entry['page'],
                    'Content': f"[Image: {piece['path']}]"
                })

    workbook_buffer = BytesIO()
    with pd.ExcelWriter(workbook_buffer, engine='xlsxwriter') as writer:
        pd.DataFrame(records).to_excel(writer, index=False, sheet_name='Extraction')

    # Rewind so callers can read the workbook from the beginning.
    workbook_buffer.seek(0)
    return workbook_buffer
176
+
177
+
178
def main():
    """Streamlit entry point: upload a PDF, preview pages in the sidebar,
    run pagewise extraction, and offer the results as JSON / XLSX downloads."""
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNATCHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    # Sidebar styling for the preview header
    st.markdown(
        """
        <style>
        .sidebar-header {
            text-align: center;
            color: blue;
            padding: 5px 0;
            font-size:30px;
            font-weight: bold;

        }
        </style>
        """,
        unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
    # File uploader
    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        # Slider to select number of pages to preview
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # Render the first N pages in the sidebar.  .read() consumes the
        # upload stream, but the save below uses .getvalue(), which is
        # independent of the stream position.
        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)

        # Extraction type selector
        st.info("You can select **only text** or **only images** or **text and images both** to extract from pdf")
        extraction_type = st.selectbox(
            "Choose extraction type:",
            ("text", "images", "both")
        )

        # Minimum font size input
        st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
        minimum_font_size = st.number_input(
            "Minimum font size to extract:",
            min_value=1, value=2
        )

        # Output folder path input
        output_folder = st.text_input(
            "Output folder path:",
            os.path.join(os.getcwd(), "Extracted_Data")
        )

        if st.button("Start Extraction"):
            if pdf_file is not None:
                # BUGFIX: the folder must exist before the uploaded PDF can be
                # written into it; previously only extract_text_images created
                # it, which was too late and crashed the open() below on a
                # fresh run.
                os.makedirs(output_folder, exist_ok=True)

                # Save uploaded PDF to a temporary location
                temp_pdf_path = os.path.join(output_folder, pdf_file.name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                # Call the extraction function
                extraction_data = extract_text_images(
                    temp_pdf_path,
                    output_folder,
                    minimum_font_size,
                    extraction_type
                )

                # Display extracted JSON data
                st.json(extraction_data)

                # Convert data to XLSX (removed the dead pd.read_excel preview:
                # it did unused work and advanced the buffer position).
                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)

                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json')

                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

            else:
                st.error("Please upload a PDF file.")

    # Footer (Fixed Position)
    st.markdown(
        """
        <style>
        .footer {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            background-color: #F0F0F0;
            font-family:cursive;
            text-align: right;
            padding: 5px 0;
            font-size:20px;
            font-weight: bold;
            color: #FF0000;
        }
        </style>
        <div class="footer">
            CREATED BY: CHINMAY BHALERAO
        </div>
        """,
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()