import streamlit as st
import os
import json
import fitz
from io import BytesIO
from PIL import Image
import pandas as pd
import zipfile
import tempfile

def _collect_text_elements(page, page_label: int, minimum_font_size: int) -> list:
    """Return one positioned text element per distinct span top on *page*."""
    spans_by_top = {}

    for block in page.get_text("dict")["blocks"]:
        # Only type-0 blocks are read for "lines"/"spans" (text blocks in
        # PyMuPDF's dict output); everything else is ignored here.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] < minimum_font_size:
                    continue
                # Spans sharing the exact same top coordinate form one line.
                spans_by_top.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(spans_by_top):
        spans = spans_by_top[top]
        elements.append({
            'type': 'text',
            'font_size': spans[0]['size'],
            'page': page_label,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top,
        })
    return elements


def _collect_image_elements(pdf_document, page, page_label: int, output_folder: str) -> list:
    """Write every image referenced by *page* to disk and return positioned image elements."""
    elements = []

    for img_index, img in enumerate(page.get_images(full=True), start=1):
        base_image = pdf_document.extract_image(img[0])
        # extract_image returns the bytes in their stored format, so name the
        # file after the reported extension instead of always claiming PNG.
        extension = base_image.get("ext") or "png"
        image_filename = os.path.join(
            output_folder,
            f"page_{page_label}_img_{img_index}.{extension}"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_label,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0
        })
    return elements


def _merge_elements(elements: list) -> list:
    """Order elements top-to-bottom, left-to-right and fuse consecutive text elements."""
    page_content = []

    for element in sorted(elements, key=lambda e: (e['top'], e['x0'])):
        if element['type'] == 'text':
            if page_content and page_content[-1]['type'] == 'text':
                # Adjacent text runs collapse into one entry so that only
                # images break up the text flow of a page.
                page_content[-1]['content'] += " " + element['content']
            else:
                page_content.append({
                    'type': 'text',
                    'content': element['content']
                })
        elif element['type'] == 'image':
            page_content.append({
                'type': 'image',
                'path': element['path']
            })
    return page_content


def extract_text_images(
        pdf_path: str, output_folder: str,
        minimum_font_size: int,
        extraction_type: str = 'both'
        ) -> list:
    """
    Extracts text and/or images from a PDF and organizes them by pages.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory that receives the extracted image files;
            created if it does not exist.
        minimum_font_size: Spans whose font size is below this value are
            skipped.
        extraction_type: 'text', 'images' or 'both'. Any other value yields
            pages with empty content.

    Returns:
        A list with one entry per page, in page order:
        ``{'page': <1-based page number>, 'content': [...]}``. Each content
        item is either ``{'type': 'text', 'content': str}`` or
        ``{'type': 'image', 'path': str}``, ordered by vertical then
        horizontal position on the page.

    Image files are named ``page_<page>_img_<index>.<ext>``, where ``<ext>``
    is the format reported for the extracted image.
    """
    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as pdf_document:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_label = page_number + 1
            elements = []

            if extraction_type in ('text', 'both'):
                elements.extend(
                    _collect_text_elements(page, page_label, minimum_font_size)
                )

            if extraction_type in ('images', 'both'):
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, page_label, output_folder
                    )
                )

            extraction_data.append({
                'page': page_label,
                'content': _merge_elements(elements)
            })

    return extraction_data

def convert_to_xlsx(data: list) -> BytesIO:
    """
    Converts the extracted data to an XLSX file.

    Args:
        data: Per-page entries of the form
            ``{'page': int, 'content': [item, ...]}`` where each item is
            ``{'type': 'text', 'content': str}`` or
            ``{'type': 'image', 'path': str}``. Items of any other type are
            skipped.

    Returns:
        An in-memory workbook, rewound to position 0, holding a single
        'Extraction' sheet with the columns 'Page' and 'Content'. Image items
        appear as ``[Image: <path>]``.
    """
    rows = []

    for item in data:
        page_number = item['page']

        for content in item['content']:
            if content['type'] == 'text':
                cell = content['content']
            elif content['type'] == 'image':
                cell = f"[Image: {content['path']}]"
            else:
                continue
            rows.append({'Page': page_number, 'Content': cell})

    # Naming the columns explicitly keeps the header row in the sheet even
    # when nothing was extracted (an empty row list has no keys to infer from).
    df = pd.DataFrame(rows, columns=['Page', 'Content'])

    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extraction')

    output.seek(0)
    return output

def create_zip_with_json_and_images(output_folder, extraction_data):
    """
    Bundles the extraction JSON and every referenced image into one ZIP.

    The archive holds ``extraction_data.json`` (UTF-8, 4-space indent)
    followed by each image listed in ``extraction_data``, stored under its
    base file name. ``output_folder`` is accepted but not read: image
    locations come from the ``path`` of each image item.

    Returns the archive as a ``BytesIO`` rewound to position 0.
    """
    archive_buffer = BytesIO()

    with zipfile.ZipFile(archive_buffer, "w") as archive:
        # JSON goes in first so it is the first member of the archive
        serialized = json.dumps(extraction_data, ensure_ascii=False, indent=4)
        archive.writestr("extraction_data.json", serialized.encode('utf-8'))

        # Then every image, flattened to the archive root
        image_paths = [
            entry['path']
            for page_entry in extraction_data
            for entry in page_entry['content']
            if entry['type'] == 'image'
        ]
        for path in image_paths:
            archive.write(path, os.path.basename(path))

    archive_buffer.seek(0)
    return archive_buffer

def main():
    """
    Builds the Streamlit page: PDF upload, sidebar preview, extraction
    options, and download buttons for the extraction results.

    Extraction runs inside a temporary directory; the JSON, XLSX and (when
    images were requested) ZIP payloads are built in memory before that
    directory is removed.
    """
    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)

    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)

    pdf_file = st.file_uploader("Upload PDF", type="pdf")

    if pdf_file is not None:
        num_pages_to_preview = st.sidebar.slider(
            "Select number of pages to preview:",
            min_value=1, max_value=5, value=1
        )

        # getvalue() (also used for extraction below) does not depend on the
        # current stream position, unlike read(); the context manager closes
        # the preview document once the thumbnails are rendered.
        with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as pdf_document:
            for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
                page = pdf_document.load_page(page_num)
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)

    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
    extraction_type = st.selectbox(
        "Choose extraction type:",
        ("text", "images", "both")
    )

    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
    minimum_font_size = st.number_input(
        "Minimum font size to extract:",
        min_value=1, value=2
    )

    # NOTE(review): this value is only checked for being non-empty below;
    # extraction writes into the temporary directory, not into this folder.
    # Confirm whether the input should be honoured or removed.
    output_folder = st.text_input("Output folder path:")

    if st.button("Start Extraction"):
        if pdf_file is not None and output_folder:
            with tempfile.TemporaryDirectory() as temp_dir:
                # The upload's name comes from the client; keep only its last
                # component so the copy cannot land outside temp_dir.
                safe_name = os.path.basename(pdf_file.name) or "uploaded.pdf"
                temp_pdf_path = os.path.join(temp_dir, safe_name)
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.getvalue())

                extraction_data = extract_text_images(
                    temp_pdf_path,
                    temp_dir,
                    minimum_font_size,
                    extraction_type
                )

                st.json(extraction_data)

                if extraction_type in ('images', 'both'):
                    # Must happen inside the with-block: the image files the
                    # archive reads live in temp_dir.
                    zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
                    st.download_button(
                        label="Download ZIP",
                        data=zip_data,
                        file_name='extraction_data.zip',
                        mime='application/zip'
                    )

                xlsx_data = convert_to_xlsx(extraction_data)

                col1, col2 = st.columns(2)
                with col1:
                    st.download_button(
                        label="Download JSON",
                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                        file_name='extraction_data.json',
                        mime='application/json'
                    )
                with col2:
                    st.download_button(
                        label="Download XLSX",
                        data=xlsx_data,
                        file_name='extraction_data.xlsx',
                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                    )
        else:
            st.error("Please upload a PDF file and provide an output folder path.")

    # Fixed footer pinned to the bottom of the page
    st.markdown(
        """
        <style>
        .footer {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            background-color: #F0F0F0;
            font-family:cursive;
            text-align: right;
            padding: 5px 0;
            font-size:20px;
            font-weight: bold;
            color: #FF0000;
        }
        </style>
        <div class="footer">
            CREATED BY: CHINMAY BHALERAO
        </div>
        """,
        unsafe_allow_html=True
    )

# Entry point: build the UI only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()