import os
import json
import fitz
import pdfplumber
import pandas as pd
import streamlit as st
from tempfile import NamedTemporaryFile
from PIL import Image
import io

def _append_element(content: list, element: dict) -> None:
    """Append *element* to *content*, merging consecutive text entries."""
    if element['type'] == 'text':
        if content and content[-1]['type'] == 'text':
            # Adjacent text lines are joined into one running text entry.
            content[-1]['content'] += " " + element['content']
        else:
            content.append({
                'type': 'text',
                'content': element['content']
            })
    elif element['type'] == 'image':
        content.append({
            'type': 'image',
            'path': element['path']
        })


def _collect_text_elements(page, page_label: int,
                           minimum_font_size: int) -> list[dict]:
    """Return one text element per distinct span top coordinate on *page*."""
    lines = {}

    # Group spans by the top coordinate of their bounding box to form lines.
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # Only text blocks are considered
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                # Skip spans smaller than the configured minimum font size
                if span["size"] < minimum_font_size:
                    continue
                lines.setdefault(span["bbox"][1], []).append(span)

    elements = []
    for top in sorted(lines):
        spans = lines[top]
        elements.append({
            'type': 'text',
            # The first span stands in for the font size of the whole line.
            'font_size': spans[0]['size'],
            'page': page_label,
            'content': " ".join(span['text'] for span in spans),
            'x0': spans[0]['bbox'][0],
            'top': top
        })
    return elements


def _collect_image_elements(pdf_document, page, page_label: int,
                            output_folder: str) -> list[dict]:
    """Write every image of *page* to *output_folder* and describe each one."""
    elements = []
    for img_index, img in enumerate(page.get_images(full=True)):
        base_image = pdf_document.extract_image(img[0])
        # NOTE(review): the bytes are written exactly as returned by
        # extract_image, while the file name always ends in ".png".
        # Confirm whether the suffix should follow the real image format.
        image_filename = os.path.join(
            output_folder,
            f"page_{page_label}_img_{img_index + 1}.png"
        )

        with open(image_filename, "wb") as img_file:
            img_file.write(base_image["image"])

        # Position of the image on the page, used for reading-order sorting
        img_rect = page.get_image_bbox(img)
        elements.append({
            'type': 'image',
            'page': page_label,
            'path': image_filename,
            'x0': img_rect.x0,
            'top': img_rect.y0
        })
    return elements


def extract_text_images(
        pdf_path: str, output_folder: str,
        minimum_font_size: int,
        extract_text: bool = True,
        extract_images: bool = True,
        mode: str = 'headerwise',
        header_font_sizes: list[float] = None,
        tolerance: float = 0.01,
        ) -> list[dict]:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images are saved. It is
        created when missing; an empty string leaves image paths relative
        to the current working directory.
    minimum_font_size: int
        Text spans with a font size below this value are ignored.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'. Any other value
        yields an empty result.
    header_font_sizes: list[float]
        List of font sizes to be considered as headers. ``None`` is treated
        as an empty list, i.e. no line is recognised as a header.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    list[dict]
        In 'headerwise' mode, one ``{'header', 'content'}`` entry per header;
        in 'pagewise' mode, one ``{'page', 'content'}`` entry per page.
        ``content`` is a list of ``{'type': 'text', 'content'}`` and
        ``{'type': 'image', 'path'}`` items.

    Notes
    -----
    In 'headerwise' mode, content found before the first header is carried
    into the first header's content; if no header is ever found the result
    is empty.
    """
    # os.makedirs("") raises, so an empty folder name is left alone.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # A missing header list means "no headers" rather than a TypeError
    # when the sizes are iterated below.
    header_sizes = [] if header_font_sizes is None else list(header_font_sizes)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
        current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_sizes
        )

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_label = page_number + 1
            elements = []

            if extract_text:
                elements.extend(
                    _collect_text_elements(page, page_label, minimum_font_size)
                )

            if extract_images:
                elements.extend(
                    _collect_image_elements(
                        pdf_document, page, page_label, output_folder
                    )
                )

            # Sort elements by their vertical position (top) first,
            # and then by horizontal position (x0)
            elements.sort(key=lambda e: (e['top'], e['x0']))

            if mode == 'headerwise':
                # Header state deliberately persists across pages so a
                # section can span several pages.
                for element in elements:
                    if element['type'] == 'text' and \
                     is_header_font_size(element['font_size']):
                        # A new header closes the previous section.
                        add_current_header_content()
                        current_header = element['content']
                    else:
                        _append_element(current_header_content, element)

            elif mode == 'pagewise':
                page_content = []
                for element in elements:
                    _append_element(page_content, element)
                extraction_data.append({
                    'page': page_label,
                    'content': page_content
                })

        # After the loop, finalize any remaining header content
        if mode == 'headerwise':
            add_current_header_content()
    finally:
        # Release the document even when a page fails to process.
        pdf_document.close()

    return extraction_data

def get_word_font_sizes(pdf_path, target_words):
    """
    Collects the font sizes with which the given words occur in a PDF.

    Params
    -------
    pdf_path
        Path to the PDF file, opened with pdfplumber.
    target_words
        Words to look for. A word counts as a match when its extracted
        text, stripped of surrounding whitespace, equals a target word.

    Returns
    -------
    dict
        Maps every target word to the list of sizes of its occurrences,
        in page order. Words that never occur map to an empty list.
    """
    sizes_by_word = {}
    for target in target_words:
        sizes_by_word[target] = []

    with pdfplumber.open(pdf_path) as document:
        for pdf_page in document.pages:
            extracted = pdf_page.extract_words(extra_attrs=['fontname', 'size'])
            for entry in extracted:
                token = entry['text'].strip()
                if token not in target_words:
                    continue
                sizes_by_word[token].append(entry['size'])

    return sizes_by_word

def preview_pdf(pdf_path, num_pages=1):
    """
    Renders the first pages of a PDF as PIL images.

    Params
    -------
    pdf_path
        Path to the PDF file, opened with PyMuPDF.
    num_pages
        Maximum number of pages to render, counted from the first page.
        Fewer images are returned when the document has fewer pages.

    Returns
    -------
    list
        One RGB ``PIL.Image`` per rendered page, in page order.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        preview_images = []
        for page_number in range(min(num_pages, pdf_document.page_count)):
            pix = pdf_document.load_page(page_number).get_pixmap()
            preview_images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return preview_images
    finally:
        # Close the document even if rendering a page raises.
        pdf_document.close()

# Streamlit UI

import io

def _parse_float_list(raw: str) -> list[float]:
    """Parse a comma-separated string into floats, skipping blank entries.

    Raises ValueError when a non-blank entry is not a valid number.
    """
    tokens = (part.strip() for part in raw.split(","))
    return [float(token) for token in tokens if token]


def _extraction_to_dataframe(data: list):
    """Flatten extraction results into a DataFrame with one row per item.

    Header-wise entries produce 'Header'/'Content' columns and page-wise
    entries produce 'Page'/'Content' columns. Images are rendered as
    ``"Image: <path>"``.
    """
    rows = []
    for item in data:
        if 'header' in item:
            label, value = 'Header', item['header']
        elif 'page' in item:
            label, value = 'Page', item['page']
        else:
            continue
        for content_item in item['content']:
            if content_item['type'] == 'text':
                rows.append({label: value, 'Content': content_item['content']})
            elif content_item['type'] == 'image':
                rows.append({label: value, 'Content': f"Image: {content_item['path']}"})
    return pd.DataFrame(rows)


def _render_extraction_ui(temp_pdf_path: str) -> None:
    """Render the preview, the sidebar settings and the extraction results."""
    # Collapsible PDF Preview
    with st.expander("PDF Preview", expanded=True):
        num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
        preview_images = preview_pdf(temp_pdf_path, num_pages)

        # enumerate keeps captions correct even when two pages render to
        # identical images, which list.index() would report as the same page.
        for page_index, img in enumerate(preview_images, start=1):
            st.image(img, caption=f"Page {page_index}", use_column_width=True)

    st.sidebar.title("DATA EXTRACTION SETTINGS")
    st.sidebar.write("How you want to extract data?")

    extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])

    # Font Size Detection
    st.sidebar.title("FONT SIZE DETECTION")
    st.sidebar.warning("[Only in case of headerwise extraction] if you dont know the font size for your headers or text then copy paste any of those words below")
    target_words_input = st.sidebar.text_input(
        "Target words (comma-separated)", "")
    # Drop blank entries so an empty field does not search for the word "".
    target_words = [
        word.strip() for word in target_words_input.split(",") if word.strip()
    ]

    if st.sidebar.button("Get Font Sizes"):
        if not target_words:
            st.sidebar.warning("Enter at least one target word.")
        else:
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

    raw_header_sizes = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
    try:
        header_font_sizes = _parse_float_list(raw_header_sizes)
    except ValueError:
        # Report bad input in the UI instead of crashing the whole script.
        header_font_sizes = None
        st.sidebar.error("Header font sizes must be comma-separated numbers.")

    st.sidebar.title("OUTPUT FOLDER PATH")
    # The folder defaults to "Extracted_Data"; a blank field falls back to
    # the current directory because os.makedirs("") raises.
    output_folder = st.sidebar.text_input(" ", value="Extracted_Data")
    output_folder = output_folder.strip() or "."
    st.sidebar.info("what do you want to include in data extraction?")
    extract_text = st.sidebar.checkbox("Extract Text", value=True)
    extract_images = st.sidebar.checkbox("Extract Images", value=True)

    minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

    if not st.sidebar.button("Start Extraction"):
        return

    if header_font_sizes is None:
        if extraction_mode == "headerwise":
            st.error("Fix the header font sizes before starting a headerwise extraction.")
            return
        # Page-wise extraction does not use header sizes.
        header_font_sizes = []

    os.makedirs(output_folder, exist_ok=True)

    extracted_data = extract_text_images(
        temp_pdf_path,
        output_folder,
        minimum_font_size=minimum_font_size,
        extract_text=extract_text,
        extract_images=extract_images,
        mode=extraction_mode,
        header_font_sizes=header_font_sizes
    )

    # Display extracted data as JSON
    st.json(extracted_data)

    df = _extraction_to_dataframe(extracted_data)

    # Save DataFrame to an in-memory BytesIO buffer
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Extracted Data')

    # Preview straight from the DataFrame; re-reading the workbook that was
    # just written adds nothing to the preview.
    st.subheader("Preview of Extracted Data (First 5 Lines)")
    st.dataframe(df.head())

    # Provide download options
    st.download_button(
        label="Download JSON",
        data=json.dumps(extracted_data, ensure_ascii=False),
        file_name='extracted_data.json',
        mime='application/json'
    )

    st.download_button(
        label="Download XLSX",
        data=buffer.getvalue(),
        file_name='extracted_data.xlsx',
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )

    st.success("Extraction complete. Data displayed as JSON.")


def main():
    """
    Streamlit entry point: upload a PDF, preview it, configure extraction in
    the sidebar and offer the extracted data as JSON and XLSX downloads.

    The uploaded PDF is copied to a temporary file for the duration of one
    script run and removed afterwards.
    """
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload pdf </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill the values at right in data extraction settings </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in desired format </h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if not uploaded_pdf:
        return

    # Save the uploaded PDF to a temporary file. getvalue() returns the
    # whole buffer regardless of the current stream position, unlike read().
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_pdf.getvalue())
        temp_pdf_path = temp_pdf.name

    try:
        _render_extraction_ui(temp_pdf_path)
    finally:
        # The file is recreated on every run, so remove it here rather than
        # leaving one temp file behind per run. Cleanup is best-effort.
        try:
            os.remove(temp_pdf_path)
        except OSError:
            pass

if __name__ == "__main__":
    main()