Spaces:

Nechba
/

cvs-generation-using-images

Sleeping

File size: 8,527 Bytes

7d7653b

import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response,
    pdf_to_images,
    analyze_single_document
)
import base64
from datetime import datetime
import tempfile

# Read variables from a .env file (if present) into the process environment
# so later os.getenv() lookups can see them.
load_dotenv()

# Page-level settings, gathered in one mapping and applied in a single call.
_PAGE_SETTINGS = {
    "page_title": "PDF Document Analyzer",
    "page_icon": "📄",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the app: card look for documents, primary-button colours,
# and the left-bordered analysis section.
_CUSTOM_CSS = """
    <style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
    </style>
"""

# The markup is passed with unsafe_allow_html=True so the <style> element is
# emitted as HTML instead of being shown as text.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# App header. The caption describes the pipeline this script actually runs:
# every PDF is converted to page images (pdf_to_images) and those images are
# analyzed, rather than the PDF being submitted natively.
st.title("📄 PDF Document Analyzer")
st.markdown(
    "Upload multiple PDFs to analyze each document individually; "
    "pages are converted to images and analyzed with Gemini"
)

# Instruction text passed along with the page images of each document.
# It asks for parties, missing signatures/initials, checked boxes, follow-up
# questions, required paperwork, conflicts and locations, reported as CSV.
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1.  *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2.  *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3.  *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4.  *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5.  *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6.  *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7.  *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8.  *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    *   Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    *   Location (Document Name/Page, e.g., Sale Contract Pg 2)
    *   Line Item(s) (Approximate line number or location description)
    *   Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    *   Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    *   Details (Specifics like names, text of the checkbox, description of the issue or document status)
    *   Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Sidebar: API-key entry plus a short feature summary
with st.sidebar:
    st.header("Configuration")

    # Pre-fill the masked input with GEMINI_API_KEY from the environment,
    # falling back to an empty field when it is not set.
    default_key = os.getenv("GEMINI_API_KEY", "")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        value=default_key,
        type="password",
    )

    # Configure the client only once a non-empty key is available.
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")

    features_text = """
        **Features:**
        - PDF processing using images partitioned by page
        - Individual analysis for each document
        - Downloadable CSV reports
    """
    st.info(features_text)

# Main content: uploader restricted to PDFs, accepting several files at once
uploaded_files = st.file_uploader(
    label="Upload PDF Documents",
    accept_multiple_files=True,
    type=["pdf"],
    help="Upload multiple PDF documents for analysis",
)

def _show_raw_response(warning_text, raw_response):
    """Show a warning and fall back to displaying the unparsed model reply."""
    st.warning(warning_text)
    st.markdown("### Full Response")
    st.write(raw_response)


def _render_analysis(result, widget_key):
    """Render one stored analysis result inside an expander.

    Args:
        result: Mapping with the keys ``raw_response``, ``csv_data``,
            ``dataframe`` and ``csv_filename`` as stored by the analysis step.
        widget_key: Unique Streamlit key for the download button.
    """
    with st.expander("View Analysis Results", expanded=True):
        csv_data = result["csv_data"]
        dataframe = result["dataframe"]

        if not csv_data:
            _show_raw_response("No CSV data found in response", result["raw_response"])
        elif dataframe.empty:
            _show_raw_response("No tabular data found in response", result["raw_response"])
        else:
            st.dataframe(dataframe)
            st.download_button(
                label="Download Analysis",
                data=csv_data,
                file_name=result["csv_filename"],
                mime="text/csv",
                key=widget_key,
            )


if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        # Read the upload once per rerun instead of once per use.
        pdf_bytes = uploaded_file.getvalue()

        # st.button() is only True on the rerun right after the click, and
        # clicking the download button triggers another rerun. Results are
        # therefore kept in session state so they stay visible afterwards.
        result_key = f"analysis_result::{uploaded_file.name}::{len(pdf_bytes)}"

        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Convert the PDF to page images, analyze them,
                            # then pull the CSV payload out of the reply.
                            images = pdf_to_images(pdf_bytes)
                            raw_response = analyze_single_document(images, PROMPT)
                            csv_data = extract_csv_from_response(raw_response)
                            dataframe = csv_to_dataframe(csv_data) if csv_data else None
                        except Exception as e:
                            # UI boundary: report the failure and drop any
                            # stale result so old output is not shown as new.
                            if result_key in st.session_state:
                                del st.session_state[result_key]
                            st.error(f"Analysis failed: {str(e)}")
                        else:
                            # Name the download after the file stem so the
                            # ".pdf" suffix is not embedded in the CSV name;
                            # fix the timestamp at analysis time so the name
                            # is stable across reruns.
                            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                            stem = os.path.splitext(uploaded_file.name)[0]
                            st.session_state[result_key] = {
                                "raw_response": raw_response,
                                "csv_data": csv_data,
                                "dataframe": dataframe,
                                "csv_filename": f"{stem}_analysis_{timestamp}.csv",
                            }

            # Rendered outside the columns so the table gets the full page
            # width rather than a third of it.
            if result_key in st.session_state:
                _render_analysis(st.session_state[result_key], widget_key=f"download_{i}")

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

else:
    # Reaching here with an API key means no files have been uploaded yet.
    st.info("📤 Please upload PDF documents using the file uploader above")