# File size: 8,527 Bytes
# 7d7653b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response,
    pdf_to_images,
    analyze_single_document
)
import base64
from datetime import datetime
import tempfile

# Load environment variables
load_dotenv()

# Configure page settings (browser-tab title and icon, wide layout, sidebar
# open on load) before any other UI element is drawn.
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS styling: card look for documents, the primary button appearance,
# and the accent bar beside analysis sections.
_PAGE_CSS = """
    <style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
    </style>
"""

# The stylesheet is raw HTML, so it has to be injected with HTML rendering on.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# App header: page title plus a one-line description of the workflow. The
# description matches what the script does further down: each PDF is turned
# into page images (pdf_to_images) and those images are what gets analyzed.
st.title("📄 PDF Document Analyzer")
st.markdown(
    "Upload multiple PDFs to analyze each document individually with Gemini "
    "(each PDF is converted to page images before analysis)"
)

# Instruction text sent along with every document's page images. It asks for
# parties, missing signatures/initials, checked boxes, follow-up questions,
# required paperwork, conflicts and locations, all reported as a 7-column CSV.
PROMPT: str = """Please analyze the provided images of the real estate document set and perform the following actions:

1.  *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2.  *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3.  *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4.  *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5.  *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6.  *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7.  *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8.  *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    *   Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    *   Location (Document Name/Page, e.g., Sale Contract Pg 2)
    *   Line Item(s) (Approximate line number or location description)
    *   Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    *   Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    *   Details (Specifics like names, text of the checkbox, description of the issue or document status)
    *   Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Sidebar configuration: API key entry and a short feature summary.
with st.sidebar:
    st.header("Configuration")

    # The environment key is deliberately NOT used as the widget's default
    # value: a widget's value is delivered to the browser, so pre-filling it
    # would hand the server-side secret to every visitor (type="password"
    # only masks what is displayed). It is used purely as a fallback here.
    env_api_key = os.getenv("GEMINI_API_KEY", "").strip()
    entered_api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
    ).strip()  # tolerate whitespace picked up when pasting a key

    # `api_key` is read later in the script to decide whether analysis can run.
    api_key = entered_api_key or env_api_key
    if api_key:
        if not entered_api_key:
            st.caption("Using the API key from the GEMINI_API_KEY environment variable.")
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
        **Features:**
        - PDF processing using images partitioned by page
        - Individual analysis for each document
        - Downloadable CSV reports
    """)

# Main app content starts with the uploader. It is restricted to PDFs and
# accepts several files in one go; the selection is kept in `uploaded_files`
# for the processing section that follows.
uploaded_files = st.file_uploader(
    label="Upload PDF Documents",
    accept_multiple_files=True,
    type=["pdf"],
    help="Upload multiple PDF documents for analysis",
)

def _analyze_pdf(pdf_bytes):
    """Analyze one PDF and bundle everything needed to display the outcome.

    The PDF is converted with ``pdf_to_images`` and the images are passed,
    together with ``PROMPT``, to ``analyze_single_document``.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        A dict with the keys ``raw_response`` (reply from
        ``analyze_single_document``), ``csv_data`` (result of
        ``extract_csv_from_response``), ``dataframe`` (result of
        ``csv_to_dataframe``, or ``None`` when no CSV was extracted) and
        ``timestamp`` (``%Y%m%d_%H%M%S`` string taken when the analysis ran).

    Raises:
        Whatever the helper functions raise; the caller reports it in the UI.
    """
    images = pdf_to_images(pdf_bytes)
    raw_response = analyze_single_document(images, PROMPT)
    csv_data = extract_csv_from_response(raw_response)
    return {
        "raw_response": raw_response,
        "csv_data": csv_data,
        "dataframe": csv_to_dataframe(csv_data) if csv_data else None,
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
    }


def _render_analysis(result, file_name, download_key):
    """Show a stored analysis: a table with CSV download, or the raw reply.

    Args:
        result: Dict produced by ``_analyze_pdf``.
        file_name: Name of the uploaded PDF; prefix of the download file name.
        download_key: Unique Streamlit widget key for the download button.
    """
    with st.expander("View Analysis Results", expanded=True):
        df = result["dataframe"]
        if df is not None and not df.empty:
            st.dataframe(df)
            st.download_button(
                label="Download Analysis",
                data=result["csv_data"],
                file_name=f"{file_name}_analysis_{result['timestamp']}.csv",
                mime="text/csv",
                key=download_key,
            )
            return

        # Nothing tabular to show: say why, then fall back to the unparsed
        # reply so the user still gets the model's answer.
        if result["csv_data"]:
            st.warning("No tabular data found in response")
        else:
            st.warning("No CSV data found in response")
        st.markdown("### Full Response")
        st.write(result["raw_response"])


if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # A button is only True on the single rerun triggered by its click, and
    # every widget interaction (including the download button) reruns the
    # script. Keeping results in session state stops them from disappearing
    # on the next interaction and avoids paying for the analysis again.
    if "analysis_results" not in st.session_state:
        st.session_state["analysis_results"] = {}
    analysis_results = st.session_state["analysis_results"]

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        pdf_bytes = uploaded_file.getvalue()  # fetched once, reused below
        # Keyed by name and size rather than list position so a stored result
        # stays attached to its document when the upload list changes.
        result_key = (uploaded_file.name, len(pdf_bytes))

        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            analysis_results[result_key] = _analyze_pdf(pdf_bytes)
                        except Exception as e:
                            # UI boundary: report the failure for this document
                            # and keep the rest of the page usable. Drop any
                            # older result so it is not shown next to the error.
                            analysis_results.pop(result_key, None)
                            st.error(f"Analysis failed: {e}")

                stored_result = analysis_results.get(result_key)
                if stored_result is not None:
                    _render_analysis(
                        stored_result,
                        uploaded_file.name,
                        download_key=f"download_{i}",
                    )

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

else:
    # Only reachable with an API key present and no files uploaded.
    st.info("📤 Please upload PDF documents using the file uploader above")