"""Streamlit front end that analyzes uploaded real-estate PDFs with Gemini.

Each uploaded PDF is converted to page images, sent to Gemini together with
``PROMPT``, and the CSV portion of the reply is shown as a table with a
download button.  Analysis results are kept in ``st.session_state`` so they
survive the script reruns that Streamlit triggers on every widget interaction.
"""

# NOTE(review): base64, tempfile, analyze_pdf_directly and save_csv are not
# referenced in this part of the file; they are kept in case other code
# relies on them.
import base64
import hashlib
import os
import tempfile
from datetime import datetime

import streamlit as st
from dotenv import load_dotenv

from utils import (
    analyze_pdf_directly,
    analyze_single_document,
    configure_gemini,
    csv_to_dataframe,
    extract_csv_from_response,
    get_pdf_metadata,
    pdf_to_images,
    save_csv,
)

# Load environment variables
load_dotenv()

# Configure page settings (must be the first Streamlit command executed)
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS styling
# NOTE(review): the style payload is empty in this copy of the file --
# presumably the <style> block was stripped; restore it if styling is wanted.
st.markdown(""" """, unsafe_allow_html=True)

# App Header
st.title("📄 PDF Document Analyzer")
st.markdown(
    "Upload multiple PDFs to analyze each document directly using Gemini's "
    "native PDF processing"
)

# Instruction prompt sent to Gemini with every document.
# NOTE(review): line breaks inside this literal were reconstructed at the
# numbered-item / bullet boundaries; the wording itself is unchanged.
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
   * Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
   * Location (Document Name/Page, e.g., Sale Contract Pg 2)
   * Line Item(s) (Approximate line number or location description)
   * Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
   * Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
   * Details (Specifics like names, text of the checkbox, description of the issue or document status)
   * Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Session-state slot holding one result record per analyzed document.
RESULTS_STATE_KEY = "analysis_results"


def _result_key(file_name, pdf_bytes):
    """Return a key identifying an upload by its name and content.

    Keying on content rather than list position keeps a stored result
    attached to the right document when files are added or removed.
    """
    digest = hashlib.sha256(pdf_bytes).hexdigest()
    return f"{file_name}:{digest}"


def _run_analysis(pdf_bytes):
    """Analyze one PDF and return a result record.

    The record holds the raw model response, the extracted CSV text, the
    parsed table (``None`` when no CSV was extracted) and the timestamp used
    in the download filename.  Exceptions raised by the ``utils`` helpers
    propagate to the caller.
    """
    page_images = pdf_to_images(pdf_bytes)
    raw_response = analyze_single_document(page_images, PROMPT)
    csv_data = extract_csv_from_response(raw_response)
    return {
        "raw_response": raw_response,
        "csv_data": csv_data,
        "dataframe": csv_to_dataframe(csv_data) if csv_data else None,
        # Fixed at analysis time so the filename is stable across reruns.
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
    }


def _render_result(result, source_name, widget_index):
    """Draw a stored result: table plus download, or the raw response."""
    with st.expander("View Analysis Results", expanded=True):
        csv_data = result["csv_data"]
        df = result["dataframe"]

        if not csv_data:
            st.warning("No CSV data found in response")
        elif df is None or df.empty:
            st.warning("No tabular data found in response")
        else:
            st.dataframe(df)
            # Use the stem so the name is "doc_analysis_<ts>.csv" rather
            # than "doc.pdf_analysis_<ts>.csv".
            stem = os.path.splitext(source_name)[0]
            st.download_button(
                label="Download Analysis",
                data=csv_data,
                file_name=f"{stem}_analysis_{result['timestamp']}.csv",
                mime="text/csv",
                key=f"download_{widget_index}",
            )
            return

        # Fallback for both warning cases: show what the model returned.
        st.markdown("### Full Response")
        st.write(result["raw_response"])


# Sidebar Configuration
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
        value=os.getenv("GEMINI_API_KEY", ""),
    )

    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
    **Features:**
    - PDF processing using images partitioned by page
    - Individual analysis for each document
    - Downloadable CSV reports
    """)

# Main App Content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis",
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    if RESULTS_STATE_KEY not in st.session_state:
        st.session_state[RESULTS_STATE_KEY] = {}
    stored_results = st.session_state[RESULTS_STATE_KEY]

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        pdf_bytes = uploaded_file.getvalue()  # read once per rerun
        result_key = _result_key(uploaded_file.name, pdf_bytes)

        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                analyze_clicked = st.button(
                    "Analyze Document", key=f"analyze_{i}"
                )

            # st.button is True only for the rerun caused by the click, so
            # the outcome is stored rather than rendered inline; otherwise
            # pressing "Download Analysis" would make the results vanish.
            if analyze_clicked:
                with st.spinner(f"Analyzing {uploaded_file.name}..."):
                    try:
                        stored_results[result_key] = _run_analysis(pdf_bytes)
                    except Exception as e:  # UI boundary: report, keep app alive
                        st.error(f"Analysis failed: {e}")

            # Rendered outside the columns so the table gets the full width.
            if result_key in stored_results:
                _render_result(
                    stored_results[result_key], uploaded_file.name, i
                )

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")
elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")