# csv-generation / app.py
# Hugging Face file-view residue: Nechba's picture | "Update app.py" | 0500dd5 verified | raw | history blame | 8 kB
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
configure_gemini,
analyze_pdf_directly,
csv_to_dataframe,
save_csv,
get_pdf_metadata,
extract_csv_from_response
)
import base64
from datetime import datetime
import tempfile
# Load environment variables from a .env file (python-dotenv)
load_dotenv()

# Configure page settings.
# The page icon is a single emoji; the previous value was the UTF-8 bytes of
# that emoji decoded with the wrong charset, which is not a valid icon.
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS styling: rules for document cards, Streamlit buttons and the
# analysis section, injected as raw HTML (hence unsafe_allow_html=True).
_CUSTOM_CSS = """
<style>
.document-card {
border-radius: 10px;
padding: 1.5rem;
margin-bottom: 1.5rem;
background-color: white;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
transition: transform 0.2s;
}
.document-card:hover {
transform: translateY(-2px);
}
.stButton>button {
background-color: #4285F4;
color: white;
border-radius: 8px;
padding: 0.5rem 1.5rem;
font-weight: 500;
}
.analysis-section {
border-left: 4px solid #4285F4;
padding-left: 1rem;
margin-top: 1.5rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# App Header: page title plus a one-line description of what the app does.
# The title emoji is written as a real code point; it was previously stored as
# mis-decoded UTF-8 bytes and rendered as garbage characters.
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")
# Analysis prompt sent verbatim with every PDF (passed as `prompt=PROMPT` to
# analyze_pdf_directly below). It asks the model for an eight-step review of a
# real estate document set and a CSV answer with seven named columns; the CSV
# is later pulled out of the reply by extract_csv_from_response.
# NOTE(review): the text speaks of "images" while the app submits PDF bytes --
# confirm the wording is intentional before editing; the string is runtime data.
PROMPT ="""Please analyze the provided images of the real estate document set and perform the following actions:
1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
* Location (Document Name/Page, e.g., Sale Contract Pg 2)
* Line Item(s) (Approximate line number or location description)
* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
* Details (Specifics like names, text of the checkbox, description of the issue or document status)
* Secondary Question (if applicable) (The question generated in step 4)
Please apply this analysis to the entire set of documents provided.
"""
# Sidebar Configuration: take the Gemini API key from the environment,
# configure the client when a key is present, and show a short feature list.
api_key = os.environ.get("GEMINI_API_KEY", "")

_FEATURES_NOTE = """
**Features:**
- Direct PDF processing
- Individual analysis for each document
- Downloadable CSV reports
"""

with st.sidebar:
    # An empty key means "not configured"; the client is left untouched then.
    if api_key:
        configure_gemini(api_key)
    st.markdown("---")
    st.info(_FEATURES_NOTE)
# Main App Content: uploader restricted to PDFs; several files may be chosen
# at once and each one is analyzed separately further down.
uploaded_files = st.file_uploader(
    label="Upload PDF Documents",
    help="Upload multiple PDF documents for analysis",
    accept_multiple_files=True,
    type=["pdf"],
)
# Main workflow: one section per uploaded PDF, analyzed on demand.
#
# Requires both at least one upload and a configured API key; otherwise a
# hint explaining what is missing is shown instead.
if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        # Read the upload once and reuse the bytes for the metadata lookup,
        # the size metric and the analysis request.
        pdf_bytes = uploaded_file.getvalue()

        # st.button() is True only during the rerun caused by its own click,
        # and clicking the download button causes another rerun. Results are
        # therefore kept in session state so they (and the download button)
        # stay on screen after a download.
        result_key = f"analysis_result_{i}_{uploaded_file.name}_{len(pdf_bytes)}"

        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                analyze_clicked = st.button("Analyze Document", key=f"analyze_{i}")

            if analyze_clicked:
                with st.spinner(f"Analyzing {uploaded_file.name}..."):
                    try:
                        # Analyze PDF directly
                        raw_response = analyze_pdf_directly(
                            pdf_bytes=pdf_bytes,
                            prompt=PROMPT,
                            model_name="gemini-2.5-pro-exp-03-25"  # or "gemini-1.5-flash"
                        )
                        # Process response; parsing happens here so a parse
                        # failure is reported once instead of on every rerun.
                        csv_data = extract_csv_from_response(raw_response)
                        df = csv_to_dataframe(csv_data) if csv_data else None
                    except Exception as e:
                        # UI boundary: report the failure and drop any stale
                        # result from an earlier successful run.
                        st.session_state.pop(result_key, None)
                        st.error(f"Analysis failed: {e}")
                    else:
                        st.session_state[result_key] = {
                            "raw_response": raw_response,
                            "csv_data": csv_data,
                            "dataframe": df,
                            # Fixed at analysis time so the download filename
                            # does not change between reruns.
                            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                        }

            # Display results in expandable section
            result = st.session_state.get(result_key)
            if result is not None:
                with st.expander("View Analysis Results", expanded=True):
                    df = result["dataframe"]
                    if not result["csv_data"]:
                        st.warning("No CSV data found in response")
                        st.markdown("### Full Response")
                        st.write(result["raw_response"])
                    elif df is None or df.empty:
                        st.warning("No tabular data found in response")
                        st.markdown("### Full Response")
                        st.write(result["raw_response"])
                    else:
                        st.dataframe(df, use_container_width=True)
                        # Download button; the source extension is dropped so
                        # the file is "<name>_analysis_<ts>.csv" rather than
                        # "<name>.pdf_analysis_<ts>.csv".
                        base_name = os.path.splitext(uploaded_file.name)[0]
                        csv_filename = f"{base_name}_analysis_{result['timestamp']}.csv"
                        st.download_button(
                            label="Download Analysis",
                            data=result["csv_data"],
                            file_name=csv_filename,
                            mime="text/csv",
                            key=f"download_{i}"
                        )

            st.markdown("---")
elif not api_key:
    # The key is read only from the environment (there is no sidebar input),
    # so the hint points the user at the variable to set.
    st.warning(
        "⚠️ Gemini API key not found. Set the GEMINI_API_KEY environment "
        "variable (for example in a .env file) and reload the app"
    )
elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")