Spaces:
Sleeping
Sleeping
File size: 8,527 Bytes
7d7653b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
configure_gemini,
analyze_pdf_directly,
csv_to_dataframe,
save_csv,
get_pdf_metadata,
extract_csv_from_response,
pdf_to_images,
analyze_single_document
)
import base64
from datetime import datetime
import tempfile
# Load environment variables (python-dotenv) before anything below reads
# them; the sidebar uses os.getenv("GEMINI_API_KEY") as its default value.
load_dotenv()

# Configure page settings. Streamlit requires this to run before other
# st.* commands in the script.
# NOTE(review): the icon literal had been reduced to a stray "π" by an
# encoding round-trip (first byte of a 4-byte emoji). A document emoji is
# restored here; confirm it matches the intended glyph.
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS styling: kept in a named constant so the injection call below
# stays a one-liner. The rules target the document cards, Streamlit's
# buttons and the analysis section.
_CUSTOM_CSS = """
<style>
.document-card {
border-radius: 10px;
padding: 1.5rem;
margin-bottom: 1.5rem;
background-color: white;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
transition: transform 0.2s;
}
.document-card:hover {
transform: translateY(-2px);
}
.stButton>button {
background-color: #4285F4;
color: white;
border-radius: 8px;
padding: 0.5rem 1.5rem;
font-weight: 500;
}
.analysis-section {
border-left: 4px solid #4285F4;
padding-left: 1rem;
margin-top: 1.5rem;
}
</style>
"""

# unsafe_allow_html=True is required so the <style> element is emitted as
# markup rather than escaped as text.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# App header: page title plus a one-line description of the tool.
# NOTE(review): the title's leading emoji had been truncated to "π" by an
# encoding round-trip; a document emoji is restored - confirm the glyph.
st.title("📄 PDF Document Analyzer")
# NOTE(review): this text advertises "native PDF processing", while the
# analysis path in this script converts pages to images first
# (pdf_to_images -> analyze_single_document). Confirm which wording is meant.
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")
# Analysis prompt (defined inline, not loaded from a file).
# It is passed, together with the page images of a document, to
# analyze_single_document() further down in this script. The text asks the
# model to report parties, missing signatures/initials, checked boxes,
# follow-up questions, required paperwork and conflicts, and to format the
# findings as CSV with the seven columns listed in step 8.
PROMPT ="""Please analyze the provided images of the real estate document set and perform the following actions:
1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
* Location (Document Name/Page, e.g., Sale Contract Pg 2)
* Line Item(s) (Approximate line number or location description)
* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
* Details (Specifics like names, text of the checkbox, description of the issue or document status)
* Secondary Question (if applicable) (The question generated in step 4)
Please apply this analysis to the entire set of documents provided.
"""
# Sidebar: API-key entry followed by a short feature summary.
with st.sidebar:
    st.header("Configuration")

    # Pre-fill the field from the environment so a key supplied via
    # GEMINI_API_KEY does not have to be typed in again.
    _env_api_key = os.getenv("GEMINI_API_KEY", "")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        value=_env_api_key,
        type="password",
    )

    # Only configure the client once a (non-empty) key is available.
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")

    _features_text = """
**Features:**
- PDF processing using images partitioned by page
- Individual analysis for each document
- Downloadable CSV reports
"""
    st.info(_features_text)
# Main content: the PDF upload widget. Several files may be selected at
# once; each one is analyzed separately further down.
_uploader_options = {
    "type": ["pdf"],
    "accept_multiple_files": True,
    "help": "Upload multiple PDF documents for analysis",
}
uploaded_files = st.file_uploader("Upload PDF Documents", **_uploader_options)
def _show_full_response(notice, raw_response):
    """Show *notice* as a warning, then the model's unparsed reply."""
    st.warning(notice)
    st.markdown("### Full Response")
    st.write(raw_response)


def _show_analysis(result, source_name, index):
    """Render one stored analysis result inside an expander.

    Args:
        result: dict saved by the analysis step below, with the keys
            ``raw_response``, ``csv_data``, ``df`` and ``timestamp``.
        source_name: name of the uploaded file; used in the download filename.
        index: position of the document in the upload list; keeps the
            download button's widget key unique.
    """
    with st.expander("View Analysis Results", expanded=True):
        if not result["csv_data"]:
            _show_full_response("No CSV data found in response", result["raw_response"])
            return

        df = result["df"]
        if df.empty:
            _show_full_response("No tabular data found in response", result["raw_response"])
            return

        st.dataframe(df)
        st.download_button(
            label="Download Analysis",
            data=result["csv_data"],
            file_name=f"{source_name}_analysis_{result['timestamp']}.csv",
            mime="text/csv",
            key=f"download_{index}",
        )


if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately.
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            # NOTE(review): the heading emoji had been truncated to "π" by an
            # encoding round-trip; a document emoji is restored - confirm.
            st.markdown(f"### 📄 Document {i+1}: {uploaded_file.name}")

            # Read the bytes once; they feed the metadata, the size metric
            # and the analysis itself.
            pdf_bytes = uploaded_file.getvalue()

            # Results are kept in session state because st.button is True
            # only for the single run right after the click. Any later rerun
            # (for example pressing the download button) would otherwise
            # discard the analysis and force another model call.
            state_key = f"analysis_result::{uploaded_file.name}::{len(pdf_bytes)}"

            # Display document info.
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                analyze_clicked = st.button("Analyze Document", key=f"analyze_{i}")

            if analyze_clicked:
                # Drop any earlier result so a failed re-run does not leave
                # stale output on screen.
                st.session_state.pop(state_key, None)
                with st.spinner(f"Analyzing {uploaded_file.name}..."):
                    try:
                        # Convert the PDF to page images and send them to the
                        # model. (analyze_pdf_directly is the imported
                        # alternative that takes the raw PDF bytes instead.)
                        images = pdf_to_images(pdf_bytes)
                        raw_response = analyze_single_document(images, PROMPT)

                        # Parse here, inside the try, so parsing errors are
                        # reported like any other analysis failure.
                        csv_data = extract_csv_from_response(raw_response)
                        df = csv_to_dataframe(csv_data) if csv_data else None
                    except Exception as e:
                        # UI boundary: report the failure and carry on with
                        # the remaining documents instead of crashing the app.
                        st.error(f"Analysis failed: {str(e)}")
                    else:
                        st.session_state[state_key] = {
                            "raw_response": raw_response,
                            "csv_data": csv_data,
                            "df": df,
                            # Fixed at analysis time so the download filename
                            # stays stable across reruns.
                            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                        }

            # Rendered outside the button column so the table gets the full
            # container width and survives reruns.
            stored_result = st.session_state.get(state_key)
            if stored_result is not None:
                _show_analysis(stored_result, uploaded_file.name, i)

            st.markdown("---")
elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")
else:
    # A key is present here, so the only thing missing is an upload.
    st.info("📤 Please upload PDF documents using the file uploader above")