Spaces:
Sleeping
Sleeping
first commit
Browse files
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# SECURITY: a real API key was committed here and is now public — revoke it in Google AI Studio immediately.
GEMINI_API_KEY=your-gemini-api-key-here
|
app.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response
)
import base64
from datetime import datetime
import tempfile

# Load environment variables (GEMINI_API_KEY) from a local .env file.
load_dotenv()

# Configure page settings
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS styling for document cards, buttons, and analysis sections.
st.markdown("""
<style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
</style>
""", unsafe_allow_html=True)

# App Header
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")

# Analysis prompt sent verbatim to Gemini for every uploaded document.
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    * Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    * Location (Document Name/Page, e.g., Sale Contract Pg 2)
    * Line Item(s) (Approximate line number or location description)
    * Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    * Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    * Details (Specifics like names, text of the checkbox, description of the issue or document status)
    * Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Sidebar Configuration: API-key entry (pre-filled from the environment) and feature notes.
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
        value=os.getenv("GEMINI_API_KEY", "")
    )
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
    **Features:**
    - Direct PDF processing
    - Individual analysis for each document
    - Downloadable CSV reports
    """)

# Main App Content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis"
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately; each gets its own metadata card and Analyze button.
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display basic document info (page count and size).
            metadata = get_pdf_metadata(uploaded_file.getvalue())
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(uploaded_file.getvalue()) / 1024:.1f} KB")
            with col3:
                if st.button(f"Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Send the PDF bytes straight to Gemini.
                            raw_response = analyze_pdf_directly(
                                pdf_bytes=uploaded_file.getvalue(),
                                prompt=PROMPT,
                                model_name="gemini-1.5-pro"  # or "gemini-1.5-flash"
                            )

                            # Pull the CSV section out of the model's reply.
                            csv_data = extract_csv_from_response(raw_response)

                            # Display results in an expandable section.
                            with st.expander("View Analysis Results", expanded=True):
                                if csv_data:
                                    df = csv_to_dataframe(csv_data)
                                    if not df.empty:
                                        st.dataframe(df, use_container_width=True)

                                        # Offer the parsed CSV as a timestamped download.
                                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                        csv_filename = f"{uploaded_file.name}_analysis_{timestamp}.csv"

                                        st.download_button(
                                            label="Download Analysis",
                                            data=csv_data,
                                            file_name=csv_filename,
                                            mime="text/csv",
                                            key=f"download_{i}"
                                        )
                                    else:
                                        # CSV text found but could not be tabulated — show raw reply.
                                        st.warning("No tabular data found in response")
                                        st.markdown("### Full Response")
                                        st.write(raw_response)
                                else:
                                    st.warning("No CSV data found in response")
                                    st.markdown("### Full Response")
                                    st.write(raw_response)

                        except Exception as e:
                            st.error(f"Analysis failed: {str(e)}")

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
google-generativeai
|
3 |
+
pandas
|
4 |
+
pillow
|
5 |
+
python-dotenv
|
6 |
+
PyPDF2>=3.0.0
|
7 |
+
pdf2image>=1.16.3
|
8 |
+
# NOTE: poppler-utils is a system (apt) package, not a pip distribution — installing it
# via pip fails. On Hugging Face Spaces, list it in packages.txt instead.
|
utils.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import google.generativeai as genai
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
import io
|
5 |
+
import tempfile
|
6 |
+
from PyPDF2 import PdfReader
|
7 |
+
import re
|
8 |
+
import csv
|
9 |
+
|
10 |
+
def configure_gemini(api_key: str):
    """Initialise the google-generativeai client with the supplied API key."""
    genai.configure(api_key=api_key)
|
13 |
+
|
14 |
+
def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
    """Analyze a PDF with Gemini's native PDF support.

    Writes the PDF bytes to a temporary file (the Files API uploads from a
    path), sends it to the model alongside *prompt*, and returns the text
    of the model's reply.

    Args:
        pdf_bytes: Raw contents of the PDF file.
        prompt: Instruction prompt sent with the document.
        model_name: Gemini model identifier to use.

    Returns:
        The response text from the model.
    """
    model = genai.GenerativeModel(model_name)

    # genai.upload_file expects a filesystem path, so persist the bytes first.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        # Fixed: removed the debug print that dumped the entire API response
        # (including document-derived content) into server logs.
        return response.text
    finally:
        # Always clean up the temporary file, even when the API call fails.
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
|
34 |
+
|
35 |
+
def extract_response_text(response) -> str:
    """Best-effort extraction of the text payload from a Gemini response.

    Checks, in order: a direct ``.text`` attribute, then the first text part
    nested under ``response.result.candidates``. Falls back to
    ``str(response)`` when nothing textual can be located or on any error.
    """
    try:
        # Simplest case: the object exposes its text directly.
        if hasattr(response, 'text'):
            return response.text

        # Otherwise walk result -> candidates -> content -> parts lazily.
        if hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            text_parts = (
                part
                for candidate in response.result.candidates
                if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts')
                for part in candidate.content.parts
                if hasattr(part, 'text')
            )
            hit = next(text_parts, None)
            if hit is not None:
                return hit.text

        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)
|
50 |
+
|
51 |
+
def extract_csv_from_response(response) -> str:
    """Pull CSV content out of a Gemini response.

    Prefers a fenced ```csv ... ``` block. Otherwise scans line by line and
    keeps everything from the first header-looking row ("Category," or
    "Location,") onward. Returns the full response text when no CSV-like
    content is found.
    """
    try:
        response_text = extract_response_text(response)

        # Preferred: an explicit fenced ```csv block.
        fenced = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if fenced:
            return fenced.group(1).strip()

        # Fallback: capture from the first header-like line to the end.
        collected = []
        capturing = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                capturing = True
            if capturing:
                collected.append(line)
        if collected:
            return '\n'.join(collected)

        # Nothing CSV-like found: hand back the raw text for display.
        return response_text
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)
|
77 |
+
|
78 |
+
def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame, tolerating ragged rows.

    Rows with more fields than the header get the surplus merged into the
    last column; rows with fewer fields are padded with empty strings so
    DataFrame construction cannot fail on them.

    Args:
        csv_data: Raw CSV text, header row first.

    Returns:
        The parsed DataFrame, or an empty DataFrame for blank input or an
        unrecoverable parse failure.
    """
    if not csv_data.strip():
        return pd.DataFrame()

    # Strip blank lines and surrounding whitespace before parsing.
    # Fixed: computed OUTSIDE the try block — previously a failure before the
    # assignment would make the except-branch fallback raise NameError.
    cleaned_data = "\n".join(line.strip() for line in csv_data.split('\n') if line.strip())

    try:
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)
        header = next(reader)
        width = len(header)

        rows = []
        for row in reader:
            if len(row) > width:
                # Merge surplus fields into the last column.
                row = row[:width - 1] + [','.join(row[width - 1:])]
            elif len(row) < width:
                # Fixed: pad short rows — previously these made
                # pd.DataFrame(..., columns=header) raise.
                row = row + [''] * (width - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback: pandas' more forgiving python-engine parser.
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
|
115 |
+
|
116 |
+
|
117 |
+
def save_csv(csv_data: str, filename: str) -> str:
    """Write the stripped CSV text to *filename* and return that path."""
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        out.write(csv_data.strip())
    return filename
|
122 |
+
|
123 |
+
def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Return basic metadata for a PDF given as raw bytes.

    The resulting dict holds 'page_count' plus 'author' and 'title', the
    latter two being None when the document carries no metadata.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    info = reader.metadata
    return {
        'page_count': len(reader.pages),
        'author': info.author if info else None,
        'title': info.title if info else None,
    }
|