Nechba committed on
Commit
3a55e0a
·
1 Parent(s): 61df440

first commit

Browse files
Files changed (4) hide show
  1. .env +1 -0
  2. app.py +174 -0
  3. requirements.txt +8 -0
  4. utils.py +130 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # Placeholder only: never commit a real key. Any key previously committed here must be revoked and rotated.
+ GEMINI_API_KEY=your-gemini-api-key-here
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from utils import (
5
+ configure_gemini,
6
+ analyze_pdf_directly,
7
+ csv_to_dataframe,
8
+ save_csv,
9
+ get_pdf_metadata,
10
+ extract_csv_from_response
11
+ )
12
+ import base64
13
+ from datetime import datetime
14
+ import tempfile
15
+
16
+ # Load environment variables
17
+ load_dotenv()
18
+
19
# Page-wide Streamlit settings: browser tab title/icon, wide layout, open sidebar.
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Stylesheet injected into the page: card look for documents, button colours,
# and the left-bordered analysis section.
_CUSTOM_CSS = """
<style>
.document-card {
border-radius: 10px;
padding: 1.5rem;
margin-bottom: 1.5rem;
background-color: white;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
transition: transform 0.2s;
}
.document-card:hover {
transform: translateY(-2px);
}
.stButton>button {
background-color: #4285F4;
color: white;
border-radius: 8px;
padding: 0.5rem 1.5rem;
font-weight: 500;
}
.analysis-section {
border-left: 4px solid #4285F4;
padding-left: 1rem;
margin-top: 1.5rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Title and one-line description shown at the top of the page.
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")
59
+
60
# Instruction prompt passed to analyze_pdf_directly() together with each uploaded PDF.
# It asks the model for a CSV with the columns Category, Location, Line Item(s),
# Item Type, Status, Details and Secondary Question.
# NOTE(review): the wording refers to "images" although the app uploads PDF files —
# confirm whether the prompt should say "documents" instead.
PROMPT ="""Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
* Location (Document Name/Page, e.g., Sale Contract Pg 2)
* Line Item(s) (Approximate line number or location description)
* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
* Details (Specifics like names, text of the checkbox, description of the issue or document status)
* Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""
81
+
82
# Sidebar Configuration
with st.sidebar:
    st.header("Configuration")

    # The key from the environment is deliberately NOT used as the widget's
    # default value: a pre-filled value is delivered to every visitor's browser,
    # and the password field only masks it on screen. It is kept server-side and
    # used as a fallback when the visitor does not type a key of their own.
    env_api_key = os.getenv("GEMINI_API_KEY", "")
    entered_api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
    )
    api_key = entered_api_key.strip() or env_api_key
    if api_key:
        configure_gemini(api_key)
    if env_api_key and not entered_api_key.strip():
        st.caption("Using the API key configured on the server.")

    st.markdown("---")
    st.info("""
**Features:**
- Direct PDF processing
- Individual analysis for each document
- Downloadable CSV reports
""")
100
+
101
# Main App Content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis"
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Streamlit re-executes this script on every interaction and a button is
    # only "pressed" during the rerun caused by its own click. Results are
    # therefore kept in session state; otherwise they would disappear as soon
    # as the user clicks anything else (including the download button).
    if "analysis_results" not in st.session_state:
        st.session_state["analysis_results"] = {}
    analysis_results = st.session_state["analysis_results"]

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Read the upload once and reuse the bytes everywhere below.
            pdf_bytes = uploaded_file.getvalue()
            # Name + size identifies a document independently of its position,
            # so removing another upload does not mix up stored results.
            result_key = f"{uploaded_file.name}:{len(pdf_bytes)}"

            # Display document info. A damaged or encrypted PDF must not take
            # the whole page down, so metadata failures degrade to "N/A".
            try:
                page_count = get_pdf_metadata(pdf_bytes)['page_count']
            except Exception as e:
                page_count = "N/A"
                st.warning(f"Could not read PDF metadata: {str(e)}")

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", page_count)
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Analyze PDF directly
                            raw_response = analyze_pdf_directly(
                                pdf_bytes=pdf_bytes,
                                prompt=PROMPT,
                                model_name="gemini-1.5-pro"  # or "gemini-1.5-flash"
                            )

                            # Store the processed response; the timestamp is
                            # fixed here so the download name stays stable
                            # across reruns.
                            analysis_results[result_key] = {
                                "raw_response": raw_response,
                                "csv_data": extract_csv_from_response(raw_response),
                                "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                            }
                        except Exception as e:
                            # Drop any stale result so an old analysis is not
                            # shown next to the error of the new attempt.
                            analysis_results.pop(result_key, None)
                            st.error(f"Analysis failed: {str(e)}")

            # Render the stored result (if any) at full width below the metrics.
            result = analysis_results.get(result_key)
            if result is not None:
                raw_response = result["raw_response"]
                csv_data = result["csv_data"]

                # Display results in expandable section
                with st.expander("View Analysis Results", expanded=True):
                    if csv_data:
                        df = csv_to_dataframe(csv_data)
                        if not df.empty:
                            st.dataframe(df, use_container_width=True)

                            # Download button
                            csv_filename = f"{uploaded_file.name}_analysis_{result['timestamp']}.csv"

                            st.download_button(
                                label="Download Analysis",
                                data=csv_data,
                                file_name=csv_filename,
                                mime="text/csv",
                                key=f"download_{i}"
                            )
                        else:
                            st.warning("No tabular data found in response")
                            st.markdown("### Full Response")
                            st.write(raw_response)
                    else:
                        st.warning("No CSV data found in response")
                        st.markdown("### Full Response")
                        st.write(raw_response)

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ pandas
4
+ pillow
5
+ python-dotenv
6
+ PyPDF2>=3.0.0
7
+ pdf2image>=1.16.3
8
+ poppler-utils
utils.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ import os
3
+ import pandas as pd
4
+ import io
5
+ import tempfile
6
+ from PyPDF2 import PdfReader
7
+ import re
8
+ import csv
9
+
10
def configure_gemini(api_key: str) -> None:
    """Configure the ``google.generativeai`` client with the provided key.

    Args:
        api_key: Gemini API key, forwarded unchanged to ``genai.configure``.
    """
    genai.configure(api_key=api_key)
13
+
14
def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro"):
    """Send a PDF together with a prompt to a Gemini model and return its reply.

    The bytes are written to a temporary ``.pdf`` file, uploaded with
    ``genai.upload_file`` and handed to ``generate_content`` alongside
    *prompt*. Both the local temporary file and the uploaded remote file are
    removed afterwards, whether or not the request succeeded.

    Args:
        pdf_bytes: Raw content of the PDF document.
        prompt: Instruction text sent with the document.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        Any exception raised by the Gemini client while uploading, generating
        or reading ``response.text`` is propagated to the caller unchanged.
    """
    model = genai.GenerativeModel(model_name)

    # delete=False because the file is re-opened by path for the upload;
    # removal is handled explicitly in the ``finally`` block below.
    tmp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp_file_path = tmp_file.name
    uploaded_pdf = None

    try:
        with tmp_file:
            tmp_file.write(pdf_bytes)

        # Use the file upload feature
        uploaded_pdf = genai.upload_file(tmp_file_path)
        response = model.generate_content([prompt, uploaded_pdf])
        # The response is intentionally not printed: it contains the contents
        # of the user's document and would otherwise end up in server logs.
        return response.text
    finally:
        # Clean up temporary file
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
        # Best-effort removal of the uploaded copy; a cleanup failure must not
        # hide the result or the original error.
        if uploaded_pdf is not None:
            try:
                genai.delete_file(uploaded_pdf.name)
            except Exception as cleanup_error:
                print(f"Could not delete uploaded file: {str(cleanup_error)}")
34
+
35
def extract_response_text(response) -> str:
    """Return the text carried by a Gemini response-like object.

    Lookup order: ``response.text`` when that attribute exists; otherwise the
    first ``text`` found on a part under
    ``response.result.candidates[*].content.parts``; otherwise
    ``str(response)``. Any exception raised while probing is printed and
    ``str(response)`` is returned instead.
    """
    try:
        if hasattr(response, 'text'):
            return response.text

        has_candidates = hasattr(response, 'result') and hasattr(response.result, 'candidates')
        if not has_candidates:
            return str(response)

        # Lazily walk every part of every well-formed candidate and keep only
        # the parts that expose a ``text`` attribute.
        parts_with_text = (
            part
            for candidate in response.result.candidates
            if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts')
            for part in candidate.content.parts
            if hasattr(part, 'text')
        )
        first_part = next(parts_with_text, None)
        if first_part is not None:
            return first_part.text
        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)
50
+
51
def extract_csv_from_response(response) -> str:
    """Extract the CSV portion of a Gemini response.

    Args:
        response: Either the response text itself or a response object that
            ``extract_response_text`` can turn into text.

    Returns:
        * the content of the first ```` ```csv ```` fenced block (tag matched
          case-insensitively), stripped; otherwise
        * the lines from the first header-like line (contains a comma and
          ``Category,`` or ``Location,``) up to, but not including, the next
          code fence; otherwise
        * the full response text when nothing CSV-like is found.

        If extraction itself fails, the error is printed and the raw response
        text (or ``str(response)``) is returned.
    """
    try:
        # Plain strings need no attribute probing.
        if isinstance(response, str):
            response_text = response
        else:
            response_text = extract_response_text(response)

        # Extract CSV content between ```csv markers
        csv_match = re.search(r'```csv(.*?)```', response_text, re.DOTALL | re.IGNORECASE)
        if csv_match:
            return csv_match.group(1).strip()

        # Fallback: Try to find any CSV-like content
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if in_csv and line.strip().startswith('```'):
                # A fence after the header closes the table; anything after it
                # is commentary, not data.
                break
            if not in_csv and ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines).strip()

        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)
77
+
78
def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string into a DataFrame, tolerating ragged rows.

    The first non-blank line is used as the header. Rows with more fields than
    the header have the surplus joined (with commas) into the last column;
    rows with fewer fields are padded with ``None``. All parsed values are
    kept as strings.

    Args:
        csv_data: CSV text; blank lines and surrounding whitespace on each
            line are discarded before parsing.

    Returns:
        The parsed DataFrame. An empty DataFrame is returned for blank input
        or when both the primary parser and the pandas fallback fail.
    """
    if not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces. Done before the ``try`` so the
    # fallback below can always rely on ``cleaned_data`` being defined.
    cleaned_data = "\n".join(line.strip() for line in csv_data.split('\n') if line.strip())

    try:
        # Use CSV reader to handle irregular fields
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)

        header = next(reader)
        width = len(header)
        rows = []
        for row in reader:
            if len(row) > width:
                # Combine extra fields into the last column
                row = row[:width - 1] + [','.join(row[width - 1:])]
            elif len(row) < width:
                # Pad missing trailing fields; without this, input in which
                # every row is short makes the DataFrame constructor raise.
                row = row + [None] * (width - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
115
+
116
+
117
def save_csv(csv_data: str, filename: str) -> str:
    """Write *csv_data*, stripped of surrounding whitespace, to *filename*.

    The file is opened in write mode as UTF-8 with newline translation
    disabled. Returns the *filename* that was passed in.
    """
    with open(filename, mode='w', encoding='utf-8', newline='') as out_file:
        out_file.write(csv_data.strip())
    return filename
122
+
123
def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Return the page count, author and title of a PDF given as bytes.

    The result has the keys ``'page_count'``, ``'author'`` and ``'title'``;
    author and title are ``None`` when the reader reports no metadata.
    """
    pdf = PdfReader(io.BytesIO(pdf_bytes))
    page_count = len(pdf.pages)

    # Look the metadata up once and reuse it for both fields.
    info = pdf.metadata
    if info:
        author, title = info.author, info.title
    else:
        author, title = None, None

    return {
        'page_count': page_count,
        'author': author,
        'title': title,
    }