first commit
Files changed:
- .env +1 -0
- app.py +183 -0
- dockerfile +25 -0
- requirements.txt +8 -0
- utils.py +146 -0
.env
ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY=AIzaSyAP85jSUKncrIGOAhm3Gvo-TYra_e1wmEA
app.py
ADDED
@@ -0,0 +1,183 @@
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response,
    pdf_to_images,
    analyze_single_document
)
from datetime import datetime

# Load environment variables
load_dotenv()

# Configure page settings
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS styling
st.markdown("""
<style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
</style>
""", unsafe_allow_html=True)

# App header
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")

# Analysis prompt
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    * Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    * Location (Document Name/Page, e.g., Sale Contract Pg 2)
    * Line Item(s) (Approximate line number or location description)
    * Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    * Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    * Details (Specifics like names, text of the checkbox, description of the issue or document status)
    * Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Sidebar configuration
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
        value=os.getenv("GEMINI_API_KEY", "")
    )
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
    **Features:**
    - PDF processing using images partitioned by page
    - Individual analysis for each document
    - Downloadable CSV reports
    """)

# Main app content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis"
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(uploaded_file.getvalue())
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(uploaded_file.getvalue()) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Convert the PDF into one image per page
                            images = pdf_to_images(uploaded_file.getvalue())

                            # Analyze the document
                            raw_response = analyze_single_document(images, PROMPT)

                            # Alternative: send the PDF bytes directly instead of page images
                            # raw_response = analyze_pdf_directly(
                            #     pdf_bytes=uploaded_file.getvalue(),
                            #     prompt=PROMPT,
                            #     model_name="gemini-1.5-pro"  # or "gemini-1.5-flash"
                            # )

                            # Process response
                            csv_data = extract_csv_from_response(raw_response)

                            # Display results in an expandable section
                            with st.expander("View Analysis Results", expanded=True):
                                if csv_data:
                                    df = csv_to_dataframe(csv_data)
                                    if not df.empty:
                                        st.dataframe(df)

                                        # Download button
                                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                        csv_filename = f"{uploaded_file.name}_analysis_{timestamp}.csv"

                                        st.download_button(
                                            label="Download Analysis",
                                            data=csv_data,
                                            file_name=csv_filename,
                                            mime="text/csv",
                                            key=f"download_{i}"
                                        )
                                    else:
                                        st.warning("No tabular data found in response")
                                        st.markdown("### Full Response")
                                        st.write(raw_response)
                                else:
                                    st.warning("No CSV data found in response")
                                    st.markdown("### Full Response")
                                    st.write(raw_response)

                        except Exception as e:
                            st.error(f"Analysis failed: {str(e)}")

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")
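
Note: the UI wires these pieces together, but the same pipeline can be exercised headlessly. A minimal sketch, assuming a valid GEMINI_API_KEY in the environment and a local file sample.pdf (both placeholders, not part of this repo):

import os
from dotenv import load_dotenv
from utils import (configure_gemini, pdf_to_images,
                   analyze_single_document, extract_csv_from_response,
                   csv_to_dataframe)

load_dotenv()
configure_gemini(os.environ["GEMINI_API_KEY"])

with open("sample.pdf", "rb") as f:        # placeholder input file
    images = pdf_to_images(f.read())       # one PIL image per page

raw = analyze_single_document(images, "Summarize this document as CSV.")
print(csv_to_dataframe(extract_csv_from_response(raw)))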
dockerfile
ADDED
@@ -0,0 +1,25 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9-slim

# Install system dependencies first as root (poppler is required by pdf2image)
RUN apt-get update && \
    apt-get install -y --no-install-recommends poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app

# Copy requirements first for better caching
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application
COPY --chown=user . /app

# Run the application (app.py is a Streamlit app, so launch it with
# streamlit rather than uvicorn, which expects an ASGI app object)
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
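
Note: a minimal local build-and-run sketch (the image tag pdf-analyzer is an arbitrary choice, not part of the repo):

docker build -t pdf-analyzer .
docker run -p 7860:7860 pdf-analyzer

Port 7860 matches what Hugging Face Spaces expects, so the app is then reachable at http://localhost:7860.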
requirements.txt
ADDED
@@ -0,0 +1,8 @@
streamlit
google-generativeai
pandas
pillow
python-dotenv
PyPDF2>=3.0.0
pdf2image>=1.16.3
# Note: poppler is a system package (installed via apt in the dockerfile),
# not a pip dependency, so it is intentionally not listed here.
utils.py
ADDED
@@ -0,0 +1,146 @@
import google.generativeai as genai
import os
import pandas as pd
import io
import tempfile
import re
import csv
from PyPDF2 import PdfReader
from PIL import Image
from pdf2image import convert_from_bytes

def configure_gemini(api_key: str):
    """Configure Gemini API with the provided key"""
    genai.configure(api_key=api_key)

def pdf_to_images(pdf_bytes: bytes) -> list:
    """Convert PDF bytes to a list of PIL Images (one per page)"""
    return convert_from_bytes(pdf_bytes)

def analyze_single_document(images: list, prompt: str) -> str:
    """Analyze a single document (as page images) and return the response text"""
    model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21')
    response = model.generate_content([prompt] + images)
    return response.text

def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
    """Analyze a PDF directly using Gemini's native PDF support"""
    model = genai.GenerativeModel(model_name)

    # Create a temporary PDF file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        # Use the file upload feature
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        return response.text
    finally:
        # Clean up temporary file
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)

def extract_response_text(response) -> str:
    """Extract text content from a Gemini response object (plain strings pass through)"""
    try:
        if hasattr(response, 'text'):
            return response.text
        elif hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            for candidate in response.result.candidates:
                if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                    for part in candidate.content.parts:
                        if hasattr(part, 'text'):
                            return part.text
        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)

def extract_csv_from_response(response) -> str:
    """Extract CSV data from a Gemini response"""
    try:
        # Get the text content from the response
        response_text = extract_response_text(response)

        # Extract CSV content between ```csv markers
        csv_match = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if csv_match:
            return csv_match.group(1).strip()

        # Fallback: try to find any CSV-like content
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)

        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)

def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame with error handling"""
    if not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces (done before the try block so the
    # pandas fallback below can reuse it)
    cleaned_data = "\n".join([line.strip() for line in csv_data.split('\n') if line.strip()])

    try:
        # Use the csv reader to handle irregular fields
        rows = []
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)

        header = next(reader)
        for row in reader:
            if len(row) > len(header):
                # Combine extra fields into the last column
                row = row[:len(header)-1] + [','.join(row[len(header)-1:])]
            elif len(row) < len(header):
                # Pad short rows so DataFrame construction doesn't fail
                row = row + [''] * (len(header) - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()

def save_csv(csv_data: str, filename: str) -> str:
    """Save CSV data to a file"""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvfile.write(csv_data.strip())
    return filename

def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Extract basic PDF metadata"""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    return {
        'page_count': len(reader.pages),
        'author': reader.metadata.author if reader.metadata else None,
        'title': reader.metadata.title if reader.metadata else None
    }
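
Note: the parsing helpers can be checked offline with a fabricated response string (no API call needed), since extract_response_text passes plain strings through:

from utils import extract_csv_from_response, csv_to_dataframe

fake_response = """Here are the findings:
```csv
Category,Location,Status
Missing Item,Sale Contract Pg 2,Missing
```"""

csv_text = extract_csv_from_response(fake_response)  # pulls out the ```csv block
df = csv_to_dataframe(csv_text)                      # -> one-row DataFrame
print(df)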