Nechba commited on
Commit
7d7653b
·
1 Parent(s): 5ced2c9

first commit

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. app.py +183 -0
  3. dockerfile +25 -0
  4. requirements.txt +8 -0
  5. utils.py +146 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a real API key was committed on this line — revoke it immediately,
+ # add .env to .gitignore, and supply the key via the runtime environment instead.
+ GEMINI_API_KEY=
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response,
    pdf_to_images,
    analyze_single_document
)
import base64
from datetime import datetime
import tempfile

# Load environment variables
load_dotenv()

# Configure page settings
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS styling
st.markdown("""
<style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
</style>
""", unsafe_allow_html=True)

# App Header
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")

# Analysis prompt sent to Gemini along with each document's page images.
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    * Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    * Location (Document Name/Page, e.g., Sale Contract Pg 2)
    * Line Item(s) (Approximate line number or location description)
    * Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    * Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    * Details (Specifics like names, text of the checkbox, description of the issue or document status)
    * Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""


def _show_full_response(raw_response, warning: str) -> None:
    """Warn that no usable CSV was produced and dump the raw model reply."""
    st.warning(warning)
    st.markdown("### Full Response")
    st.write(raw_response)


# Sidebar Configuration
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
        value=os.getenv("GEMINI_API_KEY", "")
    )
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
    **Features:**
    - PDF processing using images partitioned by page
    - Individual analysis for each document
    - Downloadable CSV reports
    """)

# Main App Content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis"
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Read the bytes once and reuse them (metadata, size, analysis).
            pdf_bytes = uploaded_file.getvalue()
            metadata = get_pdf_metadata(pdf_bytes)
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Convert PDF pages to images, then send them to Gemini.
                            images = pdf_to_images(pdf_bytes)
                            raw_response = analyze_single_document(images, PROMPT)

                            # Pull the CSV table out of the model's reply.
                            csv_data = extract_csv_from_response(raw_response)

                            # Display results in expandable section
                            with st.expander("View Analysis Results", expanded=True):
                                if csv_data:
                                    df = csv_to_dataframe(csv_data)
                                    if not df.empty:
                                        st.dataframe(df)

                                        # Offer the raw CSV as a timestamped download.
                                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                        csv_filename = f"{uploaded_file.name}_analysis_{timestamp}.csv"

                                        st.download_button(
                                            label="Download Analysis",
                                            data=csv_data,
                                            file_name=csv_filename,
                                            mime="text/csv",
                                            key=f"download_{i}"
                                        )
                                    else:
                                        _show_full_response(raw_response, "No tabular data found in response")
                                else:
                                    _show_full_response(raw_response, "No CSV data found in response")

                        except Exception as e:
                            st.error(f"Analysis failed: {str(e)}")

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")
dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9-slim

# Install system dependencies first as root (poppler is required by pdf2image)
RUN apt-get update && \
    apt-get install -y --no-install-recommends poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app

# Copy requirements first for better caching
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application
COPY --chown=user . /app

# Run the application.
# BUG FIX: app.py is a Streamlit script, not an ASGI app, so it must be
# launched with `streamlit run` (uvicorn would fail — there is no `app:app`
# ASGI object, and uvicorn is not even listed in requirements.txt).
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ pandas
4
+ pillow
5
+ python-dotenv
6
+ PyPDF2>=3.0.0
7
+ pdf2image>=1.16.3
8
+ # NOTE: poppler-utils is a system (apt) package installed in the Dockerfile,
+ # not a pip package — listing it here would make `pip install -r` fail.
utils.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ import os
3
+ import pandas as pd
4
+ import io
5
+ import tempfile
6
+ from PyPDF2 import PdfReader
7
+ import re
8
+ import csv
9
+ from PIL import Image
10
+ import os
11
+ import pandas as pd
12
+ import io
13
+ import tempfile
14
+ from PyPDF2 import PdfReader
15
+ from pdf2image import convert_from_bytes
16
+
17
def configure_gemini(api_key: str):
    """Initialise the google-generativeai client with the supplied API key."""
    genai.configure(api_key=api_key)
20
+
21
def pdf_to_images(pdf_bytes: bytes) -> list:
    """Render every page of a PDF (given as raw bytes) into PIL images."""
    pages = convert_from_bytes(pdf_bytes)
    return pages
24
+
25
def analyze_single_document(images: list, prompt: str) -> str:
    """Send a prompt plus a list of page images to Gemini and return the reply.

    Args:
        images: PIL images, one per PDF page (see pdf_to_images).
        prompt: Instruction text sent ahead of the images.

    Returns:
        The model's response as plain text.  (BUG FIX: the original annotated
        the return as ``dict`` but actually returns ``response.text``, a str.)
    """
    # NOTE(review): hard-coded experimental model name — confirm it is still
    # available, or consider making it a parameter like analyze_pdf_directly.
    model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21')
    response = model.generate_content([prompt] + images)
    return response.text
30
def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
    """Upload a PDF to Gemini and return the model's textual analysis.

    A temporary file is needed because ``genai.upload_file`` reads from a
    filesystem path; it is always removed afterwards, even on API failure.

    Args:
        pdf_bytes: Raw PDF content.
        prompt: Instruction text sent alongside the uploaded file.
        model_name: Gemini model to use (default "gemini-1.5-pro").

    Returns:
        The model's response text.
    """
    model = genai.GenerativeModel(model_name)

    # Persist the bytes so the upload API has a real path to read from.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        # Use the file upload feature
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        # (Removed a debug print that dumped the full API response to stdout.)
        return response.text
    finally:
        # Clean up temporary file regardless of success or failure.
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
50
+
51
def extract_response_text(response) -> str:
    """Best-effort extraction of the text payload from a Gemini response.

    Tries, in order: a ``.text`` attribute, then the first text part found
    under ``response.result.candidates``; falls back to ``str(response)``.
    """
    try:
        if hasattr(response, 'text'):
            return response.text
        if hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            for candidate in response.result.candidates:
                content = getattr(candidate, 'content', None)
                parts = getattr(content, 'parts', None) if content is not None else None
                if parts is None:
                    continue
                for part in parts:
                    if hasattr(part, 'text'):
                        return part.text
        return str(response)
    except Exception as exc:
        print(f"Error extracting response text: {str(exc)}")
        return str(response)
66
+
67
def extract_csv_from_response(response) -> str:
    """Pull CSV content out of a Gemini response.

    Preference order: a fenced ```csv block, then any run of lines starting
    at one that looks like the expected CSV header, then the raw text.
    """
    try:
        response_text = extract_response_text(response)

        # First choice: an explicit ```csv ... ``` fenced block.
        fenced = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if fenced:
            return fenced.group(1).strip()

        # Second choice: capture from a header-looking line to the end.
        collected = []
        capturing = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                capturing = True
            if capturing:
                collected.append(line)
        if collected:
            return '\n'.join(collected)

        # Last resort: hand back the whole response text.
        return response_text
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)
93
+
94
def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame with error handling.

    Rows longer than the header get their surplus fields merged into the
    last column; rows shorter than the header are padded with empty strings
    (FIX: short rows used to raise inside DataFrame construction and drop
    into the lossy fallback parser).  Returns an empty DataFrame for blank
    input or unrecoverable parse errors.

    Args:
        csv_data: CSV text, typically extracted from a model response.

    Returns:
        A DataFrame of strings, or an empty DataFrame on failure.
    """
    if not csv_data.strip():
        return pd.DataFrame()

    # Normalise line breaks / stray whitespace once, outside the try, so the
    # fallback path below can never see an unbound name.
    cleaned_data = "\n".join(line.strip() for line in csv_data.split('\n') if line.strip())

    try:
        # CSV reader copes with quoted commas and irregular fields.
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)
        header = next(reader)
        width = len(header)

        rows = []
        for row in reader:
            if len(row) > width:
                # Merge surplus fields into the final column.
                row = row[:width - 1] + [','.join(row[width - 1:])]
            elif len(row) < width:
                # Pad short rows so DataFrame construction cannot fail.
                row = row + [''] * (width - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas' own, more forgiving parser.
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
131
+
132
+
133
def save_csv(csv_data: str, filename: str) -> str:
    """Write the (whitespace-stripped) CSV text to *filename*; return the path."""
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        out.write(csv_data.strip())
    return filename
138
+
139
def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Return page count plus author/title (None when no metadata) for a PDF."""
    document = PdfReader(io.BytesIO(pdf_bytes))
    info = document.metadata
    return {
        'page_count': len(document.pages),
        'author': info.author if info else None,
        'title': info.title if info else None,
    }