import os
import io
import re
import csv
import tempfile

import pandas as pd
import google.generativeai as genai
import fitz  # PyMuPDF
from PIL import Image
from PyPDF2 import PdfReader

# Read the API key from the environment (do not hardcode secrets in source).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

def configure_gemini(api_key: str):
    """Configure Gemini API with the provided key"""
    genai.configure(api_key=api_key)

# def pdf_to_images(pdf_bytes: bytes) -> list:
#     """Convert PDF bytes to list of PIL Images"""
#     return convert_from_bytes(pdf_bytes)



def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Convert PDF to PIL Images using PyMuPDF (no poppler needed)."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    for page in doc:
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    return images


def process_local_pdf(pdf_bytes: bytes):
    """
    Process a local PDF file with Gemini AI, page by page.

    Args:
        pdf_bytes: Raw bytes of the PDF file to analyze.

    Returns:
        A pandas DataFrame combining the per-page CSV findings.
    """
    prompt = """Please analyze the provided images of the real estate document set and perform the following actions:

1. **Identify Parties**: Determine and list all present parties involved in the transaction. Always identify and include **Seller 1** and **Buyer 1** if they are present in the documents. Additionally, include **Seller 2** and **Buyer 2** only if they are explicitly mentioned.

2. **Identify Missing Items**: For each identified party, including at minimum **Seller 1** and **Buyer 1**, check all pages for any missing signatures or initials. Only check for **Seller 2** or **Buyer 2** if they were identified in step 1.

3. **Identify Checked Boxes**: Locate and list all checkboxes that have been marked or checked.

4. **Generate Secondary Questions**: For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.

5. **Check for Required Paperwork**: Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous'.

6. **Identify Conflicts**: Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).

7. **Provide Location**: For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).

8. **Format Output**: Present all findings in CSV format with the following columns:
   - **Category**: (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
   - **Location**: (e.g., Sale Contract (Image 8 Pg 1))
   - **Line Item(s)**: (e.g., 4)
   - **Item Type**: (e.g., Seller 1, Buyer 1, Seller Signature, Seller Initials)
   - **Status**: (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict)
   - **Details**: (e.g., "Seller signature line (top line) is empty.", "Two initial boxes for Seller (approx line 106-107 area) are empty.")
   - **Secondary Question** (if applicable): (e.g., "Is the Buyer aware they are waiving the home warranty?", "Has the Buyer received and reviewed the Seller's Disclosure?")
"""

    # Convert to images
    images = pdf_to_images(pdf_bytes)

    # Process each page
    combined_df = pd.DataFrame()
    for i, img in enumerate(images):
        try:
            model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
            # The prompt contains no format placeholders, so it is reused verbatim for every page.
            local_prompt = prompt

            # Send both the prompt and image to Gemini
            response = model.generate_content([local_prompt, img])

            # Extract CSV response
            answer_csv = extract_csv_from_response(response)
            answer_df = csv_to_dataframe(answer_csv)

            # Combine DataFrames if needed
            if not answer_df.empty:
                combined_df = pd.concat([combined_df, answer_df], ignore_index=True)

            print(f"Processed page {i+1}")
            print("Response:")
            print(answer_csv)
            print("\n" + "="*50 + "\n")

        except Exception as e:
            print(f"Error processing page {i+1}: {str(e)}")

    return combined_df

def analyze_single_document(images: list, prompt: str) -> str:
    """Analyze a single document (a list of PIL page images) and return the raw response text"""
    model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
    response = model.generate_content([prompt] + images)
    return response.text

def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro"):
    """Analyze a PDF directly using Gemini's PDF support"""
    model = genai.GenerativeModel(model_name)
    
    # Create a temporary PDF file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name
    
    try:
        # Use the file upload feature
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        print(f"Response: {response}")
        return response.text
    finally:
        # Clean up temporary file
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)

def extract_response_text(response) -> str:
    """Extract text content from Gemini response object"""
    try:
        if hasattr(response, 'text'):
            return response.text
        elif hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            for candidate in response.result.candidates:
                if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                    for part in candidate.content.parts:
                        if hasattr(part, 'text'):
                            return part.text
        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)
    
def extract_csv_from_response(response) -> str:
    """Extract CSV data from Gemini response"""
    try:
        # Get the text content from the response
        response_text = extract_response_text(response)
        
        # Extract CSV content between ```csv markers
        csv_match = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if csv_match:
            return csv_match.group(1).strip()
        
        # Fallback: Try to find any CSV-like content
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)
        
        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)

def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert CSV string to pandas DataFrame with error handling"""
    if not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces up front so both parsers see the same input
    cleaned_data = "\n".join([line.strip() for line in csv_data.split('\n') if line.strip()])

    try:
        # Use CSV reader to handle irregular fields
        rows = []
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)

        header = next(reader)
        for row in reader:
            if len(row) > len(header):
                # Combine extra fields into the last column
                row = row[:len(header) - 1] + [','.join(row[len(header) - 1:])]
            elif len(row) < len(header):
                # Pad short rows so every row matches the header width
                row = row + [''] * (len(header) - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()


def save_csv(csv_data: str, filename: str) -> str:
    """Save CSV data to file"""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvfile.write(csv_data.strip())
    return filename

def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Extract basic PDF metadata"""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    return {
        'page_count': len(reader.pages),
        'author': reader.metadata.author if reader.metadata else None,
        'title': reader.metadata.title if reader.metadata else None
    }
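

# --- Example usage ---
# A minimal sketch of how the helpers above fit together. The "contract.pdf" path,
# the GOOGLE_API_KEY environment variable, and the output filename are illustrative
# assumptions, not part of the original module.
if __name__ == "__main__":
    # Configure the API key explicitly (or rely on the module-level configure call above).
    configure_gemini(os.environ.get("GOOGLE_API_KEY", ""))

    # Load a local PDF as raw bytes.
    with open("contract.pdf", "rb") as f:  # hypothetical input file
        pdf_bytes = f.read()

    # Show basic metadata, then run the page-by-page analysis and save the combined findings.
    print(get_pdf_metadata(pdf_bytes))
    results_df = process_local_pdf(pdf_bytes)
    if not results_df.empty:
        results_df.to_csv("analysis_results.csv", index=False)  # hypothetical output file
        print("Saved findings to analysis_results.csv")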