Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import google.generativeai as genai
|
2 |
|
3 |
-
genai.configure(api_key="
|
4 |
import os
|
5 |
import pandas as pd
|
6 |
import io
|
@@ -32,6 +32,69 @@ def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
|
|
32 |
images.append(img)
|
33 |
return images
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def analyze_single_document(images: list, prompt: str) -> dict:
|
36 |
"""Analyze a single document and return results"""
|
37 |
model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
|
|
|
1 |
import google.generativeai as genai
|
2 |
|
3 |
+
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead; any key that was previously hard-coded here
# should be considered leaked and revoked in Google AI Studio.
import os

# When GOOGLE_API_KEY is unset this passes None.
# NOTE(review): the client library is expected to fall back to its own
# environment lookup in that case - confirm against the installed version.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
|
4 |
import os
|
5 |
import pandas as pd
|
6 |
import io
|
|
|
32 |
images.append(img)
|
33 |
return images
|
34 |
|
35 |
+
|
36 |
+
# Prompt sent along with every page image. The single ``{}`` placeholder is
# filled with the 1-based page number via ``str.format``, so no other literal
# braces may be added to this text.
_PAGE_ANALYSIS_PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
* Location (Document Name/Page, e.g., Sale Contract Pg 2)
* Image number (just make image number {} done)
* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
* Details (Specifics like names, text of the checkbox, description of the issue or document status)
* Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""


def process_local_pdf(pdf_bytes: bytes) -> pd.DataFrame:
    """Analyze every page of a PDF with Gemini and collect the CSV findings.

    The PDF is rendered to one image per page with ``pdf_to_images``. Each
    image is sent to the model together with the fixed analysis prompt
    (carrying that page's 1-based number). The reply is passed through
    ``extract_csv_from_response`` and ``csv_to_dataframe``, and all non-empty
    per-page frames are concatenated.

    Processing is best-effort per page: any exception raised while handling
    a page is printed and that page is skipped, so one bad page does not
    abort the whole document.

    Args:
        pdf_bytes: Raw bytes of the PDF file to analyze.

    Returns:
        A DataFrame with the rows from every successfully processed page,
        re-indexed from 0. Empty if no page produced any rows.
    """
    images = pdf_to_images(pdf_bytes)

    # The model handle does not depend on the page, so build it once.
    model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')

    # Collect frames and concatenate once at the end instead of re-copying
    # the accumulated frame on every iteration.
    page_frames = []
    for page_num, img in enumerate(images, start=1):
        try:
            local_prompt = _PAGE_ANALYSIS_PROMPT.format(page_num)

            # Send both the prompt and the page image to Gemini.
            response = model.generate_content([local_prompt, img])

            # NOTE(review): both helpers are defined elsewhere in this file;
            # presumably they pull the CSV text out of the reply and parse it
            # into a DataFrame - confirm against their definitions.
            answer_csv = extract_csv_from_response(response)
            answer_df = csv_to_dataframe(answer_csv)

            if not answer_df.empty:
                page_frames.append(answer_df)

            print(f"Processed page {page_num}")
            print("Response:")
            print(answer_csv)
            print("\n" + "="*50 + "\n")

        except Exception as e:
            # Deliberately broad: a failure on one page (API error, malformed
            # CSV, ...) is reported and the remaining pages are still tried.
            print(f"Error processing page {page_num}: {e}")

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)
|
97 |
+
|
98 |
def analyze_single_document(images: list, prompt: str) -> dict:
|
99 |
"""Analyze a single document and return results"""
|
100 |
model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
|