Nechba committed on
Commit
dbf5064
·
verified ·
1 Parent(s): 30582c7

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +64 -1
utils.py CHANGED
@@ -1,6 +1,6 @@
1
  import google.generativeai as genai
2
 
3
- genai.configure(api_key="AIzaSyAP85jSUKncrIGOAhm3Gvo-TYra_e1wmEA")
4
  import os
5
  import pandas as pd
6
  import io
@@ -32,6 +32,69 @@ def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
32
  images.append(img)
33
  return images
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def analyze_single_document(images: list, prompt: str) -> dict:
36
  """Analyze a single document and return results"""
37
  model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
 
1
  import google.generativeai as genai
2
 
3
# The API key is read from the environment instead of being committed to
# source control. Any key that was previously hard-coded here should be
# treated as leaked and revoked.
import os

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
4
  import os
5
  import pandas as pd
6
  import io
 
32
  images.append(img)
33
  return images
34
 
35
+
36
# Prompt sent with every page image; the single `{}` placeholder receives the
# 1-based page/image number via str.format.
_DOCUMENT_REVIEW_PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
* Location (Document Name/Page, e.g., Sale Contract Pg 2)
* Image number (just make image number {} done)
* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
* Details (Specifics like names, text of the checkbox, description of the issue or document status)
* Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""


def process_local_pdf(pdf_bytes: bytes) -> pd.DataFrame:
    """
    Analyze every page of a PDF with Gemini and collect the findings.

    The PDF is converted to images with ``pdf_to_images``. Each image is sent
    to the model together with the review prompt (formatted with the 1-based
    image number), the CSV in the response is extracted and parsed, and the
    non-empty per-page tables are combined into one DataFrame.

    A failure on one page is printed and that page is skipped; the remaining
    pages are still processed.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A DataFrame holding the rows parsed from every page that was
        processed successfully; an empty DataFrame if no page yielded rows.
    """
    # Convert to images
    images = pdf_to_images(pdf_bytes)

    # The model handle is the same for every page, so build it once.
    model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')

    # Collect per-page frames and concatenate once at the end; concatenating
    # inside the loop would re-copy all previously gathered rows on each page.
    page_frames = []
    for page_num, img in enumerate(images, start=1):
        try:
            local_prompt = _DOCUMENT_REVIEW_PROMPT.format(page_num)

            # Send both the prompt and image to Gemini
            response = model.generate_content([local_prompt, img])

            # Extract CSV response
            answer_csv = extract_csv_from_response(response)
            answer_df = csv_to_dataframe(answer_csv)

            if not answer_df.empty:
                page_frames.append(answer_df)

            print(f"Processed page {page_num}")
            print("Response:")
            print(answer_csv)
            print("\n" + "="*50 + "\n")

        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the rest.
            print(f"Error processing page {page_num}: {str(e)}")

    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames, ignore_index=True)
97
+
98
  def analyze_single_document(images: list, prompt: str) -> dict:
99
  """Analyze a single document and return results"""
100
  model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')