Spaces:
Running
Running
from google import genai | |
from google.genai import types | |
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # key redacted: load secrets from the environment, never commit them to source
import os | |
import pandas as pd | |
import io | |
import tempfile | |
from PyPDF2 import PdfReader | |
import re | |
import csv | |
from PIL import Image | |
import fitz # PyMuPDF | |
from PIL import Image | |
# def configure_gemini(api_key: str): | |
# """Configure Gemini API with the provided key""" | |
# genai.configure(api_key=api_key) | |
# def pdf_to_images(pdf_bytes: bytes) -> list: | |
# """Convert PDF bytes to list of PIL Images""" | |
# return convert_from_bytes(pdf_bytes) | |
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Render every page of a PDF to a PIL image using PyMuPDF (no poppler needed).

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        One "RGB" ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # The context manager closes the document even if rendering a page fails,
    # so the underlying PyMuPDF resources are not leaked.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            # alpha=False keeps the pixmap at 3 bytes per pixel, which is what
            # the "RGB" raw decoder below expects.
            pix = page.get_pixmap(alpha=False)
            # frombytes copies the sample buffer, so the image stays valid
            # after the document has been closed.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def pil_image_to_bytes(pil_image: Image.Image) -> bytes:
    """Encode a PIL image as JPEG and return the encoded bytes.

    Images whose mode the JPEG writer cannot store (for example "RGBA",
    "LA" or "P") are converted to "RGB" first instead of letting ``save``
    raise.

    Args:
        pil_image: The image to encode.

    Returns:
        The JPEG-encoded image data.
    """
    # Modes the Pillow JPEG encoder accepts directly; anything else must be
    # converted before saving.
    if pil_image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        pil_image = pil_image.convert("RGB")
    with io.BytesIO() as output:
        pil_image.save(output, format="JPEG")
        return output.getvalue()
def analyze_pdf_images_with_gemini(pdf_bytes):
    """Send all pages of a PDF to Gemini as JPEG images with the review prompt.

    The PDF is rendered page by page, every page is attached to a single
    request after the prompt, and the model is asked to answer in CSV form.

    Args:
        pdf_bytes: Raw bytes of the PDF to analyse.

    Returns:
        The response object returned by ``client.models.generate_content``.

    Raises:
        RuntimeError: If neither ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is
            set in the environment.
    """
    question = """Can you let me know that signatures and initials are missing and if certain checkboxes are checked prompt me with a secondary question? and also determine seller 1 seller 2 buyer 1 buyer 2 ... , WITH MAXIMUM details like Signature line for ....
can we also add line item?
the last part would be remember certain check boxes need extra paper work and so it needs to let me know that the file has it or missing it is that possible
give me the results in CSV FORM and the columns are as follow
1. **Category:** This column groups the findings or checklist items based on their nature or the type of issue. Examples include:
* `Parties`: Information about the buyers and sellers involved.
* `Missing Item...`: Indicates required signatures, initials, or information that is absent from the specified document.
* `Checked B...`: Refers to items where a specific checkbox (likely designated as "Box B" or similar in the original form, though here it just seems to indicate *a* box was checked) has been marked, signifying a particular choice or agreement.
* `Conflict`: Highlights inconsistencies or contradictory clauses found within the documents.
2. **Location:** This column specifies exactly *where* in the transaction documents the item being discussed can be found. It typically includes:
* The name of the document (e.g., `Sale Contract`, `Addendum`, `Release Agreement`, `Exclusive Buyer Agency`, `Lead Based Paint`, `In Present Condition Addendum`).
* An internal reference, possibly to an image scan (`Image X`).
* The page number within that document (`Pg Y`).
3. **Line Item(s):** This provides more specific location detail *within* the page mentioned in the "Location" column. It can be:
* A page number (often repeating the one from "Location").
* Specific line numbers or a range of lines (e.g., `725-726`, `10 & 22`, `696; 697`).
* Sometimes blank if the item applies to the whole page or a general section.
4. **Item Type:** This column identifies the *specific element* or clause being reviewed or checked on that line. Examples:
* Identification of parties (`4 Seller 1`, `6 Buyer 2`). The numbers might be internal codes or refer to lines on the original form.
* Signature or initial fields (`Seller Signature Block`, `Bottom Right Initials`, `Buyer Signature`).
* Specific clauses or addenda being checked for presence/selection (`Home Warranty Waiver`, `Seller's Disclosure Addendum Attached`, `Inspection Clause 1`).
* Specific choices within clauses (`Brokerage Consents`, `Seller Knowledge (Presence)`).
5. **Status:** This column indicates the outcome of the review for the specific `Item Type`.
* `Identified`: The required information (like a party's name) was found.
* `Missing`: The required item (like a signature, initial, or checked box) was not found.
* `Checked`: A specific checkbox related to the `Item Type` was found to be marked (ticked).
* `Conflict Detected`: An inconsistency or contradiction was found relating to this item.
6. **Details:** This column provides specific information elaborating on the `Status`.
* If `Identified`, it shows the name or value found (e.g., `VB ONE LLC`, `Dimas Miguel Guerra Pena`).
* If `Missing`, it describes more precisely what is missing (e.g., `Entire block for Seller... is empty`, `Two initial boxes for Seller... are empty`).
* If `Checked`, it often shows the text associated with the checked box or confirms the selection (e.g., `[X] BUYER waives the opportunity...`, `[X] Seller's Disclosure and Condition...`).
* If `Conflict Detected`, it might briefly describe the nature of the conflict (e.g., referencing conflicting checked boxes).
7. **Secondary Question (if applicable):** This column raises follow-up questions, potential issues, or points needing clarification based on the finding in the row. It highlights risks or implications that need to be addressed.
GIVE ME IN CSV FORM DIRECTLY DONT WRITE ANYTHING ELSE
"""
    # The API key is a secret and must not live in source control; read it
    # from the environment and fail fast, before any rendering work is done.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )

    images = pdf_to_images(pdf_bytes)
    client = genai.Client(api_key=api_key)

    # The prompt goes first, followed by one JPEG part per page in page order.
    contents = [question]
    contents.extend(
        types.Part.from_bytes(data=pil_image_to_bytes(img), mime_type="image/jpeg")
        for img in images
    )

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=contents,
    )
    return response
def process_local_pdf(pdf_bytes: bytes):
    """Analyse a PDF page by page with Gemini and merge the CSV answers.

    Each page is rendered to an image and sent to the model together with
    the review prompt. The CSV found in each answer is parsed and the rows of
    all pages are combined. A page that fails is reported on stdout and
    skipped so the remaining pages are still processed.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A ``pandas.DataFrame`` holding the rows of every page that was parsed
        successfully; empty when no page produced rows.

    Raises:
        RuntimeError: If neither ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is
            set in the environment.
    """
    prompt = """Please analyze the provided images of the real estate document set and perform the following actions:
1. **Identify Parties**: Determine and list all present parties involved in the transaction. Always identify and include **Seller 1** and **Buyer 1** if they are present in the documents. Additionally, include **Seller 2** and **Buyer 2** only if they are explicitly mentioned.
2. **Identify Missing Items**: For each identified party, including at minimum **Seller 1** and **Buyer 1**, check all pages for any missing signatures or initials. Only check for **Seller 2** or **Buyer 2** if they were identified in step 1.
3. **Identify Checked Boxes**: Locate and list all checkboxes that have been marked or checked.
4. **Generate Secondary Questions**: For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. **Check for Required Paperwork**: Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous'.
6. **Identify Conflicts**: Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. **Provide Location**: For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. **Format Output**: Present all findings in CSV format with the following columns:
- **Category**: (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
- **Location**: (e.g., Sale Contract (Image 8 Pg 1))
- **Line Item(s)**: (e.g., 4)
- **Item Type**: (e.g., Seller 1, Buyer 1, Seller Signature, Seller Initials)
- **Status**: (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict)
- **Details**: (e.g., "Seller signature line (top line) is empty.", "Two initial boxes for Seller (approx line 106-107 area) are empty.")
- **Secondary Question** (if applicable): (e.g., "Is the Buyer aware they are waiving the home warranty?", "Has the Buyer received and reviewed the Seller's Disclosure?")
"""
    # A missing key is a configuration error that affects every page, so it
    # is raised once up front rather than reported page by page.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )

    images = pdf_to_images(pdf_bytes)

    # The google-genai package imported by this module exposes models through
    # a Client; it has no GenerativeModel class. The client is created once
    # and reused for all pages.
    client = genai.Client(api_key=api_key)

    page_frames = []
    for i, img in enumerate(images):
        try:
            # Send the prompt followed by this page as a JPEG part.
            response = client.models.generate_content(
                model='gemini-2.5-pro-exp-03-25',
                contents=[
                    prompt,
                    types.Part.from_bytes(
                        data=pil_image_to_bytes(img), mime_type="image/jpeg"
                    ),
                ],
            )
            answer_csv = extract_csv_from_response(response)
            answer_df = csv_to_dataframe(answer_csv)
            if not answer_df.empty:
                page_frames.append(answer_df)
            print(f"Processed page {i+1}")
            print("Response:")
            print(answer_csv)
            print("\n" + "="*50 + "\n")
        except Exception as e:
            # Best effort: one bad page must not abort the whole document.
            print(f"Error processing page {i+1}: {str(e)}")

    if not page_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the frame per page.
    return pd.concat(page_frames, ignore_index=True)
def analyze_single_document(images: list, prompt: str) -> str:
    """Send a prompt plus a list of images to Gemini in one request.

    Args:
        images: Items appended after the prompt in the request contents
            (presumably PIL images such as those from ``pdf_to_images`` -
            confirm against callers).
        prompt: Instruction text placed first in the request.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        RuntimeError: If neither ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is
            set in the environment.
    """
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )
    # The google-genai package imported by this module has no GenerativeModel
    # class; requests go through a Client instead.
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model='gemini-2.5-pro-exp-03-25',
        contents=[prompt, *images],
    )
    return response.text
def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro"):
    """Analyse a PDF by uploading the file itself to Gemini.

    The bytes are written to a temporary ``.pdf`` file, uploaded through the
    client's file API and referenced in a single generate-content request.
    The temporary file is removed whether or not the request succeeds.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        prompt: Instruction text sent along with the uploaded file.
        model_name: Name of the Gemini model to query.

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        RuntimeError: If neither ``GEMINI_API_KEY`` nor ``GOOGLE_API_KEY`` is
            set in the environment.
    """
    # Validate configuration before touching the file system.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key not configured: set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )
    # The google-genai package imported by this module has neither
    # GenerativeModel nor upload_file; both operations go through a Client.
    client = genai.Client(api_key=api_key)

    # delete=False so the file survives the ``with`` block and can be
    # uploaded by path; it is removed explicitly in ``finally``.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name
    try:
        uploaded_pdf = client.files.upload(file=tmp_file_path)
        response = client.models.generate_content(
            model=model_name,
            contents=[prompt, uploaded_pdf],
        )
        print(f"Response: {response}")
        return response.text
    finally:
        # Clean up the temporary file even when the request raised.
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
def extract_response_text(response) -> str:
    """Return the text carried by a Gemini response object.

    Lookup order:
      1. ``response.text`` when it is a string.
      2. The first string ``text`` of any part found under
         ``response.result.candidates`` or ``response.candidates``.
      3. ``str(response)`` as a last resort.

    Args:
        response: A response-like object; only attribute access is used.

    Returns:
        Always a ``str`` - never ``None``, even when the response has a
        ``text`` attribute that is ``None``.
    """
    try:
        text = getattr(response, 'text', None)
        if isinstance(text, str):
            return text
        # Candidates may hang off a ``result`` wrapper or off the response
        # itself, depending on the response type.
        for holder in (getattr(response, 'result', None), response):
            for candidate in getattr(holder, 'candidates', None) or []:
                content = getattr(candidate, 'content', None)
                for part in getattr(content, 'parts', None) or []:
                    part_text = getattr(part, 'text', None)
                    # Skip parts without string text instead of returning
                    # None for them.
                    if isinstance(part_text, str):
                        return part_text
        return str(response)
    except Exception as e:
        # Attribute access on the response can itself raise; fall back to the
        # string form rather than failing the caller.
        print(f"Error extracting response text: {str(e)}")
        return str(response)
def extract_csv_from_response(response) -> str:
    """Pull the CSV payload out of a Gemini response.

    Strategy, in order:
      1. The content of a fenced block tagged ``csv`` (any letter case).
      2. The content of an untagged fenced block.
      3. Every line from the first header-looking line (contains a comma and
         ``Category,`` or ``Location,``) up to a closing fence or the end.
      4. The full response text when nothing CSV-like is found.

    Args:
        response: Response object understood by ``extract_response_text``.

    Returns:
        The extracted CSV text; on an unexpected error, the raw response text
        (or ``str(response)``), always as a ``str``.
    """
    try:
        response_text = extract_response_text(response)

        # Preferred: an explicitly tagged ```csv block.
        csv_match = re.search(
            r'```csv(.*?)```', response_text, re.DOTALL | re.IGNORECASE
        )
        if csv_match is None:
            # Models often omit the language tag; accept a bare fence too.
            csv_match = re.search(
                r'```[ \t]*\r?\n(.*?)```', response_text, re.DOTALL
            )
        if csv_match:
            return csv_match.group(1).strip()

        # Fallback: collect lines starting at the header row.
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if in_csv and line.strip().startswith('```'):
                # A fence after the header closes the CSV; do not let it or
                # anything after it leak into the data.
                break
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)
        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        raw_text = getattr(response, 'text', None)
        # Guarantee a string so downstream parsing never receives None.
        return raw_text if isinstance(raw_text, str) else str(response)
def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert CSV text to a DataFrame, tolerating irregular rows.

    The first non-blank line is the header. Rows with more fields than the
    header have the surplus fields merged (comma-joined) into the last
    column; rows with fewer fields are padded with ``None``. If that parse
    fails, ``pandas.read_csv`` is tried with lenient settings.

    Args:
        csv_data: CSV text; ``None``, empty or whitespace-only input yields
            an empty DataFrame.

    Returns:
        The parsed DataFrame (all values are strings on the primary path),
        or an empty DataFrame when nothing could be parsed.
    """
    if not csv_data or not csv_data.strip():
        return pd.DataFrame()

    # Drop blank lines and surrounding whitespace. Done before the ``try`` so
    # the fallback below can always rely on ``cleaned_data`` being bound.
    cleaned_data = "\n".join(
        line.strip() for line in csv_data.split('\n') if line.strip()
    )
    try:
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)
        header = next(reader)
        width = len(header)
        rows = []
        for row in reader:
            if len(row) > width:
                # Unquoted commas in the final field split it apart; glue the
                # surplus fields back into the last column.
                row = row[:width - 1] + [','.join(row[width - 1:])]
            elif len(row) < width:
                # Pad short rows so a single truncated line cannot make the
                # DataFrame constructor reject the whole table.
                row = row + [None] * (width - len(row))
            rows.append(row)
        return pd.DataFrame(rows, columns=header)
    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
def save_csv(csv_data: str, filename: str) -> str:
    """Write CSV text to *filename* and hand the same path back.

    The text is stripped of leading and trailing whitespace before it is
    written. The file is opened as UTF-8 with ``newline=''`` so line endings
    in the data are written unchanged.

    Args:
        csv_data: CSV text to store.
        filename: Destination path; an existing file is overwritten.

    Returns:
        ``filename``, unchanged.
    """
    with open(filename, mode='w', encoding='utf-8', newline='') as out_file:
        out_file.write(csv_data.strip())
    return filename
def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Collect the page count, author and title of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A dict with the keys ``'page_count'``, ``'author'`` and ``'title'``.
        ``'author'`` and ``'title'`` are ``None`` when the reader reports no
        metadata.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    info = {'page_count': len(reader.pages)}
    # Both descriptive fields follow the same rule: read the attribute when
    # metadata is present, otherwise fall back to None.
    for field in ('author', 'title'):
        info[field] = getattr(reader.metadata, field) if reader.metadata else None
    return info