import os
import io
import csv
import re
import tempfile

import pandas as pd
import fitz  # PyMuPDF
import google.generativeai as genai
from PIL import Image
from PyPDF2 import PdfReader

# Configure Gemini at import time from the GOOGLE_API_KEY environment variable
# (avoid hard-coding API keys in source).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))


def configure_gemini(api_key: str):
    """Configure the Gemini API with the provided key."""
    genai.configure(api_key=api_key)

# def pdf_to_images(pdf_bytes: bytes) -> list:
# """Convert PDF bytes to list of PIL Images"""
# return convert_from_bytes(pdf_bytes)
def pdf_to_images(pdf_bytes: bytes) -> list[Image.Image]:
    """Convert PDF bytes to a list of PIL Images using PyMuPDF (no poppler needed)."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    for page in doc:
        # Render the page to a pixmap and wrap the raw RGB samples in a PIL Image.
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    doc.close()
    return images
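
# Note on pdf_to_images: page.get_pixmap() renders at 72 dpi by default.
# If the analysis needs sharper page images, a zoom matrix can be passed,
# for example:
#     pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # roughly 144 dpi
# This is an optional tweak; nothing else in this module relies on it.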


def process_local_pdf(pdf_bytes: bytes):
    """
    Process a local PDF file with Gemini, page by page.

    Args:
        pdf_bytes: Raw bytes of the PDF file to analyze.

    Returns:
        A pandas DataFrame combining the CSV findings from every page.
    """
    # Prompt sent to Gemini for each page of the document set.
prompt = """Please analyze the provided images of the real estate document set and perform the following actions:
1. **Identify Parties**: Determine and list all present parties involved in the transaction. Always identify and include **Seller 1** and **Buyer 1** if they are present in the documents. Additionally, include **Seller 2** and **Buyer 2** only if they are explicitly mentioned.
2. **Identify Missing Items**: For each identified party, including at minimum **Seller 1** and **Buyer 1**, check all pages for any missing signatures or initials. Only check for **Seller 2** or **Buyer 2** if they were identified in step 1.
3. **Identify Checked Boxes**: Locate and list all checkboxes that have been marked or checked.
4. **Generate Secondary Questions**: For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. **Check for Required Paperwork**: Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous'.
6. **Identify Conflicts**: Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. **Provide Location**: For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. **Format Output**: Present all findings in CSV format with the following columns:
- **Category**: (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
- **Location**: (e.g., Sale Contract (Image 8 Pg 1))
- **Line Item(s)**: (e.g., 4)
- **Item Type**: (e.g., Seller 1, Buyer 1, Seller Signature, Seller Initials)
- **Status**: (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict)
- **Details**: (e.g., "Seller signature line (top line) is empty.", "Two initial boxes for Seller (approx line 106-107 area) are empty.")
- **Secondary Question** (if applicable): (e.g., "Is the Buyer aware they are waiving the home warranty?", "Has the Buyer received and reviewed the Seller's Disclosure?")
"""
    # Convert to images
    images = pdf_to_images(pdf_bytes)

    # Process each page
    combined_df = pd.DataFrame()
    for i, img in enumerate(images):
        try:
            model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')  # Updated model name
            # The prompt contains no format placeholders, so it is sent as-is;
            # the page number is only used in the log messages below.
            local_prompt = prompt

            # Send both the prompt and image to Gemini
            response = model.generate_content([local_prompt, img])

            # Extract CSV response
            answer_csv = extract_csv_from_response(response)
            answer_df = csv_to_dataframe(answer_csv)

            # Combine DataFrames if needed
            if not answer_df.empty:
                combined_df = pd.concat([combined_df, answer_df], ignore_index=True)

            print(f"Processed page {i+1}")
            print("Response:")
            print(answer_csv)
            print("\n" + "="*50 + "\n")
        except Exception as e:
            print(f"Error processing page {i+1}: {str(e)}")

    return combined_df


def analyze_single_document(images: list, prompt: str) -> str:
    """Analyze a set of page images as a single document and return the response text."""
    model = genai.GenerativeModel('gemini-2.5-pro-exp-03-25')
    response = model.generate_content([prompt] + images)
    return response.text
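
# A minimal usage sketch for analyzing all pages in one call; "some.pdf" is a
# placeholder path and the prompt text is illustrative only:
#
#     pages = pdf_to_images(open("some.pdf", "rb").read())
#     text = analyze_single_document(pages, "Summarize this document set.")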


def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro"):
    """Analyze a PDF directly using Gemini's PDF support."""
    model = genai.GenerativeModel(model_name)

    # Create a temporary PDF file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        # Use the file upload feature
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        print(f"Response: {response}")
        return response.text
    finally:
        # Clean up temporary file
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
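
# A minimal usage sketch for the direct-PDF path, assuming a local file named
# "contract.pdf" (placeholder) and a configured API key:
#
#     with open("contract.pdf", "rb") as f:
#         print(analyze_pdf_directly(f.read(), "List all parties to this contract."))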


def extract_response_text(response) -> str:
    """Extract text content from a Gemini response object."""
    try:
        if hasattr(response, 'text'):
            return response.text
        elif hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            for candidate in response.result.candidates:
                if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                    for part in candidate.content.parts:
                        if hasattr(part, 'text'):
                            return part.text
        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)


def extract_csv_from_response(response) -> str:
    """Extract CSV data from a Gemini response."""
    try:
        # Get the text content from the response
        response_text = extract_response_text(response)

        # Extract CSV content between ```csv markers
        csv_match = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if csv_match:
            return csv_match.group(1).strip()

        # Fallback: Try to find any CSV-like content
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)

        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)


def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame with error handling."""
    if not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces
    cleaned_data = "\n".join([line.strip() for line in csv_data.split('\n') if line.strip()])

    try:
        # Use CSV reader to handle irregular fields
        rows = []
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)
        header = next(reader)
        for row in reader:
            if len(row) > len(header):
                # Combine extra fields into the last column
                row = row[:len(header)-1] + [','.join(row[len(header)-1:])]
            elif len(row) < len(header):
                # Pad short rows so every row matches the header width
                row = row + [''] * (len(header) - len(row))
            rows.append(row)
        return pd.DataFrame(rows, columns=header)
    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()
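
# A small illustrative example (hypothetical data) of what csv_to_dataframe
# accepts, including a row whose unquoted comma spills into an extra field:
#
#     sample = 'Category,Location,Details\nChecked Box,Pg 1,Inspection waived, buyer aware'
#     df = csv_to_dataframe(sample)  # one row; the extra field is folded into "Details"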


def save_csv(csv_data: str, filename: str) -> str:
    """Save CSV data to a file."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvfile.write(csv_data.strip())
    return filename


def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Extract basic PDF metadata."""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    return {
        'page_count': len(reader.pages),
        'author': reader.metadata.author if reader.metadata else None,
        'title': reader.metadata.title if reader.metadata else None,
    }
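

# A minimal command-line driver sketch, assuming GOOGLE_API_KEY is set in the
# environment; the path is taken from the first argument, and "document.pdf"
# is only a placeholder default.
if __name__ == "__main__":
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "document.pdf"
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    print(get_pdf_metadata(pdf_bytes))

    results = process_local_pdf(pdf_bytes)
    if not results.empty:
        # Write the combined findings next to the input file.
        results.to_csv(pdf_path + ".analysis.csv", index=False)
        print(f"Saved analysis to {pdf_path}.analysis.csv")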