first commit
Files changed:
- .env +1 -0
- app.py +183 -0
- dockerfile +25 -0
- requirements.txt +8 -0
- utils.py +146 -0
.env
ADDED
@@ -0,0 +1 @@
GEMINI_API_KEY=AIzaSyAP85jSUKncrIGOAhm3Gvo-TYra_e1wmEA
app.py
ADDED
@@ -0,0 +1,183 @@
import streamlit as st
import os
from dotenv import load_dotenv
from utils import (
    configure_gemini,
    analyze_pdf_directly,
    csv_to_dataframe,
    save_csv,
    get_pdf_metadata,
    extract_csv_from_response,
    pdf_to_images,
    analyze_single_document
)
from datetime import datetime

# Load environment variables
load_dotenv()

# Configure page settings
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS styling
st.markdown("""
<style>
    .document-card {
        border-radius: 10px;
        padding: 1.5rem;
        margin-bottom: 1.5rem;
        background-color: white;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        transition: transform 0.2s;
    }
    .document-card:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background-color: #4285F4;
        color: white;
        border-radius: 8px;
        padding: 0.5rem 1.5rem;
        font-weight: 500;
    }
    .analysis-section {
        border-left: 4px solid #4285F4;
        padding-left: 1rem;
        margin-top: 1.5rem;
    }
</style>
""", unsafe_allow_html=True)

# App header
st.title("📄 PDF Document Analyzer")
st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")

# Analysis prompt
PROMPT = """Please analyze the provided images of the real estate document set and perform the following actions:

1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.
3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.
4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:
    * Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
    * Location (Document Name/Page, e.g., Sale Contract Pg 2)
    * Line Item(s) (Approximate line number or location description)
    * Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
    * Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
    * Details (Specifics like names, text of the checkbox, description of the issue or document status)
    * Secondary Question (if applicable) (The question generated in step 4)

Please apply this analysis to the entire set of documents provided.
"""

# Sidebar configuration
with st.sidebar:
    st.header("Configuration")
    api_key = st.text_input(
        "Enter Gemini API Key:",
        type="password",
        value=os.getenv("GEMINI_API_KEY", "")
    )
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    st.info("""
    **Features:**
    - PDF processing using images partitioned by page
    - Individual analysis for each document
    - Downloadable CSV reports
    """)

# Main app content
uploaded_files = st.file_uploader(
    "Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis"
)

if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

            # Display document info
            metadata = get_pdf_metadata(uploaded_file.getvalue())
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(uploaded_file.getvalue()) / 1024:.1f} KB")
            with col3:
                if st.button("Analyze Document", key=f"analyze_{i}"):
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        try:
                            # Convert the PDF into one image per page
                            images = pdf_to_images(uploaded_file.getvalue())

                            # Analyze the document
                            raw_response = analyze_single_document(images, PROMPT)

                            # Alternative: send the PDF bytes directly instead of page images
                            # raw_response = analyze_pdf_directly(
                            #     pdf_bytes=uploaded_file.getvalue(),
                            #     prompt=PROMPT,
                            #     model_name="gemini-1.5-pro"  # or "gemini-1.5-flash"
                            # )

                            # Process response
                            csv_data = extract_csv_from_response(raw_response)

                            # Display results in an expandable section
                            with st.expander("View Analysis Results", expanded=True):
                                if csv_data:
                                    df = csv_to_dataframe(csv_data)
                                    if not df.empty:
                                        st.dataframe(df)

                                        # Download button
                                        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                                        csv_filename = f"{uploaded_file.name}_analysis_{timestamp}.csv"

                                        st.download_button(
                                            label="Download Analysis",
                                            data=csv_data,
                                            file_name=csv_filename,
                                            mime="text/csv",
                                            key=f"download_{i}"
                                        )
                                    else:
                                        st.warning("No tabular data found in response")
                                        st.markdown("### Full Response")
                                        st.write(raw_response)
                                else:
                                    st.warning("No CSV data found in response")
                                    st.markdown("### Full Response")
                                    st.write(raw_response)

                        except Exception as e:
                            st.error(f"Analysis failed: {str(e)}")

            st.markdown("---")

elif not api_key:
    st.warning("⚠️ Please enter your Gemini API key in the sidebar to proceed")

elif not uploaded_files:
    st.info("📤 Please upload PDF documents using the file uploader above")
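
Note: the UI wires these pieces together, but the same pipeline can be exercised headlessly. A minimal sketch, assuming a valid GEMINI_API_KEY in the environment and a local file sample.pdf (both placeholders, not part of this repo):

import os
from dotenv import load_dotenv
from utils import (configure_gemini, pdf_to_images,
                   analyze_single_document, extract_csv_from_response,
                   csv_to_dataframe)

load_dotenv()
configure_gemini(os.environ["GEMINI_API_KEY"])

with open("sample.pdf", "rb") as f:        # placeholder input file
    images = pdf_to_images(f.read())       # one PIL image per page

raw = analyze_single_document(images, "Summarize this document as CSV.")
print(csv_to_dataframe(extract_csv_from_response(raw)))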
dockerfile
ADDED
@@ -0,0 +1,25 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9-slim

# Install system dependencies first as root (poppler is required by pdf2image)
RUN apt-get update && \
    apt-get install -y --no-install-recommends poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app

# Copy requirements first for better caching
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the rest of the application
COPY --chown=user . /app

# Run the application (app.py is a Streamlit app, so launch it with
# streamlit rather than uvicorn, which expects an ASGI app object)
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
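
Note: a minimal local build-and-run sketch (the image tag pdf-analyzer is an arbitrary choice, not part of the repo):

docker build -t pdf-analyzer .
docker run -p 7860:7860 pdf-analyzer

Port 7860 matches what Hugging Face Spaces expects, so the app is then reachable at http://localhost:7860.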
requirements.txt
ADDED
@@ -0,0 +1,8 @@
streamlit
google-generativeai
pandas
pillow
python-dotenv
PyPDF2>=3.0.0
pdf2image>=1.16.3
# Note: poppler is a system package (installed via apt in the dockerfile),
# not a pip dependency, so it is intentionally not listed here.
utils.py
ADDED
@@ -0,0 +1,146 @@
import google.generativeai as genai
import os
import pandas as pd
import io
import tempfile
import re
import csv
from PyPDF2 import PdfReader
from PIL import Image
from pdf2image import convert_from_bytes

def configure_gemini(api_key: str):
    """Configure Gemini API with the provided key"""
    genai.configure(api_key=api_key)

def pdf_to_images(pdf_bytes: bytes) -> list:
    """Convert PDF bytes to a list of PIL Images (one per page)"""
    return convert_from_bytes(pdf_bytes)

def analyze_single_document(images: list, prompt: str) -> str:
    """Analyze a single document (as page images) and return the response text"""
    model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21')
    response = model.generate_content([prompt] + images)
    return response.text

def analyze_pdf_directly(pdf_bytes: bytes, prompt: str, model_name: str = "gemini-1.5-pro") -> str:
    """Analyze a PDF directly using Gemini's native PDF support"""
    model = genai.GenerativeModel(model_name)

    # Create a temporary PDF file
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_bytes)
        tmp_file_path = tmp_file.name

    try:
        # Use the file upload feature
        response = model.generate_content(
            [prompt, genai.upload_file(tmp_file_path)]
        )
        return response.text
    finally:
        # Clean up temporary file
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)

def extract_response_text(response) -> str:
    """Extract text content from a Gemini response object (plain strings pass through)"""
    try:
        if hasattr(response, 'text'):
            return response.text
        elif hasattr(response, 'result') and hasattr(response.result, 'candidates'):
            for candidate in response.result.candidates:
                if hasattr(candidate, 'content') and hasattr(candidate.content, 'parts'):
                    for part in candidate.content.parts:
                        if hasattr(part, 'text'):
                            return part.text
        return str(response)
    except Exception as e:
        print(f"Error extracting response text: {str(e)}")
        return str(response)

def extract_csv_from_response(response) -> str:
    """Extract CSV data from a Gemini response"""
    try:
        # Get the text content from the response
        response_text = extract_response_text(response)

        # Extract CSV content between ```csv markers
        csv_match = re.search(r'```csv(.*?)```', response_text, re.DOTALL)
        if csv_match:
            return csv_match.group(1).strip()

        # Fallback: try to find any CSV-like content
        lines = []
        in_csv = False
        for line in response_text.split('\n'):
            if ',' in line and ('Category,' in line or 'Location,' in line):
                in_csv = True
            if in_csv:
                lines.append(line)
        if lines:
            return '\n'.join(lines)

        return response_text  # Return full response if no CSV found
    except Exception as e:
        print(f"Error extracting CSV: {str(e)}")
        return response.text if hasattr(response, 'text') else str(response)

def csv_to_dataframe(csv_data: str) -> pd.DataFrame:
    """Convert a CSV string to a pandas DataFrame with error handling"""
    if not csv_data.strip():
        return pd.DataFrame()

    # Clean line breaks and extra spaces (done before the try block so the
    # pandas fallback below can reuse it)
    cleaned_data = "\n".join([line.strip() for line in csv_data.split('\n') if line.strip()])

    try:
        # Use the csv reader to handle irregular fields
        rows = []
        reader = csv.reader(io.StringIO(cleaned_data),
                            delimiter=',',
                            quotechar='"',
                            skipinitialspace=True)

        header = next(reader)
        for row in reader:
            if len(row) > len(header):
                # Combine extra fields into the last column
                row = row[:len(header)-1] + [','.join(row[len(header)-1:])]
            elif len(row) < len(header):
                # Pad short rows so DataFrame construction doesn't fail
                row = row + [''] * (len(header) - len(row))
            rows.append(row)

        return pd.DataFrame(rows, columns=header)

    except Exception as e:
        print(f"CSV conversion error: {str(e)}")
        try:
            # Fallback to pandas with flexible parsing
            return pd.read_csv(io.StringIO(cleaned_data),
                               on_bad_lines='warn',
                               engine='python',
                               quotechar='"',
                               skipinitialspace=True)
        except Exception as fallback_error:
            print(f"Fallback conversion failed: {str(fallback_error)}")
            return pd.DataFrame()

def save_csv(csv_data: str, filename: str) -> str:
    """Save CSV data to a file"""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvfile.write(csv_data.strip())
    return filename

def get_pdf_metadata(pdf_bytes: bytes) -> dict:
    """Extract basic PDF metadata"""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    return {
        'page_count': len(reader.pages),
        'author': reader.metadata.author if reader.metadata else None,
        'title': reader.metadata.title if reader.metadata else None
    }
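
Note: the parsing helpers can be checked offline with a fabricated response string (no API call needed), since extract_response_text passes plain strings through:

from utils import extract_csv_from_response, csv_to_dataframe

fake_response = """Here are the findings:
```csv
Category,Location,Status
Missing Item,Sale Contract Pg 2,Missing
```"""

csv_text = extract_csv_from_response(fake_response)  # pulls out the ```csv block
df = csv_to_dataframe(csv_text)                      # -> one-row DataFrame
print(df)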