# Source: Hugging Face Spaces - mgbam / AuditXCodeInsights (status: Sleeping)
# File size: 22,008 Bytes

import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
import os
from pathlib import Path
import time  # Added for simulating mock delay

# --- Configuration ---
# Name of the Gemini model that analysis requests are sent to.
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"
# Ceiling for the estimated prompt size, in tokens. Adjust as needed.
MAX_PROMPT_TOKENS_ESTIMATE = 800_000

# Analysis key -> label shown to the user for that analysis.
AVAILABLE_ANALYSES = dict(
    generate_docs="Generate Missing Docstrings/Comments",
    find_bugs="Identify Potential Bugs & Anti-patterns",
    check_style="Check Style Guide Compliance (General)",
    summarize_modules="Summarize Complex Modules/Files",
    suggest_refactoring="Suggest Refactoring Opportunities",
)

# Lower-case file suffixes that count as source code inside an uploaded archive.
CODE_EXTENSIONS = {
    '.py', '.rb', '.php', '.go', '.swift', '.kt',
    '.js', '.ts', '.html', '.css', '.scss',
    '.java', '.cs', '.c', '.cpp', '.h',
    '.sql',
}

# --- Session State Initialization ---
# Seed the mock-mode flag only when it is absent; False selects the real API.
if 'mock_api_call' not in st.session_state:
    st.session_state['mock_api_call'] = False

# --- Gemini API Setup ---
# Module-level model handle; assigned by initialize_gemini_model().
model = None

def initialize_gemini_model():
    """Make sure the module-level Gemini ``model`` is usable.

    In mock mode nothing is initialized. Otherwise, if no model has been
    created yet, the SDK is configured with ``GEMINI_API_KEY`` from Streamlit
    secrets and a ``GenerativeModel`` for ``GEMINI_MODEL_NAME`` is stored in
    the ``model`` global.

    Returns:
        bool: True when mock mode is active or a model is available. False
        only if initialization raised and ``st.stop()`` handed control back.
    """
    global model

    if st.session_state.mock_api_call:
        print("Running in Mock Mode. Skipping Gemini initialization.")
        return True  # Mock mode needs no model

    if model is not None:
        print("Gemini Model already initialized.")
        return True

    try:
        api_key_missing = 'GEMINI_API_KEY' not in st.secrets
        if api_key_missing:
            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
            st.stop()
        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        print("Gemini Model Initialized.")
    except Exception as init_error:
        st.error(f"🚨 Error initializing Gemini SDK: {init_error}")
        st.stop()
        return False

    return True

# --- Helper Functions ---

def estimate_token_count(text):
    """Roughly estimate a token count, assuming about 3 characters per token.

    Args:
        text: Either the text to measure (``str``) or an already computed
            character count (``int``).

    Returns:
        int: The character count floor-divided by 3.
    """
    # This file calls the helper with raw text and also with a precomputed
    # character total; calling len() on the latter would raise TypeError.
    char_count = text if isinstance(text, int) else len(text)
    return char_count // 3

def process_zip_file(uploaded_file, code_extensions=None):
    """Extract source files and their text content from an uploaded ZIP archive.

    Directories, entries with a path component starting with ``.`` and entries
    whose name contains ``__`` are skipped silently. Remaining entries are kept
    only if their lower-cased suffix is in ``code_extensions``.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            bytes of the ZIP archive.
        code_extensions: Optional collection of lower-case suffixes (such as
            ``'.py'``) to treat as code. Defaults to ``CODE_EXTENSIONS``.

    Returns:
        tuple: ``(code_files, total_chars, file_count, ignored_files)`` where
        ``code_files`` maps archive paths to decoded content, ``total_chars``
        is the summed length of that content, ``file_count`` is the number of
        files read, and ``ignored_files`` lists entries skipped because of
        their extension or a read error. If the archive cannot be processed,
        an error is shown and ``(None, 0, 0, [])`` is returned.
    """
    if code_extensions is None:
        code_extensions = CODE_EXTENSIONS

    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []

    try:
        with zipfile.ZipFile(io.BytesIO(uploaded_file.getvalue()), 'r') as zip_ref:
            for member in zip_ref.infolist():
                file_path = Path(member.filename)

                # Skip directories, hidden files, and files with '__' in the name
                if (
                    member.is_dir()
                    or '__' in member.filename
                    or any(part.startswith('.') for part in file_path.parts)
                ):
                    continue

                if file_path.suffix.lower() not in code_extensions:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
                    continue

                try:
                    raw_bytes = zip_ref.read(member)
                except Exception as read_err:
                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                    continue

                # Decode from bytes read once: a second read() on the same
                # member stream returns b'' and would yield empty content.
                try:
                    content = raw_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this cannot fail.
                    content = raw_bytes.decode('latin-1')

                code_files[member.filename] = content
                total_chars += len(content)
                file_count += 1

    except zipfile.BadZipFile:
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []

    return code_files, total_chars, file_count, ignored_files

def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Assemble the Gemini prompt from the code files and the requested analyses.

    Files are appended in dictionary order until the next file would push the
    running token estimate past ``MAX_PROMPT_TOKENS_ESTIMATE``; at that point
    a warning is shown and the remaining files are left out.

    Returns:
        full_prompt (str): The complete prompt, or None if no file fit.
        included_files (list): Names of the files placed in the prompt.
    """
    header = "Analyze the following codebase provided as a collection of file paths and their content.\n\n"
    tokens_used = estimate_token_count(header)
    included_files = []
    segments = []

    for filename, content in code_files_dict.items():
        segment = (
            f"--- START FILE: {filename} ---\n"
            f"{content}\n"
            f"--- END FILE: {filename} ---\n\n"
        )
        segment_tokens = estimate_token_count(segment)

        if tokens_used + segment_tokens > MAX_PROMPT_TOKENS_ESTIMATE:
            st.warning(f"⚠️ Codebase may exceed context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE} tokens). Analysis performed only on the first {len(included_files)} files ({tokens_used} tokens).")
            break

        segments.append(segment)
        tokens_used += segment_tokens
        included_files.append(filename)

    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []

    # One JSON schema line per analysis, listed in the order they are emitted.
    schema_lines = (
        ("generate_docs", '    "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]'),
        ("find_bugs", '    "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]'),
        ("check_style", '    "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]'),
        ("summarize_modules", '    "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]'),
        ("suggest_refactoring", '    "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]'),
    )
    structure_parts = [
        schema_line
        for analysis_key, schema_line in schema_lines
        if analysis_key in requested_analyses
    ]
    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"

    prompt_footer = f"""
**Analysis Task:**
Perform the analyses corresponding to the keys present in the JSON structure below, based *only* on the provided code files ({', '.join(included_files)}).

**Output Format:**
Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.

{json_structure_description}

**JSON Output Only:**
"""
    full_prompt = header + "".join(segments) + prompt_footer
    return full_prompt, included_files

def call_gemini_api(prompt):
    """
    Send the prompt to Gemini, or return canned data when mock mode is on.

    Args:
        prompt: The full prompt text. A falsy value is rejected immediately.

    Returns:
        insights (dict): The parsed JSON object from the response. It is
            ``{"raw_response": <text>}`` when no ``{...}`` span could be
            located, and None on any other failure.
        error_message (str): None on success, otherwise a description of
            what went wrong.
    """
    if not prompt:
        return None, "Prompt generation failed."

    # --- MOCK MODE LOGIC ---
    if st.session_state.mock_api_call:
        st.info(" MOCK MODE: Simulating API call...")
        time.sleep(2)  # Simulate network/processing delay

        # Simulated successful response; it always contains every category,
        # regardless of which analyses were requested.
        mock_json_response = json.dumps({
            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n    \"\"\"Processes the input data using mock logic.\"\"\""}],
            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
            "style_issues": [{"file": "mock/core.py", "line": 5, "description": "Variable 'varName' does not follow snake_case convention."}],
            "module_summaries": [
                {"file": "mock/core.py", "summary": "This file contains the core mock processing logic."},
                {"file": "mock/utils.py", "summary": "Utility functions for mocking."}
            ],
            "refactoring_suggestions": [{"file": "mock/utils.py", "line": 30, "area": "calculate_metrics function", "suggestion": "Function is too long (> 50 lines), consider breaking it down."}]
        })
        st.success("Mock response generated successfully.")
        return json.loads(mock_json_response), None

    # --- REAL API CALL LOGIC ---
    else:
        if not initialize_gemini_model():
            return None, "Gemini Model Initialization Failed."
        if model is None:
            return None, "Gemini model not available."

        try:
            st.write(f"📡 Sending request to {GEMINI_MODEL_NAME}...")
            # Low temperature, with all four listed harm categories blocked at
            # medium severity and above.
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(temperature=0.2),
                safety_settings=[
                    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                ]
            )
            st.write("✅ Response received from AI.")

            # Inner try: failures while reading or parsing the response are
            # reported separately from failures of the request itself.
            try:
                json_response_text = response.text.strip()
                # Remove potential markdown code block fences
                if json_response_text.startswith("```json"):
                    json_response_text = json_response_text[7:]
                if json_response_text.startswith("```"):
                    json_response_text = json_response_text[3:]
                if json_response_text.endswith("```"):
                    json_response_text = json_response_text[:-3]

                # Extract JSON object boundaries: first '{' through last '}'.
                json_start = json_response_text.find('{')
                json_end = json_response_text.rfind('}') + 1

                # NOTE: because of the "+ 1" above, json_end is 0 rather than
                # -1 when no '}' exists, so `json_end != -1` is always true;
                # the `json_end > json_start` comparison rejects that case.
                if json_start != -1 and json_end != -1 and json_end > json_start:
                    final_json_text = json_response_text[json_start:json_end]
                    insights = json.loads(final_json_text)
                    return insights, None
                else:
                    # No usable {...} span: return the raw text together with
                    # an error message so the caller can still display it.
                    st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in response. Displaying raw text.")
                    return {"raw_response": response.text}, "AI response did not contain clear JSON object, showing raw text."

            except json.JSONDecodeError as json_err:
                st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
                st.error("Raw AI Response:")
                st.code(response.text, language='text')
                return None, f"AI response was not valid JSON: {json_err}"
            except AttributeError:
                st.error("🚨 Unexpected API response structure.")
                st.code(f"Response object: {response}", language='text')
                # Best effort: report a block reason if the response has one.
                try:
                    block_reason = response.prompt_feedback.block_reason
                    if block_reason:
                        return None, f"Content blocked by API. Reason: {block_reason}"
                except Exception:
                    pass
                return None, "Unexpected response structure from API."
            except Exception as e:
                st.error(f"🚨 Unexpected issue processing AI response: {e}")
                # Showing the response object is optional; ignore failures.
                try:
                    st.code(f"Response object: {response}", language='text')
                except Exception:
                    pass
                return None, f"Unexpected response structure: {e}"

        except Exception as e:
            # Reached when the request itself (or the surrounding st.write
            # calls) raised; map known failure texts to friendlier messages.
            st.error(f"🚨 An error occurred during API call: {e}")
            error_msg = f"API call failed: {e}"
            if hasattr(e, 'message'):
                if "429" in e.message:
                    error_msg = "API Quota Exceeded or Rate Limit hit. Check your Google Cloud/AI Studio dashboard."
                elif "API key not valid" in e.message:
                    error_msg = "Invalid Gemini API Key. Please check `.streamlit/secrets.toml`."
                elif "blocked" in e.message.lower():
                    error_msg = "Content blocked due to safety settings. Review input code or adjust safety settings if appropriate."
            # Consulted only when the exception has no `message` attribute.
            elif "block_reason: SAFETY" in str(e):
                error_msg = "Content blocked due to safety settings. Review input code or adjust safety settings if appropriate."

            return None, error_msg

def display_results(results_json, requested_analyses):
    """Show the analysis report for the requested analyses in Streamlit.

    Non-dict results and results carrying a ``raw_response`` key are shown
    as-is and nothing else is rendered. Otherwise one section is rendered per
    requested analysis, followed by a JSON download button.
    """
    st.header("📊 Analysis Report")

    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return

    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return

    def display_list_items(items, fields):
        """Render each finding as a bullet with its labelled fields and body text."""
        if not items:
            st.markdown("_No items found for this category._")
            st.divider()
            return

        for finding in items:
            labelled = []
            for field_key, field_label in fields.items():
                field_value = finding.get(field_key, 'N/A')
                if field_value != 'N/A':
                    labelled.append(f"**{field_label}:** {field_value}")
            st.markdown("- " + " - ".join(labelled))

            # Only the first body field present is shown, in this priority.
            if 'suggestion' in finding:
                st.code(finding['suggestion'], language='text')
            elif 'description' in finding:
                st.markdown(f"  > {finding['description']}")
            elif 'summary' in finding:
                st.markdown(f"  > {finding['summary']}")
        st.divider()

    # (analysis key, key in the results JSON, field -> label shown per item)
    section_table = (
        ("generate_docs", "documentation_suggestions", {"file": "File", "line": "Line"}),
        ("find_bugs", "potential_bugs", {"file": "File", "line": "Line", "severity": "Severity"}),
        ("check_style", "style_issues", {"file": "File", "line": "Line"}),
        ("summarize_modules", "module_summaries", {"file": "File"}),
        ("suggest_refactoring", "refactoring_suggestions", {"file": "File", "line": "Line", "area": "Area"}),
    )
    display_config = {
        analysis_key: {
            "key": result_key,
            "title": AVAILABLE_ANALYSES[analysis_key],
            "fields": field_labels,
        }
        for analysis_key, result_key, field_labels in section_table
    }

    any_results = False
    for analysis_key in requested_analyses:
        section = display_config.get(analysis_key)
        if section is None:
            continue
        st.subheader(section["title"])
        section_items = results_json.get(section["key"], [])
        display_list_items(section_items, section["fields"])
        if section_items:
            any_results = True

    if not any_results:
        st.info("No specific findings were identified in the analysis based on your selections.")

    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json"
    )

# --- Streamlit App Main Interface ---
# Top-level script: builds the page, reads the upload, runs the analysis when
# the button is pressed, and renders the outcome.
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")

st.title("🤖 Codebase Audit & Documentation Assistant")
st.markdown(f"Upload your codebase (`.zip`) for analysis using **{GEMINI_MODEL_NAME}**.")

# Sidebar controls
with st.sidebar:
    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode (for Testing)",
        value=st.session_state.mock_api_call,
        help="If enabled, uses fake data instead of calling the real Gemini API. Saves cost during testing."
    )
    if st.session_state.mock_api_call:
        st.info("Mock API Mode ACTIVE")
    else:
        st.info("Using REAL Gemini API")

    st.divider()
    st.header("🔎 Select Analyses")
    # Every analysis is ticked by default; collect the keys left ticked.
    selected_analyses = []
    for key, name in AVAILABLE_ANALYSES.items():
        if st.checkbox(name, value=True, key=f"cb_{key}"):
            selected_analyses.append(key)

    st.divider()
    st.header("📄 How To Use")
    st.info(
        "1. Set API Key in `.streamlit/secrets.toml` (if not using Mock Mode).\n"
        "2. Toggle Mock Mode if needed.\n"
        "3. Select desired analyses.\n"
        "4. Create a **ZIP archive** of your codebase.\n"
        "5. Upload the `.zip` file.\n"
        "6. Click 'Analyze Codebase'.\n"
        "7. Review the report."
    )
    # CODE_EXTENSIONS is a set; sort it so the note reads the same on every run.
    st.info(f"**Note:** Only files with common code extensions ({', '.join(sorted(CODE_EXTENSIONS))}) are processed. Analysis might be limited (~{MAX_PROMPT_TOKENS_ESTIMATE:,} est. tokens).")
    st.divider()
    st.warning("⚠️ **Privacy:** Code content is sent to the Google Gemini API if Mock Mode is OFF. Do not upload sensitive code if uncomfortable.")

# Main content area
uploaded_file = st.file_uploader("📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader")
analysis_triggered = False
# NOTE: a plain local that is reset on every script run, so results are not
# kept across Streamlit reruns.
results_cache = None

if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    with st.spinner("Inspecting ZIP file..."):
        code_files, total_chars, file_count, ignored_files = process_zip_file(uploaded_file)

    # code_files is None when ZIP processing failed (error already displayed).
    if code_files is not None:
        # Estimate from the file contents themselves; total_chars is an int
        # and must not be handed to a helper that measures text.
        estimated_tokens = sum(estimate_token_count(content) for content in code_files.values())
        st.info(f"Found **{file_count}** relevant code files ({total_chars:,} characters). Est. tokens: ~{estimated_tokens:,}")
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')

        # The button is disabled when nothing is selected or no code was found,
        # so a click always comes with at least one analysis and one file.
        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
        if st.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
            analysis_triggered = True
            st.divider()
            with st.spinner(f"🚀 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... This may take time."):
                analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                if analysis_prompt and included_files_in_prompt:
                    st.write(f"Analyzing {len(included_files_in_prompt)} files...")
                    results_json, error_message = call_gemini_api(analysis_prompt)
                    results_cache = (results_json, error_message)
                elif not included_files_in_prompt:
                    results_cache = (None, "Could not proceed: No files included in prompt (check token limits/errors).")
                else:
                    results_cache = (None, "Failed to generate analysis prompt.")

if analysis_triggered and results_cache:
    results_json, error_message = results_cache
    st.divider()
    if error_message:
        st.error(f"Analysis Failed: {error_message}")
        # A raw, unparsed response may accompany the error; show it if present.
        if results_json and isinstance(results_json, dict) and "raw_response" in results_json:
            st.subheader("Raw AI Response")
            st.code(results_json["raw_response"], language='text')
    elif results_json:
        display_results(results_json, selected_analyses)
    else:
        st.error("Analysis did not return results or an unknown error occurred.")
elif not uploaded_file:
    st.info("Upload a ZIP file containing your source code to begin.")

st.divider()
st.markdown("_Assistant powered by Google Gemini._")