# Hugging Face Space: mgbam/AuditXCodeInsights (status: Sleeping)
# File size: 27,540 Bytes

import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
import os  # For API key usage
from pathlib import Path
import time
import re
import plotly.express as px
import pandas as pd

# --- Configuration ---
# Ceiling on the *estimated* prompt size; the prompt builder stops adding
# files once this estimate would be exceeded.
MAX_PROMPT_TOKENS_ESTIMATE = 800_000
# Number of findings revealed per "Show more" step in the report.
RESULTS_PAGE_SIZE = 25

# Analysis identifier -> label shown next to its sidebar checkbox.
AVAILABLE_ANALYSES = dict(
    generate_docs="Generate Missing Docstrings/Comments",
    find_bugs="Identify Potential Bugs & Anti-patterns",
    check_style="Check Style Guide Compliance (General)",
    summarize_modules="Summarize Complex Modules/Files",
    suggest_refactoring="Suggest Refactoring Opportunities",
)
# Lower-case file suffixes (with the leading dot) treated as source code.
CODE_EXTENSIONS = set(
    ".py .js .java .c .cpp .h .cs .go .rb "
    ".php .swift .kt .ts .html .css .scss .sql".split()
)

# --- Session State Initialization ---
# Seed each key only when it is absent, so values written during earlier
# reruns of the script are left untouched.
for _state_key, _initial_value in (
    ('mock_api_call', False),
    ('analysis_results', None),
    ('error_message', None),
    ('analysis_requested', False),
    ('selected_model_name', None),   # Holds internal model name
    ('available_models_dict', {}),   # Mapping: display_name -> internal name
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# --- Gemini API Setup & Model Discovery ---
model = None  # Module-level handle to the initialized model instance

@st.cache_data(ttl=3600)
def _list_generative_models(api_key):
    """
    Queries the API for models that support 'generateContent'.

    Returns a dict mapping display_name -> internal model name. Exceptions
    are deliberately allowed to propagate: a call that raises stores nothing
    in the cache, so a transient failure is retried on the next rerun instead
    of being remembered for the whole TTL.
    """
    genai.configure(api_key=api_key)
    print("Listing available models via API...")
    model_dict = {
        m.display_name: m.name
        for m in genai.list_models()
        if 'generateContent' in m.supported_generation_methods
    }
    print(f"Found {len(model_dict)} compatible models.")
    return model_dict


def get_available_models():
    """
    Lists models supporting 'generateContent' using the API key.

    Returns a dict mapping display_name -> internal model name, or an empty
    dict when the API key is missing or the listing fails (the failure is
    reported with st.error). Only successful listings are cached.
    """
    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            print("API key not found in secrets during model listing attempt.")
            return {}
        return _list_generative_models(st.secrets["GEMINI_API_KEY"])
    except Exception as e:
        # UI boundary: surface the problem and fall back to "no models".
        st.error(f"🚨 Error listing available models: {e}")
        return {}

def initialize_gemini_model():
    """
    Ensures the global `model` matches the model name selected in session state.

    Returns True when mock mode is active or a matching model instance is
    available, and False when no model name is selected. On a missing API key
    or an initialization error the problem is shown with st.error and the
    script run is halted via st.stop().
    """
    global model
    wanted_name = st.session_state.get('selected_model_name')

    # Mock mode never needs a real model instance.
    if st.session_state.mock_api_call:
        return True

    if model is not None:
        if model.model_name == wanted_name:
            return True
        # A different model was picked: drop the stale instance and fall
        # through to a fresh initialization below.
        print("Model changed. Re-initializing...")
        model = None

    if not wanted_name:
        return False

    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
            st.stop()
        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
        print(f"Initializing Gemini Model: {wanted_name}")
        model = genai.GenerativeModel(model_name=wanted_name)
        print(f"Gemini Model Initialized ({wanted_name}).")
        return True
    except Exception as e:
        st.error(f"🚨 Error initializing selected Gemini model '{wanted_name}': {e}")
        st.session_state.selected_model_name = None
        st.stop()
        return False

# --- Helper Functions ---
def estimate_token_count(text):
    """
    Returns a rough token estimate of one token per three characters.

    `text` may be a string (its length is used) or an integer that already
    holds a total character count.
    """
    char_total = text if isinstance(text, int) else len(text)
    return char_total // 3

def _is_excluded_zip_path(path):
    """Returns True for hidden entries and entries inside dunder directories."""
    parts = path.parts
    if any(part.startswith('.') for part in parts):
        return True
    # Only *directories* whose name starts with '__' (e.g. __pycache__,
    # __MACOSX) are excluded. The earlier substring test on the whole path
    # also dropped real source files such as __init__.py and __main__.py.
    return any(part.startswith('__') for part in parts[:-1])


@st.cache_data(max_entries=5)
def process_zip_file_cached(file_id, file_size, file_content_bytes):
    """
    Processes a ZIP file and extracts code files.

    `file_id` and `file_size` are not read by the body; they only take part
    in the cache key together with `file_content_bytes`.

    Returns (code_files dict, total_chars, file_count, ignored_files list).
    `code_files` maps each archive path to its decoded text. When the archive
    cannot be read, an error is shown and (None, 0, 0, []) is returned.
    """
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    status_placeholder = st.empty()
    progress_bar = status_placeholder.progress(0)
    try:
        with zipfile.ZipFile(io.BytesIO(file_content_bytes), 'r') as zip_ref:
            members = zip_ref.infolist()
            total_members = len(members)
            for i, member in enumerate(members):
                if i % 10 == 0:
                    progress_bar.progress(int((i / total_members) * 100))
                file_path = Path(member.filename)
                if member.is_dir() or _is_excluded_zip_path(file_path):
                    continue
                if file_path.suffix.lower() not in CODE_EXTENSIONS:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
                    continue
                try:
                    with zip_ref.open(member) as file:
                        file_bytes = file.read()
                except Exception as read_err:
                    # Best effort: one unreadable member must not abort the archive.
                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                    continue
                try:
                    content = file_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this fallback cannot fail.
                    content = file_bytes.decode('latin-1')
                code_files[member.filename] = content
                total_chars += len(content)
                file_count += 1
            progress_bar.progress(100)
    except zipfile.BadZipFile:
        st.error("🚨 Invalid ZIP.")
        return None, 0, 0, []
    except Exception as e:
        st.error(f"🚨 ZIP Error: {e}")
        return None, 0, 0, []
    finally:
        # Remove the progress indicator on every exit path.
        status_placeholder.empty()
    if file_count == 0:
        if not ignored_files:
            st.warning("No code files found.")
        else:
            st.warning("No code files found; some skipped.")
    return code_files, total_chars, file_count, ignored_files

def construct_analysis_prompt(code_files_dict, requested_analyses):
    """
    Builds the audit prompt: auditor instructions, the code files, the selected
    tasks and the required JSON response layout.

    Files are added in dict order until the estimated token count would exceed
    MAX_PROMPT_TOKENS_ESTIMATE; the remaining files are left out with a warning.
    Returns (full_prompt, included_files), or (None, []) when no file fits.
    """
    header = (
        "You are a highly skilled code auditor. Analyze the following codebase in detail.\n"
        "For each issue, provide:\n"
        "  - A short summary with line references (or approximate line references).\n"
        "  - A severity level (Low, Medium, High).\n"
        "  - A recommended fix or code snippet if applicable.\n\n"
        "Here is the code:\n\n"
    )
    tokens_used = estimate_token_count(header)
    included_files = []
    file_sections = []

    for filename, content in code_files_dict.items():
        section = f"--- START FILE: {filename} ---\n{content}\n--- END FILE: {filename} ---\n\n"
        section_tokens = estimate_token_count(section)
        if tokens_used + section_tokens > MAX_PROMPT_TOKENS_ESTIMATE:
            st.warning(f"⚠️ Exceeded context limit after {len(included_files)} files.")
            break
        file_sections.append(section)
        tokens_used += section_tokens
        included_files.append(filename)

    if not included_files:
        st.error("🚨 No code files included in prompt.")
        return None, []

    # Task text per analysis key, in the fixed order it appears in the prompt.
    task_instructions = (
        ("generate_docs",
         "1) Generate missing docstrings/comments using PEP 257 style. Provide recommended text and line references.\n"),
        ("find_bugs",
         "2) Identify potential bugs & anti-patterns. For each, include severity, line references, and a recommended fix.\n"),
        ("check_style",
         "3) Check style guide compliance (PEP 8 or similar). Include line references, severity, and suggested changes.\n"),
        ("summarize_modules",
         "4) Summarize each module/file by describing its primary responsibilities.\n"),
        ("suggest_refactoring",
         "5) Suggest refactoring opportunities with code snippets and justification, including line references.\n"),
    )
    selected_tasks = [
        instruction
        for analysis_key, instruction in task_instructions
        if analysis_key in requested_analyses
    ]

    response_format = (
        "\nFormat your response in valid JSON with the following structure:\n"
        "{\n"
        "   \"documentation_suggestions\": [ {\"file\": \"...\", \"line\": \"...\", \"summary\": \"...\", \"severity\": \"Low|Medium|High\", \"suggestion\": \"...\"}, ... ],\n"
        "   \"potential_bugs\": [ {\"file\": \"...\", \"line\": \"...\", \"summary\": \"...\", \"severity\": \"Low|Medium|High\", \"suggestion\": \"...\"}, ... ],\n"
        "   \"style_issues\": [ ... ],\n"
        "   \"module_summaries\": [ {\"file\": \"...\", \"summary\": \"...\"}, ... ],\n"
        "   \"refactoring_suggestions\": [ {\"file\": \"...\", \"line\": \"...\", \"area\": \"...\", \"summary\": \"...\", \"suggestion\": \"...\"}, ... ]\n"
        "}\n"
        "Only output valid JSON (no markdown formatting)!\n"
    )

    full_prompt = "".join(
        [header, *file_sections, "\n\nYour tasks are:\n", *selected_tasks, response_format]
    )
    return full_prompt, included_files

def extract_json_from_text(text):
    """
    Attempts to extract a balanced JSON object from the given text.

    Scans from the first '{' and returns the substring up to the brace that
    closes it. Braces inside double-quoted string literals (including strings
    that contain escaped quotes) are ignored, so values such as code snippets
    do not end the object early.

    Returns the extracted substring, or None when the text is empty, has no
    '{', or the object is never closed.
    """
    if not text:
        return None
    start = text.find('{')
    if start == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for index in range(start, len(text)):
        char = text[index]
        if in_string:
            if escaped:
                # The character after a backslash never terminates the string.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[start:index + 1]
    return None

def fix_json(text):
    """
    Attempt to fix common JSON formatting issues.

    Removes trailing commas before a closing brace/bracket and inserts commas
    between adjacent objects. The repairs are applied only outside
    double-quoted string literals, so string values (for example code
    snippets containing ", ]" or "} {") are returned unchanged.
    """
    # Splitting on a capturing group keeps the string literals in the result:
    # even indexes hold text outside strings, odd indexes hold the literals.
    segments = re.split(r'("(?:\\.|[^"\\])*")', text, flags=re.DOTALL)
    for index in range(0, len(segments), 2):
        outside = segments[index]
        # Remove trailing commas before a closing brace or bracket.
        outside = re.sub(r',\s*([}\]])', r'\1', outside)
        # Insert commas between adjacent objects if missing.
        outside = re.sub(r'}\s*{', '},{', outside)
        segments[index] = outside
    return "".join(segments)

def call_gemini_api(prompt):
    """
    Calls the Gemini API using the provided prompt.

    Returns a (result, error_message) pair:
      - (parsed JSON dict, None) on success or in mock mode;
      - ({"raw_response": text}, message) when no JSON object can be located;
      - (None, message) for every other failure.
    """
    if not prompt:
        return None, "Prompt generation failed."

    if st.session_state.mock_api_call:
        st.info(" MOCK MODE: Simulating API call...")
        time.sleep(1)
        mock_json_response = json.dumps({
            "documentation_suggestions": [],
            "potential_bugs": [],
            "style_issues": [],
            "module_summaries": [],
            "refactoring_suggestions": []
        })
        st.success("Mock response generated.")
        return json.loads(mock_json_response), None

    if not initialize_gemini_model():
        return None, "Gemini Model Initialization Failed."
    if model is None:
        return None, "Gemini model not selected or available."

    # Bound up front so the exception handlers can always reference it, even
    # when the failure happens before the API call returns.
    response = None
    try:
        api_status = st.empty()
        api_status.info(f"📡 Sending request to {model.model_name} (Est. prompt tokens: {estimate_token_count(prompt):,})... Please wait.")
        start_time = time.time()
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
                for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH",
                          "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]
            ]
        )
        end_time = time.time()
        api_status.success(f"✅ Response received from AI ({model.model_name}) in {end_time - start_time:.2f}s.")
        time.sleep(1)
        api_status.empty()

        # Remove markdown formatting if present.
        json_response_text = response.text.strip().replace("```json", "").replace("```", "")

        # Candidate spans, most precise first: the balanced object starting at
        # the first '{', then the greedy first-'{'-to-last-'}' span.
        candidates = []
        balanced = extract_json_from_text(json_response_text)
        if balanced:
            candidates.append(balanced)
        greedy = re.search(r'({.*})', json_response_text, re.DOTALL)
        if greedy and greedy.group(1) not in candidates:
            candidates.append(greedy.group(1))

        if not candidates:
            st.warning("⚠️ Could not extract a valid JSON object.")
            return {"raw_response": response.text}, "AI response did not contain clear JSON object."

        # Try each candidate as-is, then after the best-effort repair.
        last_error = None
        for candidate in candidates:
            for attempt in (candidate, fix_json(candidate)):
                try:
                    return json.loads(attempt), None
                except json.JSONDecodeError as json_err:
                    last_error = json_err

        st.error(f"🚨 Error parsing JSON after fix attempt: {last_error}")
        st.code(response.text, language='text')
        return None, f"AI response not valid JSON: {last_error}"
    except AttributeError:
        st.error("🚨 Unexpected API response structure (AttributeError).")
        st.code(f"Response object: {response}", language='text')
        return None, "Unexpected response structure (AttributeError)."
    except Exception as e:
        st.error(f"🚨 Unexpected issue processing response: {e}")
        try:
            st.code(f"Response object: {response}", language='text')
        except Exception:
            # Rendering the response is diagnostic only; never mask the real error.
            pass
        return None, f"Unexpected response structure: {e}"

def display_results(results_json, requested_analyses):
    """
    Displays the analysis results with pagination and a JSON download option.

    `results_json` is the parsed report dict and `requested_analyses` the
    analysis keys to render. For every item the location fields are shown,
    followed by its description/summary and, when present, its suggestion.
    A section that is not a list is treated as empty, and a list entry that is
    not a dict is shown as plain text, so an off-format reply does not crash
    the page.
    """
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return

    display_config = {
        "generate_docs": {
            "key": "documentation_suggestions",
            "title": AVAILABLE_ANALYSES["generate_docs"],
            "fields": {"file": "File", "line": "Line"}
        },
        "find_bugs": {
            "key": "potential_bugs",
            "title": AVAILABLE_ANALYSES["find_bugs"],
            "fields": {"file": "File", "line": "Line", "severity": "Severity"}
        },
        "check_style": {
            "key": "style_issues",
            "title": AVAILABLE_ANALYSES["check_style"],
            "fields": {"file": "File", "line": "Line"}
        },
        "summarize_modules": {
            "key": "module_summaries",
            "title": AVAILABLE_ANALYSES["summarize_modules"],
            "fields": {"file": "File"}
        },
        "suggest_refactoring": {
            "key": "refactoring_suggestions",
            "title": AVAILABLE_ANALYSES["suggest_refactoring"],
            "fields": {"file": "File", "line": "Line", "area": "Area"}
        },
    }
    any_results_found = False
    for analysis_key in requested_analyses:
        if analysis_key not in display_config:
            continue
        config = display_config[analysis_key]
        items = results_json.get(config["key"])
        if not isinstance(items, list):
            # Missing, null or otherwise malformed section: nothing to page through.
            items = []
        total_items = len(items)
        st.subheader(f"{config['title']} ({total_items} found)")
        if items:
            any_results_found = True
            state_key = f"visible_{analysis_key}"
            if state_key not in st.session_state:
                st.session_state[state_key] = RESULTS_PAGE_SIZE
            visible_count = st.session_state[state_key]
            for item in items[:visible_count]:
                if not isinstance(item, dict):
                    st.markdown(f"- {item}")
                    continue
                details = [
                    f"**{field_label}:** `{item.get(field_key, 'N/A')}`" if field_key == 'file'
                    else f"**{field_label}:** {item.get(field_key, 'N/A')}"
                    for field_key, field_label in config["fields"].items()
                    if item.get(field_key, 'N/A') != 'N/A'
                ]
                st.markdown("- " + " - ".join(details))
                # The explanation and the proposed fix are independent pieces
                # of information; show both when the item carries both.
                if 'description' in item:
                    st.markdown(f"  > {item['description']}")
                elif 'summary' in item:
                    st.markdown(f"  > {item['summary']}")
                if 'suggestion' in item:
                    st.code(str(item['suggestion']), language='text')
            if total_items > visible_count:
                if st.button(f"Show more ({total_items - visible_count} remaining)", key=f"more_{analysis_key}"):
                    st.session_state[state_key] += RESULTS_PAGE_SIZE
                    st.rerun()
        else:
            st.markdown("_No items found for this category._")
        st.divider()
    if not any_results_found:
        st.info("No specific findings were identified.")
    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json"
    )

# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit & Documentation Assistant")

# --- Sidebar with Enhancements ---
with st.sidebar:
    # Dark Mode Toggle: injects a CSS override while the box is ticked.
    dark_mode = st.checkbox("Enable Dark Mode", value=False)
    if dark_mode:
        st.markdown(
            """
            <style>
            .reportview-container, .main {
                background-color: #2E2E2E;
                color: white;
            }
            </style>
            """,
            unsafe_allow_html=True
        )

    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode",
        value=st.session_state.mock_api_call,
        help="Use fake data instead of calling Gemini API."
    )
    st.divider()

    st.header("♊ Select Model")
    if st.session_state.mock_api_call:
        st.info("Mock API Mode ACTIVE")
        st.session_state.selected_model_name = "mock_model"
    else:
        st.session_state.available_models_dict = get_available_models()
        model_display_names = list(st.session_state.available_models_dict.keys())
        if model_display_names:
            # Pre-select the entry whose internal name matches the stored
            # selection; keys and values share dict order, so the position
            # among the values is the position among the display names.
            internal_names = list(st.session_state.available_models_dict.values())
            stored_name = st.session_state.selected_model_name
            if stored_name and stored_name in internal_names:
                selected_index = internal_names.index(stored_name)
            else:
                selected_index = 0
            selected_display_name = st.selectbox(
                "Choose Gemini model:",
                options=model_display_names,
                index=selected_index,
                key="model_selector",
                help="Select the Gemini model to use for analysis."
            )
            st.session_state.selected_model_name = st.session_state.available_models_dict.get(selected_display_name)
            st.info(f"Using REAL Gemini API ({st.session_state.selected_model_name})")
        else:
            if 'GEMINI_API_KEY' in st.secrets:
                st.warning("No compatible models found or error listing models. Check API Key permissions.")
            else:
                st.warning("Add GEMINI_API_KEY to secrets to list models.")
            st.session_state.selected_model_name = None
    st.divider()

    st.header("🔎 Select Analyses")
    # One checkbox per analysis, all ticked by default; keep the ticked keys.
    selected_analyses = []
    for analysis_key, analysis_label in AVAILABLE_ANALYSES.items():
        if st.checkbox(analysis_label, value=True, key=f"cb_{analysis_key}"):
            selected_analyses.append(analysis_key)
    st.divider()

    st.header("📄 How To Use")
    st.info("\n".join([
        "1. Set API Key.",
        "2. Toggle Mock Mode if needed.",
        "3. Select Model (if not Mock).",
        "4. Select analyses.",
        "5. Upload ZIP.",
        "6. Click 'Analyze'.",
        "7. Review report.",
    ]))
    st.info(f"Note: Limited by token estimates (~{MAX_PROMPT_TOKENS_ESTIMATE:,} est. tokens).")
    st.divider()
    st.warning("⚠️ **Privacy:** Code sent to Google API if Mock Mode is OFF.")

# Subtitle under the title reflects the active backend (mock data, a selected
# model, or neither).
if st.session_state.mock_api_call:
    upload_hint = "Upload codebase (`.zip`) for analysis (Using **Mock Data**)."
elif st.session_state.selected_model_name:
    upload_hint = f"Upload codebase (`.zip`) for analysis via **{st.session_state.selected_model_name}**."
else:
    upload_hint = "Upload codebase (`.zip`) for analysis."
st.markdown(upload_hint)

# --- Main Content Area ---
def _reset_analysis_state():
    """Clears the stored analysis outcome; used as the uploader's on_change callback."""
    st.session_state.update(
        analysis_results=None,
        error_message=None,
        analysis_requested=False
    )


uploaded_file = st.file_uploader(
    "📁 Upload Codebase ZIP File",
    type=['zip'],
    key="file_uploader",
    on_change=_reset_analysis_state
)
# Slots reserved now so the button and the report keep their position on the page.
analysis_button_placeholder = st.empty()
results_placeholder = st.container()

if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    uploaded_file_bytes = uploaded_file.getvalue()
    file_id = f"{uploaded_file.name}-{uploaded_file.size}"
    code_files, total_chars, file_count, ignored_files = process_zip_file_cached(
        file_id, uploaded_file.size, uploaded_file_bytes
    )
    if code_files is not None:
        token_estimate = estimate_token_count(total_chars)
        st.info(f"Found **{file_count}** code files ({total_chars:,} chars). Est. tokens: ~{token_estimate:,}")

        # --- Interactive Metrics Visualization ---
        df_metrics = pd.DataFrame({
            "Metric": ["Files Analyzed", "Total Characters", "Token Estimate", "Ignored Files"],
            "Value": [file_count, total_chars, token_estimate, len(ignored_files)]
        })
        st.plotly_chart(px.bar(df_metrics, x="Metric", y="Value", title="Upload Summary Metrics"))
        # --- End Metrics Visualization ---

        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')

        # The button is usable only with a model (or mock mode), at least one
        # selected analysis and at least one extracted code file.
        model_ready = bool(st.session_state.selected_model_name) or st.session_state.mock_api_call
        nothing_to_analyze = not selected_analyses or file_count == 0
        analyze_button_disabled = nothing_to_analyze or not model_ready
        if not model_ready:
            analyze_button_label = "Select Model First"
        elif analyze_button_disabled:
            analyze_button_label = "Select Analyses or Upload Valid Code"
        else:
            analyze_button_label = "Analyze Codebase"

        clicked = analysis_button_placeholder.button(
            analyze_button_label,
            type="primary",
            disabled=analyze_button_disabled
        )
        if clicked:
            st.session_state.update(
                analysis_requested=True,
                analysis_results=None,
                error_message=None
            )
            if not selected_analyses:
                st.warning("Please select analysis types.")
            elif file_count == 0:
                st.warning("No relevant code files found.")
            elif not model_ready:
                st.warning("Please select a Gemini model from the sidebar.")
            else:
                with results_placeholder:
                    if st.session_state.mock_api_call:
                        spinner_model_name = "Mock Mode"
                    else:
                        spinner_model_name = st.session_state.selected_model_name
                    with st.spinner(f"🚀 Preparing prompt & contacting AI ({spinner_model_name})... Please wait."):
                        analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                        if not included_files_in_prompt:
                            st.session_state.error_message = "Could not proceed: No files included."
                        elif not analysis_prompt:
                            st.session_state.error_message = "Failed to generate analysis prompt."
                        else:
                            results_json, error_msg = call_gemini_api(analysis_prompt)
                            st.session_state.analysis_results = results_json
                            st.session_state.error_message = error_msg
            # Rerun so the outcome stored in session state is rendered by the
            # results section of the script.
            st.rerun()

# Render the stored outcome of the last requested analysis, if any.
if st.session_state.analysis_requested:
    with results_placeholder:
        st.divider()
        stored_error = st.session_state.error_message
        stored_results = st.session_state.analysis_results
        if stored_error:
            st.error(f"Analysis Failed: {stored_error}")
            # A failed parse may still carry the unparsed reply; show it.
            if isinstance(stored_results, dict) and "raw_response" in stored_results:
                st.subheader("Raw AI Response")
                st.code(stored_results["raw_response"], language='text')
        elif stored_results:
            display_results(stored_results, selected_analyses)
        else:
            st.info("Analysis initiated, but no results/errors stored.")
elif not uploaded_file:
    results_placeholder.info("Upload a ZIP file to begin.")

# Footer, always shown.
results_placeholder.divider()
results_placeholder.markdown("_Assistant powered by Google Gemini._")