# Hugging Face file-viewer header (page residue, not part of the program):
# mgbam's picture | Update app.py | 3a80282 verified | raw | history blame | 23.7 kB
import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
import os
from pathlib import Path
import time # Added for simulating mock delay
# --- Configuration ---
# Name of the Gemini model handed to the SDK and shown in the UI text.
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"

# Ceiling for the estimated prompt size, in tokens. Adjust as needed.
MAX_PROMPT_TOKENS_ESTIMATE = 800_000

# Analysis key -> label displayed next to its checkbox and in the report.
AVAILABLE_ANALYSES = {
    "generate_docs": "Generate Missing Docstrings/Comments",
    "find_bugs": "Identify Potential Bugs & Anti-patterns",
    "check_style": "Check Style Guide Compliance (General)",
    "summarize_modules": "Summarize Complex Modules/Files",
    "suggest_refactoring": "Suggest Refactoring Opportunities",
}

# Lower-case file suffixes (including the dot) that count as source code.
CODE_EXTENSIONS = {
    '.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb',
    '.php', '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql',
}
# --- Session State Initialization ---
# Seed the mock-mode flag once per session; False means the real API is used.
if 'mock_api_call' not in st.session_state:
    st.session_state['mock_api_call'] = False

# --- Gemini API Setup ---
# The model handle starts empty and is filled in by initialize_gemini_model(),
# so nothing is configured up front when mock mode might be used first.
model = None
def initialize_gemini_model():
    """Make sure the module-level ``model`` is usable, creating it if required.

    Returns:
        True when mock mode is on (no model is created), when the model was
        already created, or when it has just been created successfully.
        False is only returned if ``st.stop()`` returns control after an
        initialization error.
    """
    global model

    # Mock mode never needs a real model.
    if st.session_state.mock_api_call:
        print("Running in Mock Mode. Skipping Gemini initialization.")
        return True  # Allow proceeding in mock mode

    # Reuse the handle created earlier in this script run.
    if model is not None:
        print("Gemini Model already initialized.")
        return True

    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
            st.stop()
        api_key = st.secrets["GEMINI_API_KEY"]
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        print("Gemini Model Initialized.")
        return True
    except Exception as init_error:
        st.error(f"🚨 Error initializing Gemini SDK: {init_error}")
        st.stop()
        return False
# --- Helper Functions ---
def estimate_token_count(text):
    """Roughly estimate a token count, assuming about 3 characters per token.

    Args:
        text: The text to measure, or an already-computed character total
            given as an ``int``.

    Returns:
        The estimated number of tokens (character count floor-divided by 3).
    """
    # Callers sometimes only hold a character total; len() on an int would
    # raise TypeError, so accept the number directly.
    char_count = text if isinstance(text, int) else len(text)
    return char_count // 3
def process_zip_file(uploaded_file):
    """Extract code files and their text content from an uploaded ZIP archive.

    Directory entries, entries with any path component starting with ``.``,
    and entries whose name contains ``__`` are skipped without being reported.
    Of the remaining entries, those whose lower-cased suffix is in
    ``CODE_EXTENSIONS`` are decoded (UTF-8 first, latin-1 as fallback); the
    others are listed as ignored.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the archive bytes.

    Returns:
        A tuple ``(code_files, total_chars, file_count, ignored_files)`` where
        ``code_files`` maps archive member names to decoded text and
        ``ignored_files`` holds human-readable skip reasons. If the archive
        cannot be processed, an error is shown via ``st.error`` and
        ``(None, 0, 0, [])`` is returned.
    """
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    try:
        with zipfile.ZipFile(io.BytesIO(uploaded_file.getvalue()), 'r') as zip_ref:
            for member in zip_ref.infolist():
                file_path = Path(member.filename)
                is_hidden = any(part.startswith('.') for part in file_path.parts)
                if member.is_dir() or is_hidden or '__' in member.filename:
                    continue

                if file_path.suffix.lower() not in CODE_EXTENSIONS:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
                    continue

                try:
                    # Read the bytes exactly once: a second read() on the same
                    # stream returns b"", which used to turn every non-UTF-8
                    # file into an empty string in the latin-1 fallback.
                    raw_bytes = zip_ref.read(member)
                except Exception as read_err:
                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                    continue

                try:
                    content = raw_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this cannot fail.
                    content = raw_bytes.decode('latin-1')

                code_files[member.filename] = content
                total_chars += len(content)
                file_count += 1
    except zipfile.BadZipFile:
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []
    return code_files, total_chars, file_count, ignored_files
def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Assemble the Gemini prompt: code listing plus the requested JSON layout.

    Files are appended in dictionary order until adding the next one would
    push the running token estimate past ``MAX_PROMPT_TOKENS_ESTIMATE``; at
    that point a warning is shown and the remaining files are left out.

    Args:
        code_files_dict: Mapping of file name to file content.
        requested_analyses: Collection of analysis keys selecting which
            sections appear in the requested JSON structure.

    Returns:
        ``(full_prompt, included_files)``, or ``(None, [])`` when not even
        one file fits within the estimate.
    """
    intro = "Analyze the following codebase provided as a collection of file paths and their content.\n\n"
    tokens_so_far = estimate_token_count(intro)
    included_files = []
    segments = []

    for filename, content in code_files_dict.items():
        segment = (
            f"--- START FILE: {filename} ---\n"
            f"{content}\n"
            f"--- END FILE: {filename} ---\n\n"
        )
        segment_tokens = estimate_token_count(segment)
        if tokens_so_far + segment_tokens > MAX_PROMPT_TOKENS_ESTIMATE:
            st.warning(f"⚠️ Codebase may exceed context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE} tokens). Analysis performed only on the first {len(included_files)} files ({tokens_so_far} tokens).")
            break
        segments.append(segment)
        tokens_so_far += segment_tokens
        included_files.append(filename)

    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []

    # One schema line per analysis, emitted in this fixed order when selected.
    schema_by_analysis = (
        ("generate_docs", ' "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]'),
        ("find_bugs", ' "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]'),
        ("check_style", ' "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]'),
        ("summarize_modules", ' "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]'),
        ("suggest_refactoring", ' "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]'),
    )
    structure_parts = [
        schema_line
        for analysis_key, schema_line in schema_by_analysis
        if analysis_key in requested_analyses
    ]
    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"

    prompt_footer = f"""
**Analysis Task:**
Perform the analyses corresponding to the keys present in the JSON structure below, based *only* on the provided code files ({', '.join(included_files)}).
**Output Format:**
Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.
{json_structure_description}
**JSON Output Only:**
"""
    full_prompt = intro + "".join(segments) + prompt_footer
    return full_prompt, included_files
def _extract_json_object(response_text):
    """Return the span from the first ``{`` to the last ``}``, or None if absent.

    Slicing on the braces also discards any surrounding whitespace or Markdown
    code fences, so no separate fence handling is needed.
    """
    start = response_text.find('{')
    end = response_text.rfind('}')
    if start == -1 or end < start:
        return None
    return response_text[start:end + 1]


def _describe_api_error(exc):
    """Translate an exception raised by the API call into a user-facing message."""
    # Look at both the optional ``message`` attribute and the string form, and
    # coerce to text so a non-string ``message`` cannot raise here.
    details = f"{getattr(exc, 'message', '')} {exc}"
    if "429" in details:
        return "API Quota Exceeded or Rate Limit hit. Check your Google Cloud/AI Studio dashboard."
    if "API key not valid" in details:
        return "Invalid Gemini API Key. Please check `.streamlit/secrets.toml`."
    if "blocked" in details.lower() or "block_reason: SAFETY" in details:
        return "Content blocked due to safety settings. Review input code or adjust safety settings if appropriate."
    return f"API call failed: {exc}"


def _parse_api_response(response):
    """Turn a Gemini response object into an ``(insights, error_message)`` pair."""
    try:
        raw_text = response.text
        json_text = _extract_json_object(raw_text.strip())
        if json_text is None:
            st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in response. Displaying raw text.")
            return {"raw_response": raw_text}, "AI response did not contain clear JSON object, showing raw text."
        return json.loads(json_text), None
    except json.JSONDecodeError as json_err:
        # Only json.loads raises this, so raw_text is always bound here.
        st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
        st.error("Raw AI Response:")
        st.code(raw_text, language='text')
        return None, f"AI response was not valid JSON: {json_err}"
    except (AttributeError, ValueError):
        # The response has no usable text (e.g. it was blocked).
        # NOTE(review): the SDK's ``text`` accessor is understood to raise
        # ValueError when no valid part was returned - confirm against the docs.
        st.error("🚨 Unexpected API response structure.")
        st.code(f"Response object: {response}", language='text')
        feedback = getattr(response, "prompt_feedback", None)
        block_reason = getattr(feedback, "block_reason", None)
        if block_reason:
            return None, f"Content blocked by API. Reason: {block_reason}"
        return None, "Unexpected response structure from API."
    except Exception as e:
        st.error(f"🚨 Unexpected issue processing AI response: {e}")
        try:
            st.code(f"Response object: {response}", language='text')
        except Exception:
            # Best-effort debugging aid only; formatting the response failed.
            pass
        return None, f"Unexpected response structure: {e}"


def call_gemini_api(prompt):
    """Call the Gemini API, or return canned data when mock mode is enabled.

    Args:
        prompt: Full prompt text; a falsy value is reported as a failure.

    Returns:
        ``(insights, error_message)``. On success ``insights`` is the parsed
        JSON object and ``error_message`` is None. When no JSON object can be
        located, ``insights`` is ``{"raw_response": <text>}`` together with an
        explanatory message. On any other failure ``insights`` is None.
    """
    if not prompt:
        return None, "Prompt generation failed."

    # --- MOCK MODE LOGIC ---
    if st.session_state.mock_api_call:
        st.info(" MOCK MODE: Simulating API call...")
        time.sleep(2)  # Simulate network/processing delay
        # To exercise other paths while testing, return e.g.
        # (None, "MOCK ERROR: Simulated API Quota Exceeded.") or empty lists.
        mock_insights = {
            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n \"\"\"Processes the input data using mock logic.\"\"\""}],
            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
            "style_issues": [{"file": "mock/core.py", "line": 5, "description": "Variable 'varName' does not follow snake_case convention."}],
            "module_summaries": [{"file": "mock/core.py", "summary": "This file contains the core mock processing logic."}, {"file": "mock/utils.py", "summary": "Utility functions for mocking."}],
            "refactoring_suggestions": [{"file": "mock/utils.py", "line": 30, "area": "calculate_metrics function", "suggestion": "Function is too long (> 50 lines), consider breaking it down."}],
        }
        st.success("Mock response generated successfully.")
        return mock_insights, None

    # --- REAL API CALL LOGIC ---
    if not initialize_gemini_model():  # Ensure model is ready
        return None, "Gemini Model Initialization Failed."
    if model is None:  # Safeguard; should not happen after the check above
        return None, "Gemini model not available."

    try:
        st.write(f"📡 Sending request to {GEMINI_MODEL_NAME}...")
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            ],
        )
        st.write("✅ Response received from AI.")
        return _parse_api_response(response)
    except Exception as e:
        st.error(f"🚨 An error occurred during API call: {e}")
        return None, _describe_api_error(e)
def display_results(results_json, requested_analyses):
    """Render the analysis results in Streamlit.

    Args:
        results_json: Parsed analysis result. Anything that is not a dict is
            shown as-is with an error; a dict holding ``"raw_response"`` is
            shown as raw text only.
        requested_analyses: Analysis keys whose sections should be rendered,
            in display order.
    """
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return

    def display_list_items(items, fields):
        """Render one category: a bullet of labelled fields plus the body text."""
        if items:
            for item in items:
                if not isinstance(item, dict):
                    # The model output is not schema-validated; show stray
                    # values verbatim instead of failing on ``.get``.
                    st.markdown(f"- {item}")
                    continue
                details = []
                for field_key, field_label in fields.items():
                    value = item.get(field_key, 'N/A')
                    if value != 'N/A':  # Only show if value exists
                        details.append(f"**{field_label}:** {value}")
                st.markdown("- " + " - ".join(details))
                # Multi-line bodies are rendered separately from the bullet.
                if 'suggestion' in item:
                    st.code(item['suggestion'], language='text')
                elif 'description' in item:
                    st.markdown(f" > {item['description']}")  # Indent description
                elif 'summary' in item:
                    st.markdown(f" > {item['summary']}")  # Indent summary
        else:
            st.markdown("_No items found for this category._")
        st.divider()

    # Analysis key -> result key, section title and the fields shown inline.
    display_config = {
        "generate_docs": {
            "key": "documentation_suggestions", "title": AVAILABLE_ANALYSES["generate_docs"],
            "fields": {"file": "File", "line": "Line"},  # Suggestion shown by st.code
        },
        "find_bugs": {
            "key": "potential_bugs", "title": AVAILABLE_ANALYSES["find_bugs"],
            "fields": {"file": "File", "line": "Line", "severity": "Severity"},  # Description shown separately
        },
        "check_style": {
            "key": "style_issues", "title": AVAILABLE_ANALYSES["check_style"],
            "fields": {"file": "File", "line": "Line"},  # Description shown separately
        },
        "summarize_modules": {
            "key": "module_summaries", "title": AVAILABLE_ANALYSES["summarize_modules"],
            "fields": {"file": "File"},  # Summary shown separately
        },
        "suggest_refactoring": {
            "key": "refactoring_suggestions", "title": AVAILABLE_ANALYSES["suggest_refactoring"],
            "fields": {"file": "File", "line": "Line", "area": "Area"},  # Suggestion shown separately
        },
    }

    # Iterate and display selected sections
    any_results = False
    for analysis_key in requested_analyses:
        if analysis_key not in display_config:
            continue
        config = display_config[analysis_key]
        st.subheader(config["title"])
        items = results_json.get(config["key"], [])
        if not isinstance(items, list):
            # A lone value where a list was requested: treat it as one item
            # (or as no items when it is empty/None).
            items = [items] if items else []
        display_list_items(items, config["fields"])
        if items:
            any_results = True
    if not any_results:
        st.info("No specific findings were identified in the analysis based on your selections.")

    # Download button
    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json"
    )
# --- Streamlit App Main Interface ---
# Page chrome: wide layout, title and a one-line description naming the model.
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
# The title emoji was stored as mis-decoded UTF-8 bytes; restored here.
st.title("🤖 Codebase Audit & Documentation Assistant")
st.markdown(f"Upload your codebase (`.zip`) for analysis using **{GEMINI_MODEL_NAME}**.")
# Sidebar controls: mock-mode toggle, analysis selection and usage notes.
# Emoji in the labels below were stored as mis-decoded UTF-8; restored here.
with st.sidebar:
    st.header("⚙️ Analysis Controls")
    # Mock Mode Toggle (the result is written back so other code can read it)
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode (for Testing)",
        value=st.session_state.mock_api_call,
        help="If enabled, uses fake data instead of calling the real Gemini API. Saves cost during testing.",
    )
    if st.session_state.mock_api_call:
        st.info("Mock API Mode ACTIVE")
    else:
        st.info("Using REAL Gemini API")
    st.divider()

    st.header("🔎 Select Analyses")
    # One checkbox per analysis, all ticked by default; keep the ticked keys.
    selected_analyses = [
        key
        for key, name in AVAILABLE_ANALYSES.items()
        if st.checkbox(name, value=True, key=f"cb_{key}")
    ]
    st.divider()

    st.header("📄 How To Use")
    st.info(
        "1. Set API Key in `.streamlit/secrets.toml` (if not using Mock Mode).\n"
        "2. Toggle Mock Mode if needed.\n"
        "3. Select desired analyses.\n"
        "4. Create a **ZIP archive** of your codebase.\n"
        "5. Upload the `.zip` file.\n"
        "6. Click 'Analyze Codebase'.\n"
        "7. Review the report."
    )
    # Sort the extensions: iterating a set of strings directly yields an order
    # that can change between interpreter runs.
    st.info(f"**Note:** Only files with common code extensions ({', '.join(sorted(CODE_EXTENSIONS))}) are processed. Analysis might be limited (~{MAX_PROMPT_TOKENS_ESTIMATE:,} est. tokens).")
    st.divider()
    st.warning("⚠️ **Privacy:** Code content is sent to the Google Gemini API if Mock Mode is OFF. Do not upload sensitive code if uncomfortable.")
# Main content area: upload, inspect, analyze, report.
# Emoji in the labels below were stored as mis-decoded UTF-8; restored here.
uploaded_file = st.file_uploader("📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader")
analysis_triggered = False
results_cache = None  # (results_json, error_message) for this script run

if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    with st.spinner("Inspecting ZIP file..."):
        code_files, total_chars, file_count, ignored_files = process_zip_file(uploaded_file)

    # On a ZIP processing error code_files is None and the error was already shown.
    if code_files is not None:
        # Estimate from the file contents themselves; passing the integer
        # character total to the length-based estimator raised TypeError.
        estimated_tokens = sum(estimate_token_count(text) for text in code_files.values())
        st.info(f"Found **{file_count}** relevant code files ({total_chars:,} characters). Est. tokens: ~{estimated_tokens:,}")
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                # Use st.code for better formatting of list
                st.code("\n".join(ignored_files), language='text')

        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
        if st.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
            analysis_triggered = True
            # The first two branches repeat the disabled-button conditions as a safeguard.
            if not selected_analyses:
                st.warning("Please select at least one analysis type from the sidebar.")
            elif file_count == 0:
                st.warning("No relevant code files found in the ZIP archive to analyze.")
            else:
                st.divider()
                with st.spinner(f"🚀 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... This may take time."):
                    # 1. Construct Prompt
                    analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                    if analysis_prompt and included_files_in_prompt:
                        st.write(f"Analyzing {len(included_files_in_prompt)} files...")
                        # 2. Call API (Real or Mock)
                        results_json, error_message = call_gemini_api(analysis_prompt)
                        results_cache = (results_json, error_message)  # Store results
                    elif not included_files_in_prompt:
                        results_cache = (None, "Could not proceed: No files included in prompt (check token limits/errors).")
                    else:
                        results_cache = (None, "Failed to generate analysis prompt.")

# Display results outside the button click block if analysis was triggered
if analysis_triggered and results_cache:
    results_json, error_message = results_cache
    st.divider()
    if error_message:
        st.error(f"Analysis Failed: {error_message}")
        # Display partial results if available (e.g., raw response on JSON error)
        if results_json and isinstance(results_json, dict) and "raw_response" in results_json:
            st.subheader("Raw AI Response")
            st.code(results_json["raw_response"], language='text')
    elif results_json:
        display_results(results_json, selected_analyses)
    else:
        st.error("Analysis did not return results or an unknown error occurred.")
elif not uploaded_file:
    st.info("Upload a ZIP file containing your source code to begin.")

st.divider()
st.markdown("_Assistant powered by Google Gemini._")