# Source: Hugging Face file view of app.py (author: mgbam; commit 5477235 "Update app.py", verified; 22 kB).
import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
import os
from pathlib import Path
import time # Added for simulating mock delay
# --- Configuration ---
# Identifier of the Gemini model that analysis requests are sent to.
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"

# Ceiling, in estimated tokens, for the prompt assembled from the upload.
MAX_PROMPT_TOKENS_ESTIMATE = 800_000  # Adjust as needed

# Analysis key -> human-readable label shown next to its checkbox.
AVAILABLE_ANALYSES = dict(
    generate_docs="Generate Missing Docstrings/Comments",
    find_bugs="Identify Potential Bugs & Anti-patterns",
    check_style="Check Style Guide Compliance (General)",
    summarize_modules="Summarize Complex Modules/Files",
    suggest_refactoring="Suggest Refactoring Opportunities",
)

# File suffixes (lower-case, with leading dot) treated as source code.
CODE_EXTENSIONS = {
    '.py',
    '.js',
    '.java',
    '.c',
    '.cpp',
    '.h',
    '.cs',
    '.go',
    '.rb',
    '.php',
    '.swift',
    '.kt',
    '.ts',
    '.html',
    '.css',
    '.scss',
    '.sql',
}
# --- Session State Initialization ---
# Seed the mock-mode flag only when the session does not have one yet, so a
# value already chosen by the user survives Streamlit reruns.
if 'mock_api_call' not in st.session_state:
    # False -> default to using the real API.
    st.session_state.mock_api_call = False

# --- Gemini API Setup ---
# Module-level handle to the Gemini model; None until it has been created.
model = None
def initialize_gemini_model():
    """Make sure the module-level Gemini ``model`` is usable.

    Returns:
        bool: True when mock mode is on (no model is required), when the
        model already exists, or when this call created it. False is only
        returned if execution continues past ``st.stop()`` after an
        initialization error.
    """
    global model

    # Mock mode never needs the SDK, whatever the state of ``model``.
    if st.session_state.mock_api_call:
        print("Running in Mock Mode. Skipping Gemini initialization.")
        return True  # Allow proceeding in mock mode

    # Real mode with a model created by an earlier call: nothing to do.
    if model is not None:
        print("Gemini Model already initialized.")
        return True

    # Real mode, first use: configure the SDK from the Streamlit secrets.
    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
            st.stop()
        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        print("Gemini Model Initialized.")
        return True
    except Exception as e:
        st.error(f"🚨 Error initializing Gemini SDK: {e}")
        st.stop()
        return False
# --- Helper Functions ---
def estimate_token_count(text):
    """Roughly estimate a token count, assuming about 3 characters per token.

    Args:
        text: Either the text itself (any sized object such as ``str``) or a
            precomputed character count (``int``).

    Returns:
        int: The character count floor-divided by 3.
    """
    # Accept a precomputed character count so callers that only track lengths
    # do not hit ``len(int)`` and need not rebuild the full text.
    char_count = text if isinstance(text, int) else len(text)
    return char_count // 3
def process_zip_file(uploaded_file):
    """Extract the source-code files contained in an uploaded ZIP archive.

    Directories, entries with a path component starting with ``.`` and
    entries whose name contains ``__`` are skipped silently. Remaining
    entries are kept only if their suffix is in ``CODE_EXTENSIONS``.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            bytes of the ZIP archive.

    Returns:
        tuple: ``(code_files, total_chars, file_count, ignored_files)`` where
            code_files (dict): Mapping of archive path to decoded content.
            total_chars (int): Total number of characters in included files.
            file_count (int): Count of processed code files.
            ignored_files (list): Descriptions of entries skipped for their
                extension or because they could not be read.
        On an invalid archive or any other processing error the error is
        shown via ``st.error`` and ``(None, 0, 0, [])`` is returned.
    """
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    try:
        with zipfile.ZipFile(io.BytesIO(uploaded_file.getvalue()), 'r') as zip_ref:
            for member in zip_ref.infolist():
                file_path = Path(member.filename)

                # Skip directories, hidden files, and files with '__' in the name
                if (
                    member.is_dir()
                    or any(part.startswith('.') for part in file_path.parts)
                    or '__' in member.filename
                ):
                    continue

                if file_path.suffix.lower() not in CODE_EXTENSIONS:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
                    continue

                # Read the bytes exactly once: a second read() on the same
                # stream returns b'' and would make the fallback decode empty.
                try:
                    raw_bytes = zip_ref.read(member)
                except Exception as read_err:
                    # Best effort: one unreadable entry must not abort the rest.
                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                    continue

                try:
                    content = raw_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # latin-1 maps every byte to a code point, so this cannot fail.
                    content = raw_bytes.decode('latin-1')

                code_files[member.filename] = content
                total_chars += len(content)
                file_count += 1
    except zipfile.BadZipFile:
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []
    return code_files, total_chars, file_count, ignored_files
def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Assemble the Gemini prompt: intro, delimited file contents, JSON spec.

    Files are added in dictionary order until the running token estimate
    would exceed ``MAX_PROMPT_TOKENS_ESTIMATE``; the first file that does not
    fit triggers a warning and ends the scan.

    Args:
        code_files_dict: Mapping of file path to file content.
        requested_analyses: Container of analysis keys; each key present adds
            its section to the requested JSON structure.

    Returns:
        full_prompt (str): The complete prompt, or None if no file fits.
        included_files (list): File names included in the prompt.
    """
    intro = "Analyze the following codebase provided as a collection of file paths and their content.\n\n"
    running_tokens = estimate_token_count(intro)
    included_files = []
    segments = []

    for filename, content in code_files_dict.items():
        segment = (
            f"--- START FILE: {filename} ---\n"
            f"{content}\n"
            f"--- END FILE: {filename} ---\n\n"
        )
        segment_tokens = estimate_token_count(segment)
        if running_tokens + segment_tokens > MAX_PROMPT_TOKENS_ESTIMATE:
            st.warning(f"⚠️ Codebase may exceed context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE} tokens). Analysis performed only on the first {len(included_files)} files ({running_tokens} tokens).")
            break
        segments.append(segment)
        running_tokens += segment_tokens
        included_files.append(filename)

    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []

    # Build the expected JSON structure dynamically based on the selected
    # analyses. Sections always appear in this fixed order.
    schema_by_analysis = (
        ("generate_docs", ' "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]'),
        ("find_bugs", ' "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]'),
        ("check_style", ' "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]'),
        ("summarize_modules", ' "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]'),
        ("suggest_refactoring", ' "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]'),
    )
    structure_parts = [
        schema for analysis_key, schema in schema_by_analysis
        if analysis_key in requested_analyses
    ]
    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"

    file_list = ', '.join(included_files)
    prompt_footer = (
        "\n"
        "**Analysis Task:**\n"
        f"Perform the analyses corresponding to the keys present in the JSON structure below, based *only* on the provided code files ({file_list}).\n"
        "**Output Format:**\n"
        "Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.\n"
        f"{json_structure_description}\n"
        "**JSON Output Only:**\n"
    )

    full_prompt = intro + "".join(segments) + prompt_footer
    return full_prompt, included_files
def call_gemini_api(prompt):
    """Send the prompt to Gemini (or simulate it in mock mode) and parse JSON.

    Args:
        prompt: The full prompt text; a falsy value short-circuits.

    Returns:
        insights (dict): The parsed JSON response, ``{"raw_response": text}``
            when the reply contains no ``{...}`` object, or None on failure.
        error_message (str): Description of what went wrong, or None.
    """
    if not prompt:
        return None, "Prompt generation failed."

    # --- MOCK MODE LOGIC ---
    if st.session_state.mock_api_call:
        st.info(" MOCK MODE: Simulating API call...")
        time.sleep(2)  # Simulate network/processing delay
        # Simulated successful response
        mock_insights = {
            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n \"\"\"Processes the input data using mock logic.\"\"\""}],
            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
            "style_issues": [{"file": "mock/core.py", "line": 5, "description": "Variable 'varName' does not follow snake_case convention."}],
            "module_summaries": [
                {"file": "mock/core.py", "summary": "This file contains the core mock processing logic."},
                {"file": "mock/utils.py", "summary": "Utility functions for mocking."},
            ],
            "refactoring_suggestions": [{"file": "mock/utils.py", "line": 30, "area": "calculate_metrics function", "suggestion": "Function is too long (> 50 lines), consider breaking it down."}],
        }
        st.success("Mock response generated successfully.")
        return mock_insights, None

    # --- REAL API CALL LOGIC ---
    if not initialize_gemini_model():
        return None, "Gemini Model Initialization Failed."
    if model is None:
        return None, "Gemini model not available."

    try:
        st.write(f"📡 Sending request to {GEMINI_MODEL_NAME}...")
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            ],
        )
        st.write("✅ Response received from AI.")
    except Exception as e:
        st.error(f"🚨 An error occurred during API call: {e}")
        return None, _describe_api_error(e)

    try:
        raw_text = response.text
        json_text = _extract_json_object(raw_text)
    except (AttributeError, ValueError):
        # NOTE(review): the SDK's ``.text`` accessor is expected to raise
        # ValueError when no candidate text was returned (e.g. a blocked
        # prompt), so both exception types lead to the block-reason lookup.
        st.error("🚨 Unexpected API response structure.")
        st.code(f"Response object: {response}", language='text')
        feedback = getattr(response, "prompt_feedback", None)
        block_reason = getattr(feedback, "block_reason", None)
        if block_reason:
            return None, f"Content blocked by API. Reason: {block_reason}"
        return None, "Unexpected response structure from API."
    except Exception as e:
        st.error(f"🚨 Unexpected issue processing AI response: {e}")
        try:
            st.code(f"Response object: {response}", language='text')
        except Exception:
            # Deliberate best effort: failing to render the debug dump must
            # not mask the error being reported.
            pass
        return None, f"Unexpected response structure: {e}"

    if json_text is None:
        st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in response. Displaying raw text.")
        return {"raw_response": raw_text}, "AI response did not contain clear JSON object, showing raw text."

    try:
        return json.loads(json_text), None
    except json.JSONDecodeError as json_err:
        st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
        st.error("Raw AI Response:")
        st.code(raw_text, language='text')
        return None, f"AI response was not valid JSON: {json_err}"


def _extract_json_object(text):
    """Return the span from the first '{' to the last '}' of *text*, or None."""
    # Slicing between the outermost braces also discards any surrounding
    # markdown code fences, so no separate fence stripping is needed.
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end < start:
        return None
    return text[start:end + 1]


def _describe_api_error(exc):
    """Translate an exception from the API call into a user-facing message."""
    # Not every exception exposes ``.message``; include the string form so
    # the classification below still applies.
    details = f"{getattr(exc, 'message', '')} {exc}"
    if "429" in details:
        return "API Quota Exceeded or Rate Limit hit. Check your Google Cloud/AI Studio dashboard."
    if "API key not valid" in details:
        return "Invalid Gemini API Key. Please check `.streamlit/secrets.toml`."
    if "blocked" in details.lower() or "block_reason: SAFETY" in details:
        return "Content blocked due to safety settings. Review input code or adjust safety settings if appropriate."
    return f"API call failed: {exc}"
def display_results(results_json, requested_analyses):
    """Render the analysis results in the Streamlit interface.

    Args:
        results_json: Parsed model output. Anything that is not a dict is
            reported as invalid; a dict holding ``"raw_response"`` is shown
            verbatim.
        requested_analyses: Iterable of analysis keys; one section is rendered
            per key that has a display configuration.
    """
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return

    def display_list_items(items, fields):
        """Render one category: a bullet per item plus its main text."""
        if items:
            for item in items:
                if not isinstance(item, dict):
                    # Model output is untrusted; show malformed entries
                    # verbatim instead of failing on ``item.get``.
                    st.markdown(f"- {item}")
                    continue
                details = []
                for field_key, field_label in fields.items():
                    value = item.get(field_key, 'N/A')
                    if value != 'N/A':
                        details.append(f"**{field_label}:** {value}")
                st.markdown("- " + " - ".join(details))
                # Display multi-line outputs when applicable
                if 'suggestion' in item:
                    st.code(item['suggestion'], language='text')
                elif 'description' in item:
                    st.markdown(f" > {item['description']}")
                elif 'summary' in item:
                    st.markdown(f" > {item['summary']}")
        else:
            st.markdown("_No items found for this category._")
        st.divider()

    # Analysis key -> JSON key holding its items, section title, and the
    # item fields shown inline on the bullet line.
    display_config = {
        "generate_docs": {
            "key": "documentation_suggestions",
            "title": AVAILABLE_ANALYSES["generate_docs"],
            "fields": {"file": "File", "line": "Line"},
        },
        "find_bugs": {
            "key": "potential_bugs",
            "title": AVAILABLE_ANALYSES["find_bugs"],
            "fields": {"file": "File", "line": "Line", "severity": "Severity"},
        },
        "check_style": {
            "key": "style_issues",
            "title": AVAILABLE_ANALYSES["check_style"],
            "fields": {"file": "File", "line": "Line"},
        },
        "summarize_modules": {
            "key": "module_summaries",
            "title": AVAILABLE_ANALYSES["summarize_modules"],
            "fields": {"file": "File"},
        },
        "suggest_refactoring": {
            "key": "refactoring_suggestions",
            "title": AVAILABLE_ANALYSES["suggest_refactoring"],
            "fields": {"file": "File", "line": "Line", "area": "Area"},
        },
    }

    any_results = False
    for analysis_key in requested_analyses:
        if analysis_key not in display_config:
            continue
        config = display_config[analysis_key]
        st.subheader(config["title"])
        # Normalise the section: a missing/null value becomes an empty list
        # and a lone non-list value is treated as a single item.
        items = results_json.get(config["key"]) or []
        if not isinstance(items, list):
            items = [items]
        display_list_items(items, config["fields"])
        if items:
            any_results = True

    if not any_results:
        st.info("No specific findings were identified in the analysis based on your selections.")

    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json",
    )
# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit & Documentation Assistant")
st.markdown(f"Upload your codebase (`.zip`) for analysis using **{GEMINI_MODEL_NAME}**.")

# Sidebar controls
with st.sidebar:
    st.header("⚙️ Analysis Controls")
    # The toggle both reflects and updates the session-wide mock-mode flag.
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode (for Testing)",
        value=st.session_state.mock_api_call,
        help="If enabled, uses fake data instead of calling the real Gemini API. Saves cost during testing."
    )
    if st.session_state.mock_api_call:
        st.info("Mock API Mode ACTIVE")
    else:
        st.info("Using REAL Gemini API")
    st.divider()

    st.header("🔎 Select Analyses")
    # One checkbox per analysis, all ticked by default; collect the ticked keys.
    selected_analyses = []
    for key, name in AVAILABLE_ANALYSES.items():
        if st.checkbox(name, value=True, key=f"cb_{key}"):
            selected_analyses.append(key)
    st.divider()

    st.header("📄 How To Use")
    st.info(
        "1. Set API Key in `.streamlit/secrets.toml` (if not using Mock Mode).\n"
        "2. Toggle Mock Mode if needed.\n"
        "3. Select desired analyses.\n"
        "4. Create a **ZIP archive** of your codebase.\n"
        "5. Upload the `.zip` file.\n"
        "6. Click 'Analyze Codebase'.\n"
        "7. Review the report."
    )
    # CODE_EXTENSIONS is a set, so sort it to keep the displayed list stable
    # from one run to the next.
    st.info(f"**Note:** Only files with common code extensions ({', '.join(sorted(CODE_EXTENSIONS))}) are processed. Analysis might be limited (~{MAX_PROMPT_TOKENS_ESTIMATE:,} est. tokens).")
    st.divider()
    st.warning("⚠️ **Privacy:** Code content is sent to the Google Gemini API if Mock Mode is OFF. Do not upload sensitive code if uncomfortable.")
# Main content area
uploaded_file = st.file_uploader("📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader")
analysis_triggered = False
results_cache = None  # (results_json, error_message) from this run, if any

if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    with st.spinner("Inspecting ZIP file..."):
        code_files, total_chars, file_count, ignored_files = process_zip_file(uploaded_file)

    # code_files is None when ZIP processing failed (error already displayed).
    if code_files is not None:
        # Estimate from the file contents themselves: the estimator measures
        # text, so handing it the integer character total would fail.
        estimated_tokens = sum(estimate_token_count(content) for content in code_files.values())
        st.info(f"Found **{file_count}** relevant code files ({total_chars:,} characters). Est. tokens: ~{estimated_tokens:,}")
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')

        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
        if st.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
            analysis_triggered = True
            if not selected_analyses:
                st.warning("Please select at least one analysis type from the sidebar.")
            elif file_count == 0:
                st.warning("No relevant code files found in the ZIP archive to analyze.")
            else:
                st.divider()
                with st.spinner(f"🚀 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... This may take time."):
                    analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                    if analysis_prompt and included_files_in_prompt:
                        st.write(f"Analyzing {len(included_files_in_prompt)} files...")
                        results_json, error_message = call_gemini_api(analysis_prompt)
                        results_cache = (results_json, error_message)
                    elif not included_files_in_prompt:
                        results_cache = (None, "Could not proceed: No files included in prompt (check token limits/errors).")
                    else:
                        results_cache = (None, "Failed to generate analysis prompt.")

if analysis_triggered and results_cache:
    results_json, error_message = results_cache
    st.divider()
    if error_message:
        st.error(f"Analysis Failed: {error_message}")
        # A failed parse may still carry the raw model text; show it.
        if results_json and isinstance(results_json, dict) and "raw_response" in results_json:
            st.subheader("Raw AI Response")
            st.code(results_json["raw_response"], language='text')
    elif results_json:
        display_results(results_json, selected_analyses)
    else:
        st.error("Analysis did not return results or an unknown error occurred.")
elif not uploaded_file:
    st.info("Upload a ZIP file containing your source code to begin.")

st.divider()
st.markdown("_Assistant powered by Google Gemini._")