import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
import os
from pathlib import Path
import time
# --- Configuration ---
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"
MAX_PROMPT_TOKENS_ESTIMATE = 800000
RESULTS_PAGE_SIZE = 25  # Number of items to show per category initially

AVAILABLE_ANALYSES = {
    "generate_docs": "Generate Missing Docstrings/Comments",
    "find_bugs": "Identify Potential Bugs & Anti-patterns",
    "check_style": "Check Style Guide Compliance (General)",
    "summarize_modules": "Summarize Complex Modules/Files",
    "suggest_refactoring": "Suggest Refactoring Opportunities"
}

CODE_EXTENSIONS = {'.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb', '.php', '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql'}
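
# MAX_PROMPT_TOKENS_ESTIMATE is a self-imposed budget, not an SDK limit: it is
# consumed by construct_analysis_prompt() below, which keeps adding files to
# the prompt until the running character-based estimate would exceed it.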
# --- Session State Initialization ---
if 'mock_api_call' not in st.session_state:
    st.session_state.mock_api_call = False
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None  # Store results here
if 'error_message' not in st.session_state:
    st.session_state.error_message = None
if 'analysis_requested' not in st.session_state:
    st.session_state.analysis_requested = False  # Flag to know when analysis is done
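
# Streamlit reruns this script top-to-bottom on every interaction; keeping
# results, errors, and the "analysis requested" flag in session state is what
# lets them survive the rerun triggered after the Analyze button is clicked.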
# --- Gemini API Setup ---
model = None

def initialize_gemini_model():
    """Initializes the Gemini API model unless running in mock mode."""
    global model
    if model is None and not st.session_state.mock_api_call:
        try:
            if 'GEMINI_API_KEY' not in st.secrets:
                st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
                st.stop()
            genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
            model = genai.GenerativeModel(GEMINI_MODEL_NAME)
            print("Gemini Model Initialized.")
            return True
        except Exception as e:
            st.error(f"🚨 Error initializing Gemini SDK: {e}")
            st.stop()
            return False
    elif st.session_state.mock_api_call:
        # Running in Mock Mode; skip Gemini initialization.
        return True
    elif model is not None:
        # Gemini model already initialized.
        return True
    return False
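
# The module-level `model` acts as a lazy singleton: the client is built once,
# on first real (non-mock) use, and reused on every subsequent rerun.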
# --- Helper Functions ---
def estimate_token_count(text):
    """Roughly estimate token count (assuming ~3 characters per token)."""
    return len(text) // 3
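
# This is a deliberately cheap heuristic. If an exact count were needed, the
# google.generativeai SDK exposes model.count_tokens(...), at the cost of an
# extra API round-trip per estimate.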
# --- OPTIMIZATION: Cache ZIP processing ---
@st.cache_data(max_entries=5)  # Cache results for recent uploads
def process_zip_file_cached(file_id, file_size, file_content_bytes):
    """Extracts code files and their content. Cached function."""
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    status_placeholder = st.empty()  # For progress bar
    progress_bar = status_placeholder.progress(0)
    try:
        with zipfile.ZipFile(io.BytesIO(file_content_bytes), 'r') as zip_ref:
            members = zip_ref.infolist()
            total_members = len(members)
            for i, member in enumerate(members):
                # Update the progress bar periodically (every 10 files)
                if i % 10 == 0:
                    progress_bar.progress(int((i / total_members) * 100))
                # Skip directories, hidden paths, and dunder/cache paths
                if member.is_dir() or any(part.startswith('.') for part in Path(member.filename).parts) or '__' in member.filename:
                    continue
                file_path = Path(member.filename)
                if file_path.suffix.lower() in CODE_EXTENSIONS:
                    try:
                        with zip_ref.open(member) as file:
                            file_bytes = file.read()
                        try:
                            content = file_bytes.decode('utf-8')
                        except UnicodeDecodeError:
                            try:
                                content = file_bytes.decode('latin-1')
                            except Exception as decode_err:
                                ignored_files.append(f"{member.filename} (Decode Error: {decode_err})")
                                continue
                        code_files[member.filename] = content
                        total_chars += len(content)
                        file_count += 1
                    except Exception as read_err:
                        ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                else:
                    # Hidden/dunder paths were already skipped above, so this
                    # member was excluded purely for its extension.
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
        progress_bar.progress(100)  # Ensure it completes
        status_placeholder.empty()  # Remove progress bar after completion
    except zipfile.BadZipFile:
        status_placeholder.empty()
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        status_placeholder.empty()
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []
    if file_count == 0 and not ignored_files:
        st.warning("No files with recognized code extensions found in the ZIP.")
    elif file_count == 0 and ignored_files:
        st.warning("No files with recognized code extensions found. Some files were skipped.")
    print(f"Cache miss or new file: Processed ZIP {file_id}")  # Debug print
    return code_files, total_chars, file_count, ignored_files
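
# st.cache_data hashes every argument (including the raw ZIP bytes) to form
# the cache key, so `file_id` and `file_size` mostly serve as cheap, readable
# labels for logging; re-uploading an identical archive skips extraction.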
def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Constructs the prompt for Gemini, including code content and JSON structure request."""
    prompt_parts = ["Analyze the following codebase provided as a collection of file paths and their content.\n\n"]
    current_token_estimate = estimate_token_count(prompt_parts[0])
    included_files = []
    # Use join for potentially faster concatenation
    code_segments = []
    # Provide feedback for large codebases
    prompt_status = st.empty()
    if len(code_files_dict) > 50:
        prompt_status.write("Constructing prompt (processing files)...")
    for filename, content in code_files_dict.items():
        file_marker = f"--- START FILE: {filename} ---\n"
        file_content = f"{content}\n"
        file_end_marker = f"--- END FILE: {filename} ---\n\n"
        segment = file_marker + file_content + file_end_marker
        segment_token_estimate = estimate_token_count(segment)
        if current_token_estimate + segment_token_estimate <= MAX_PROMPT_TOKENS_ESTIMATE:
            code_segments.append(segment)
            current_token_estimate += segment_token_estimate
            included_files.append(filename)
        else:
            st.warning(f"⚠️ Codebase may exceed context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE:,} tokens). Analysis performed only on the first {len(included_files)} files (~{current_token_estimate:,} tokens).")
            break
    prompt_status.empty()  # Clear status message
    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []
    concatenated_code = "".join(code_segments)
    prompt_parts.append(concatenated_code)
    # Generate the expected JSON structure description based on selected analyses
    json_structure_description = "{\n"
    structure_parts = []
    if "generate_docs" in requested_analyses:
        structure_parts.append('  "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]')
    if "find_bugs" in requested_analyses:
        structure_parts.append('  "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]')
    if "check_style" in requested_analyses:
        structure_parts.append('  "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]')
    if "summarize_modules" in requested_analyses:
        structure_parts.append('  "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]')
    if "suggest_refactoring" in requested_analyses:
        structure_parts.append('  "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]')
    json_structure_description += ",\n".join(structure_parts)
    json_structure_description += "\n}"
    prompt_footer = f"""
**Analysis Task:**
Perform the analyses corresponding to the keys present in the JSON structure below, based *only* on the provided code files ({', '.join(included_files)}).

**Output Format:**
Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.

{json_structure_description}

**JSON Output Only:**
"""
    prompt_parts.append(prompt_footer)
    full_prompt = "".join(prompt_parts)
    return full_prompt, included_files
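
# The assembled prompt has three parts: an instruction header, the code wrapped
# in START/END FILE markers (so findings can be attributed to paths), and a
# footer pinning the exact JSON schema the app will try to parse back.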
def call_gemini_api(prompt):
    """Calls the Gemini API or returns mock data based on session state."""
    if not prompt:
        return None, "Prompt generation failed."
    # MOCK MODE LOGIC
    if st.session_state.mock_api_call:
        st.info("MOCK MODE: Simulating API call...")
        st.write("...")  # Minimal feedback in mock mode
        time.sleep(1)  # Shorter mock delay
        mock_json_response = json.dumps({
            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n    \"\"\"Processes the input data using mock logic.\"\"\""}],
            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
            "style_issues": [],
            "module_summaries": [],
            "refactoring_suggestions": []
        })
        st.success("Mock response generated.")
        return json.loads(mock_json_response), None
    # REAL API CALL LOGIC
    else:
        if not initialize_gemini_model():
            return None, "Gemini Model Initialization Failed."
        if model is None:
            return None, "Gemini model not available."
        try:
            api_status = st.empty()
            token_estimate = estimate_token_count(prompt)
            api_status.info(f"📡 Sending request to {GEMINI_MODEL_NAME} (estimated prompt tokens: {token_estimate:,})... This can take several minutes depending on code size and model load.")
            start_time = time.time()
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(temperature=0.2),
                safety_settings=[
                    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                ]
            )
            end_time = time.time()
            api_status.success(f"✅ Response received from AI in {end_time - start_time:.2f} seconds.")
            time.sleep(1)
            api_status.empty()
            try:
                json_response_text = response.text.strip()
                # Strip any markdown code fences around the JSON
                if json_response_text.startswith("```json"):
                    json_response_text = json_response_text[7:]
                if json_response_text.startswith("```"):
                    json_response_text = json_response_text[3:]
                if json_response_text.endswith("```"):
                    json_response_text = json_response_text[:-3]
                # rfind returns -1 when '}' is absent, so check before slicing
                json_start = json_response_text.find('{')
                json_end = json_response_text.rfind('}')
                if json_start != -1 and json_end != -1 and json_end > json_start:
                    final_json_text = json_response_text[json_start:json_end + 1]
                    insights = json.loads(final_json_text)
                    return insights, None
                else:
                    st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in the response.")
                    return {"raw_response": response.text}, "AI response did not contain a clear JSON object; showing raw text."
            except json.JSONDecodeError as json_err:
                st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
                st.code(response.text, language='text')
                return None, f"AI response was not valid JSON: {json_err}"
            except AttributeError:
                st.error("🚨 Unexpected API response structure (AttributeError).")
                st.code(f"Response object: {response}", language='text')
                try:
                    block_reason = response.prompt_feedback.block_reason
                    if block_reason:
                        return None, f"Content blocked by API. Reason: {block_reason}"
                except Exception:
                    pass
                return None, "Unexpected response structure from API (AttributeError)."
            except Exception as e:
                st.error(f"🚨 Unexpected issue processing AI response: {e}")
                try:
                    st.code(f"Response object: {response}", language='text')
                except Exception:
                    pass
                return None, f"Unexpected response structure: {e}"
        except Exception as e:
            api_status.empty()
            st.error(f"🚨 An error occurred during the API call: {e}")
            error_msg = f"API call failed: {e}"
            # Most exceptions have no `.message` attribute; inspect str(e) instead.
            error_text = str(e)
            if "429" in error_text:
                error_msg = "API Quota Exceeded or Rate Limit hit."
            elif "API key not valid" in error_text:
                error_msg = "Invalid Gemini API Key."
            elif "blocked" in error_text.lower() or "block_reason: SAFETY" in error_text:
                error_msg = "Content blocked due to safety settings."
            return None, error_msg
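
# Note: depending on the SDK version, reading response.text on a blocked
# response may itself raise, which is why the handlers above fall back to
# response.prompt_feedback.block_reason to surface the block reason.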
def display_results(results_json, requested_analyses):
    """Renders the analysis results with pagination."""
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return
    display_config = {
        "generate_docs": {"key": "documentation_suggestions", "title": AVAILABLE_ANALYSES["generate_docs"], "fields": {"file": "File", "line": "Line"}},
        "find_bugs": {"key": "potential_bugs", "title": AVAILABLE_ANALYSES["find_bugs"], "fields": {"file": "File", "line": "Line", "severity": "Severity"}},
        "check_style": {"key": "style_issues", "title": AVAILABLE_ANALYSES["check_style"], "fields": {"file": "File", "line": "Line"}},
        "summarize_modules": {"key": "module_summaries", "title": AVAILABLE_ANALYSES["summarize_modules"], "fields": {"file": "File"}},
        "suggest_refactoring": {"key": "refactoring_suggestions", "title": AVAILABLE_ANALYSES["suggest_refactoring"], "fields": {"file": "File", "line": "Line", "area": "Area"}}
    }
    any_results_found = False
    for analysis_key in requested_analyses:
        if analysis_key in display_config:
            config = display_config[analysis_key]
            items = results_json.get(config["key"], [])
            total_items = len(items)
            st.subheader(f"{config['title']} ({total_items} found)")
            if items:
                any_results_found = True
                state_key = f"visible_{analysis_key}"
                if state_key not in st.session_state:
                    st.session_state[state_key] = RESULTS_PAGE_SIZE
                visible_count = st.session_state[state_key]
                items_to_display = items[:visible_count]
                for item in items_to_display:
                    details = []
                    for field_key, field_label in config["fields"].items():
                        value = item.get(field_key, 'N/A')
                        if value != 'N/A':
                            details.append(f"**{field_label}:** `{value}`" if field_key == 'file' else f"**{field_label}:** {value}")
                    st.markdown("- " + " - ".join(details))
                    if 'suggestion' in item:
                        st.code(item['suggestion'], language='text')
                    elif 'description' in item:
                        st.markdown(f" > {item['description']}")
                    elif 'summary' in item:
                        st.markdown(f" > {item['summary']}")
                if total_items > visible_count:
                    if st.button(f"Show more ({total_items - visible_count} remaining)", key=f"more_{analysis_key}"):
                        st.session_state[state_key] += RESULTS_PAGE_SIZE
                        st.rerun()
            else:
                st.markdown("_No items found for this category._")
            st.divider()
    if not any_results_found:
        st.info("No specific findings were identified in the analysis based on your selections.")
    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json"
    )
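
# Pagination pattern: the visible count per category lives in session state
# under `visible_<analysis_key>`; the "Show more" button bumps it by
# RESULTS_PAGE_SIZE and calls st.rerun() so the longer list re-renders.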
# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit Assistant")
st.markdown(f"Upload a codebase (`.zip`) for analysis via **{GEMINI_MODEL_NAME}**.")

with st.sidebar:
    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle("🧪 Enable Mock API Mode", value=st.session_state.mock_api_call, help="Use fake data instead of calling the Gemini API.")
    st.info("Mock API Mode ACTIVE" if st.session_state.mock_api_call else "Using REAL Gemini API")
    st.divider()
    st.header("🔎 Select Analyses")
    selected_analyses = [key for key, name in AVAILABLE_ANALYSES.items() if st.checkbox(name, value=True, key=f"cb_{key}")]
    st.divider()
    st.header("📄 How To Use")
    st.info("1. Set the API Key (if not in Mock Mode).\n2. Toggle Mock Mode if needed.\n3. Select analyses.\n4. Create & upload a **ZIP** of your code.\n5. Click 'Analyze Codebase'.\n6. Review the report.")
    st.info(f"Note: Only common code extensions are supported, and analysis is capped at roughly {MAX_PROMPT_TOKENS_ESTIMATE:,} estimated tokens.")
    st.divider()
    st.warning("⚠️ **Privacy:** Code is sent to the Google API if Mock Mode is OFF.")

uploaded_file = st.file_uploader("📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader",
                                 on_change=lambda: st.session_state.update(analysis_results=None, error_message=None, analysis_requested=False))
analysis_button_placeholder = st.empty()  # Placeholder for the button
results_placeholder = st.container()  # Container for results display
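
# The on_change callback above clears any previous report the moment a new
# file is chosen, so stale results are never shown against a fresh archive.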
if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    uploaded_file_bytes = uploaded_file.getvalue()
    file_id = f"{uploaded_file.name}-{uploaded_file.size}"
    code_files, total_chars, file_count, ignored_files = process_zip_file_cached(file_id, uploaded_file.size, uploaded_file_bytes)
    if code_files is not None:
        # estimate_token_count() expects text, so estimate from the character
        # count directly rather than passing the int through it.
        st.info(f"Found **{file_count}** relevant code files ({total_chars:,} characters). Est. tokens: ~{total_chars // 3:,}")
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')
        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
        if analysis_button_placeholder.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
            st.session_state.analysis_requested = True
            st.session_state.analysis_results = None
            st.session_state.error_message = None
            if not selected_analyses:
                st.warning("Please select analysis types.")
            elif file_count == 0:
                st.warning("No relevant code files found.")
            else:
                with results_placeholder:
                    with st.spinner(f"🚀 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... Please wait."):
                        analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                        if analysis_prompt and included_files_in_prompt:
                            results_json, error_msg = call_gemini_api(analysis_prompt)
                            st.session_state.analysis_results = results_json
                            st.session_state.error_message = error_msg
                        elif not included_files_in_prompt:
                            st.session_state.error_message = "Could not proceed: No files included (check token limits/errors)."
                        else:
                            st.session_state.error_message = "Failed to generate analysis prompt."
            st.rerun()
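
# st.rerun() ends the click handler immediately; the block below then renders
# the stored results (or error) on the fresh run.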
if st.session_state.analysis_requested:
    with results_placeholder:
        st.divider()
        if st.session_state.error_message:
            st.error(f"Analysis Failed: {st.session_state.error_message}")
            if isinstance(st.session_state.analysis_results, dict) and "raw_response" in st.session_state.analysis_results:
                st.subheader("Raw AI Response")
                st.code(st.session_state.analysis_results["raw_response"], language='text')
        elif st.session_state.analysis_results:
            display_results(st.session_state.analysis_results, selected_analyses)
        else:
            st.info("Analysis initiated, but no results or errors were stored. Please try again.")
elif not uploaded_file:
    results_placeholder.info("Upload a ZIP file containing your source code to begin.")

results_placeholder.divider()
results_placeholder.markdown("_Assistant powered by Google Gemini._")