import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
from pathlib import Path
import time
# --- Configuration ---
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"
MAX_PROMPT_TOKENS_ESTIMATE = 800000
RESULTS_PAGE_SIZE = 25  # Number of items to show per category initially

AVAILABLE_ANALYSES = {
    "generate_docs": "Generate Missing Docstrings/Comments",
    "find_bugs": "Identify Potential Bugs & Anti-patterns",
    "check_style": "Check Style Guide Compliance (General)",
    "summarize_modules": "Summarize Complex Modules/Files",
    "suggest_refactoring": "Suggest Refactoring Opportunities",
}

CODE_EXTENSIONS = {
    '.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb', '.php',
    '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql',
}
# --- Session State Initialization ---
if 'mock_api_call' not in st.session_state:
    st.session_state.mock_api_call = False
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None  # Stores the parsed JSON results
if 'error_message' not in st.session_state:
    st.session_state.error_message = None
if 'analysis_requested' not in st.session_state:
    st.session_state.analysis_requested = False  # Set once the user has triggered an analysis
# --- Gemini API Setup ---
model = None

def initialize_gemini_model():
    """Initializes the Gemini model unless running in mock mode."""
    global model
    if model is None and not st.session_state.mock_api_call:
        try:
            if 'GEMINI_API_KEY' not in st.secrets:
                st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
                st.stop()
            genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
            model = genai.GenerativeModel(GEMINI_MODEL_NAME)
            print("Gemini Model Initialized.")
            return True
        except Exception as e:
            st.error(f"🚨 Error initializing Gemini SDK: {e}")
            st.stop()
            return False
    elif st.session_state.mock_api_call:
        # Mock mode: skip real initialization entirely.
        return True
    elif model is not None:
        # Model already initialized on a previous run.
        return True
    return False
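# For reference, `st.secrets` above reads from `.streamlit/secrets.toml`.
# A minimal example layout (the value below is a placeholder, not a real key):
#
#   # .streamlit/secrets.toml
#   GEMINI_API_KEY = "your-api-key-here"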
# --- Helper Functions ---
def estimate_token_count(text):
    """Roughly estimate token count (assuming ~3 characters per token)."""
    return len(text) // 3
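# Worked example of the heuristic above:
#   estimate_token_count("def f(): pass")  # 13 characters // 3 -> 4 tokens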
# --- OPTIMIZATION: Cache ZIP processing ---
@st.cache_data(max_entries=5)  # Cache results for recent uploads; keyed on all arguments
def process_zip_file_cached(file_id, file_size, file_content_bytes):
    """Extracts code files and their content from an uploaded ZIP. Cached, so
    re-uploading the same archive (same id, size, and bytes) skips re-extraction."""
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    status_placeholder = st.empty()  # Holds the progress bar
    progress_bar = status_placeholder.progress(0)
    try:
        with zipfile.ZipFile(io.BytesIO(file_content_bytes), 'r') as zip_ref:
            members = zip_ref.infolist()
            total_members = len(members)
            for i, member in enumerate(members):
                # Update the progress bar periodically (every 10 entries)
                if i % 10 == 0:
                    progress_bar.progress(int((i / total_members) * 100))
                # Skip directories, hidden paths (.git, .venv, ...), and dunder paths (__pycache__, ...)
                if member.is_dir() or any(part.startswith('.') for part in Path(member.filename).parts) or '__' in member.filename:
                    continue
                file_path = Path(member.filename)
                if file_path.suffix.lower() in CODE_EXTENSIONS:
                    try:
                        with zip_ref.open(member) as file:
                            file_bytes = file.read()
                        try:
                            content = file_bytes.decode('utf-8')
                        except UnicodeDecodeError:
                            try:
                                content = file_bytes.decode('latin-1')
                            except Exception as decode_err:
                                ignored_files.append(f"{member.filename} (Decode Error: {decode_err})")
                                continue
                        code_files[member.filename] = content
                        total_chars += len(content)
                        file_count += 1
                    except Exception as read_err:
                        ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                else:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
            progress_bar.progress(100)  # Ensure the bar completes
            status_placeholder.empty()  # Remove the progress bar when done
    except zipfile.BadZipFile:
        status_placeholder.empty()
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        status_placeholder.empty()
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []
    if file_count == 0:
        if ignored_files:
            st.warning("No files with recognized code extensions found; some files were skipped (see below).")
        else:
            st.warning("No files with recognized code extensions found in the ZIP.")
    print(f"Cache miss or new file: processed ZIP {file_id}")  # Debug print
    return code_files, total_chars, file_count, ignored_files
def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Constructs the Gemini prompt: code content plus the requested JSON structure."""
    prompt_parts = ["Analyze the following codebase provided as a collection of file paths and their content.\n\n"]
    current_token_estimate = estimate_token_count(prompt_parts[0])
    included_files = []
    code_segments = []  # Joined once at the end; cheaper than repeated string concatenation
    # Give feedback while building the prompt for large codebases
    prompt_status = st.empty()
    if len(code_files_dict) > 50:
        prompt_status.write("Constructing prompt (processing files)...")
    for filename, content in code_files_dict.items():
        segment = f"--- START FILE: {filename} ---\n{content}\n--- END FILE: {filename} ---\n\n"
        segment_token_estimate = estimate_token_count(segment)
        if current_token_estimate + segment_token_estimate <= MAX_PROMPT_TOKENS_ESTIMATE:
            code_segments.append(segment)
            current_token_estimate += segment_token_estimate
            included_files.append(filename)
        else:
            st.warning(f"⚠️ Codebase may exceed the context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE:,} tokens). Analysis will cover only the first {len(included_files)} files (~{current_token_estimate:,} tokens).")
            break
    prompt_status.empty()  # Clear the status message
    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []
    prompt_parts.append("".join(code_segments))
    # Describe the expected JSON structure based on the selected analyses
    structure_parts = []
    if "generate_docs" in requested_analyses:
        structure_parts.append('  "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]')
    if "find_bugs" in requested_analyses:
        structure_parts.append('  "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]')
    if "check_style" in requested_analyses:
        structure_parts.append('  "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]')
    if "summarize_modules" in requested_analyses:
        structure_parts.append('  "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]')
    if "suggest_refactoring" in requested_analyses:
        structure_parts.append('  "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]')
    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"
    prompt_footer = f"""
**Analysis Task:**
Perform the analyses corresponding to the keys present in the JSON structure below, based *only* on the provided code files ({', '.join(included_files)}).

**Output Format:**
Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.

{json_structure_description}

**JSON Output Only:**
"""
    prompt_parts.append(prompt_footer)
    full_prompt = "".join(prompt_parts)
    return full_prompt, included_files
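# For reference, each included file is wrapped in markers like the following
# (illustrative snippet of the generated prompt text; the path is hypothetical):
#
#   --- START FILE: src/utils.py ---
#   <file content>
#   --- END FILE: src/utils.py ---
#
# followed by the analysis task and the JSON structure footer built above.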
def call_gemini_api(prompt):
    """Calls the Gemini API, or returns mock data when mock mode is enabled."""
    if not prompt:
        return None, "Prompt generation failed."

    # MOCK MODE LOGIC
    if st.session_state.mock_api_call:
        st.info("MOCK MODE: Simulating API call...")
        time.sleep(1)  # Short mock delay
        mock_response = {
            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n    \"\"\"Processes the input data using mock logic.\"\"\""}],
            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
            "style_issues": [],
            "module_summaries": [],
            "refactoring_suggestions": [],
        }
        st.success("Mock response generated.")
        return mock_response, None

    # REAL API CALL LOGIC
    if not initialize_gemini_model():
        return None, "Gemini Model Initialization Failed."
    if model is None:
        return None, "Gemini model not available."
    try:
        api_status = st.empty()
        token_estimate = estimate_token_count(prompt)
        api_status.info(f"📡 Sending request to {GEMINI_MODEL_NAME} (estimated prompt tokens: {token_estimate:,})... This can take several minutes depending on code size and model load.")
        start_time = time.time()
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            ],
        )
        end_time = time.time()
        api_status.success(f"✅ Response received from AI in {end_time - start_time:.2f} seconds.")
        time.sleep(1)
        api_status.empty()
        try:
            # Strip optional markdown code fences, then isolate the outermost JSON object.
            json_response_text = response.text.strip()
            if json_response_text.startswith("```json"):
                json_response_text = json_response_text[7:]
            if json_response_text.startswith("```"):
                json_response_text = json_response_text[3:]
            if json_response_text.endswith("```"):
                json_response_text = json_response_text[:-3]
            json_start = json_response_text.find('{')
            json_end = json_response_text.rfind('}') + 1  # 0 when no '}' is found
            if json_start != -1 and json_end > json_start:
                insights = json.loads(json_response_text[json_start:json_end])
                return insights, None
            else:
                st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in the response.")
                return {"raw_response": response.text}, "AI response did not contain a clear JSON object; showing raw text."
        except json.JSONDecodeError as json_err:
            st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
            st.code(response.text, language='text')
            return None, f"AI response was not valid JSON: {json_err}"
        except AttributeError:
            st.error("🚨 Unexpected API response structure (AttributeError).")
            st.code(f"Response object: {response}", language='text')
            try:
                block_reason = response.prompt_feedback.block_reason
                if block_reason:
                    return None, f"Content blocked by API. Reason: {block_reason}"
            except Exception:
                pass
            return None, "Unexpected response structure from API (AttributeError)."
        except Exception as e:
            st.error(f"🚨 Unexpected issue processing AI response: {e}")
            try:
                st.code(f"Response object: {response}", language='text')
            except Exception:
                pass
            return None, f"Unexpected response structure: {e}"
    except Exception as e:
        api_status.empty()
        st.error(f"🚨 An error occurred during the API call: {e}")
        # SDK exceptions don't reliably expose a `.message` attribute; inspect str(e) instead.
        error_details = str(e)
        error_msg = f"API call failed: {e}"
        if "429" in error_details:
            error_msg = "API quota exceeded or rate limit hit."
        elif "API key not valid" in error_details:
            error_msg = "Invalid Gemini API Key."
        elif "blocked" in error_details.lower() or "block_reason: SAFETY" in error_details:
            error_msg = "Content blocked due to safety settings."
        return None, error_msg
def display_results(results_json, requested_analyses):
    """Renders the analysis results with per-category pagination."""
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return
    display_config = {
        "generate_docs": {"key": "documentation_suggestions", "title": AVAILABLE_ANALYSES["generate_docs"], "fields": {"file": "File", "line": "Line"}},
        "find_bugs": {"key": "potential_bugs", "title": AVAILABLE_ANALYSES["find_bugs"], "fields": {"file": "File", "line": "Line", "severity": "Severity"}},
        "check_style": {"key": "style_issues", "title": AVAILABLE_ANALYSES["check_style"], "fields": {"file": "File", "line": "Line"}},
        "summarize_modules": {"key": "module_summaries", "title": AVAILABLE_ANALYSES["summarize_modules"], "fields": {"file": "File"}},
        "suggest_refactoring": {"key": "refactoring_suggestions", "title": AVAILABLE_ANALYSES["suggest_refactoring"], "fields": {"file": "File", "line": "Line", "area": "Area"}},
    }
    any_results_found = False
    for analysis_key in requested_analyses:
        if analysis_key not in display_config:
            continue
        config = display_config[analysis_key]
        items = results_json.get(config["key"], [])
        total_items = len(items)
        st.subheader(f"{config['title']} ({total_items} found)")
        if items:
            any_results_found = True
            # Paginate: remember how many items are currently visible for this category
            state_key = f"visible_{analysis_key}"
            if state_key not in st.session_state:
                st.session_state[state_key] = RESULTS_PAGE_SIZE
            visible_count = st.session_state[state_key]
            for item in items[:visible_count]:
                details = []
                for field_key, field_label in config["fields"].items():
                    value = item.get(field_key, 'N/A')
                    if value != 'N/A':
                        details.append(f"**{field_label}:** `{value}`" if field_key == 'file' else f"**{field_label}:** {value}")
                st.markdown("- " + " - ".join(details))
                if 'suggestion' in item:
                    st.code(item['suggestion'], language='text')
                elif 'description' in item:
                    st.markdown(f"  > {item['description']}")
                elif 'summary' in item:
                    st.markdown(f"  > {item['summary']}")
            if total_items > visible_count:
                if st.button(f"Show more ({total_items - visible_count} remaining)", key=f"more_{analysis_key}"):
                    st.session_state[state_key] += RESULTS_PAGE_SIZE
                    st.rerun()
        else:
            st.markdown("_No items found for this category._")
        st.divider()
    if not any_results_found:
        st.info("No specific findings were identified in the analysis based on your selections.")
    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json",
    )
# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit Assistant")
st.markdown(f"Upload a codebase (`.zip`) for analysis via **{GEMINI_MODEL_NAME}**.")

with st.sidebar:
    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle("🧪 Enable Mock API Mode", value=st.session_state.mock_api_call, help="Use fake data instead of calling the Gemini API.")
    st.info("Mock API Mode ACTIVE" if st.session_state.mock_api_call else "Using REAL Gemini API")
    st.divider()
    st.header("🔍 Select Analyses")
    selected_analyses = [key for key, name in AVAILABLE_ANALYSES.items() if st.checkbox(name, value=True, key=f"cb_{key}")]
    st.divider()
    st.header("📖 How To Use")
    st.info("1. Set your API key (if not in Mock Mode).\n2. Toggle Mock Mode if needed.\n3. Select analyses.\n4. Create and upload a **ZIP** of your code.\n5. Click 'Analyze Codebase'.\n6. Review the report.")
    st.info(f"Note: only common code extensions are supported, and analysis is limited to roughly {MAX_PROMPT_TOKENS_ESTIMATE:,} estimated tokens.")
    st.divider()
    st.warning("⚠️ **Privacy:** Code is sent to the Google API when Mock Mode is OFF.")

uploaded_file = st.file_uploader(
    "📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader",
    # Reset any previous results when a new file is chosen
    on_change=lambda: st.session_state.update(analysis_results=None, error_message=None, analysis_requested=False),
)
analysis_button_placeholder = st.empty()  # Placeholder for the Analyze button
results_placeholder = st.container()  # Container for the results display
if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    uploaded_file_bytes = uploaded_file.getvalue()
    file_id = f"{uploaded_file.name}-{uploaded_file.size}"
    code_files, total_chars, file_count, ignored_files = process_zip_file_cached(file_id, uploaded_file.size, uploaded_file_bytes)
    if code_files is not None:
        # total_chars is an int, so apply the ~3-chars-per-token heuristic directly
        # (estimate_token_count() expects a string).
        st.info(f"Found **{file_count}** relevant code files ({total_chars:,} characters). Est. tokens: ~{total_chars // 3:,}")
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')
        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
        if analysis_button_placeholder.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
            st.session_state.analysis_requested = True
            st.session_state.analysis_results = None
            st.session_state.error_message = None
            if not selected_analyses:
                st.warning("Please select at least one analysis type.")
            elif file_count == 0:
                st.warning("No relevant code files found.")
            else:
                with results_placeholder:
                    with st.spinner(f"🤖 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... Please wait."):
                        analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                        if analysis_prompt and included_files_in_prompt:
                            results_json, error_msg = call_gemini_api(analysis_prompt)
                            st.session_state.analysis_results = results_json
                            st.session_state.error_message = error_msg
                        elif not included_files_in_prompt:
                            st.session_state.error_message = "Could not proceed: no files included (check token limits/errors)."
                        else:
                            st.session_state.error_message = "Failed to generate analysis prompt."
                st.rerun()

if st.session_state.analysis_requested:
    with results_placeholder:
        st.divider()
        if st.session_state.error_message:
            st.error(f"Analysis Failed: {st.session_state.error_message}")
            if isinstance(st.session_state.analysis_results, dict) and "raw_response" in st.session_state.analysis_results:
                st.subheader("Raw AI Response")
                st.code(st.session_state.analysis_results["raw_response"], language='text')
        elif st.session_state.analysis_results:
            display_results(st.session_state.analysis_results, selected_analyses)
        else:
            st.info("Analysis initiated, but no results or errors were stored. Please try again.")
elif not uploaded_file:
    results_placeholder.info("Upload a ZIP file containing your source code to begin.")

results_placeholder.divider()
results_placeholder.markdown("_Assistant powered by Google Gemini._")
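# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py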