"""Streamlit app that sends an uploaded ZIP of source code to Gemini for review.

The script extracts recognised code files from the archive, builds a single
prompt containing as many files as fit within an estimated token budget, asks
the model for a JSON report and renders that report with simple pagination.
A mock mode skips the API call and returns canned data.
"""

import io
import json
import os
import time
import zipfile
from pathlib import Path

import google.generativeai as genai
import streamlit as st

# --- Configuration ---
GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"
MAX_PROMPT_TOKENS_ESTIMATE = 800000
RESULTS_PAGE_SIZE = 25  # Number of items to show per category initially
AVAILABLE_ANALYSES = {
    "generate_docs": "Generate Missing Docstrings/Comments",
    "find_bugs": "Identify Potential Bugs & Anti-patterns",
    "check_style": "Check Style Guide Compliance (General)",
    "summarize_modules": "Summarize Complex Modules/Files",
    "suggest_refactoring": "Suggest Refactoring Opportunities",
}
CODE_EXTENSIONS = {
    '.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb', '.php',
    '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql',
}

# JSON fragment requested from the model for each analysis key. Insertion
# order is the order in which the fragments appear in the prompt.
_JSON_STRUCTURE_SNIPPETS = {
    "generate_docs": ' "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]',
    "find_bugs": ' "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]',
    "check_style": ' "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]',
    "summarize_modules": ' "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]',
    "suggest_refactoring": ' "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]',
}

# --- Session State Initialization ---
if 'mock_api_call' not in st.session_state:
    st.session_state.mock_api_call = False
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None  # Store results here
if 'error_message' not in st.session_state:
    st.session_state.error_message = None
if 'analysis_requested' not in st.session_state:
    st.session_state.analysis_requested = False  # Flag to know when analysis is done

# --- Gemini API Setup ---
model = None


def initialize_gemini_model():
    """Initialize the module-level Gemini model unless running in mock mode.

    Returns:
        True when the app may proceed (mock mode, or a model is available).
        On a missing key or SDK failure an error is shown and ``st.stop()``
        is called; ``False`` is returned only if that call returns.
    """
    global model
    if st.session_state.mock_api_call:
        # Mock mode never touches the SDK.
        return True
    if model is not None:
        # Already initialized earlier in this process.
        return True
    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
            st.stop()
        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        print("Gemini Model Initialized.")
        return True
    except Exception as e:
        st.error(f"🚨 Error initializing Gemini SDK: {e}")
        st.stop()
    return False


# --- Helper Functions ---
def estimate_token_count(text):
    """Roughly estimate a token count, assuming ~3 characters per token.

    Args:
        text: Either the text itself or an already computed character count
            (an ``int``). The UI passes a running character total, so both
            forms must be accepted.

    Returns:
        The estimated number of tokens as an ``int``.
    """
    char_count = text if isinstance(text, int) else len(text)
    return char_count // 3


# --- OPTIMIZATION: Cache ZIP processing ---
@st.cache_data(max_entries=5)  # Cache results for recent uploads
def process_zip_file_cached(file_id, file_size, file_content_bytes):
    """Extract the text of every recognised code file from a ZIP archive.

    Args:
        file_id: Caller-supplied identifier for the upload; used here only in
            the debug print (and as part of the cache key).
        file_size: Size of the upload; not read by the body, part of the
            cache key.
        file_content_bytes: Raw bytes of the ZIP archive.

    Returns:
        A tuple ``(code_files, total_chars, file_count, ignored_files)`` where
        ``code_files`` maps archive member names to decoded text. On an
        unreadable archive the tuple is ``(None, 0, 0, [])``.
    """
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    status_placeholder = st.empty()  # For progress bar
    progress_bar = status_placeholder.progress(0)
    try:
        with zipfile.ZipFile(io.BytesIO(file_content_bytes), 'r') as zip_ref:
            members = zip_ref.infolist()
            total_members = len(members)
            for i, member in enumerate(members):
                # Update progress bar periodically (every 10 files)
                if i % 10 == 0:
                    progress_bar.progress(int((i / total_members) * 100))

                file_path = Path(member.filename)
                # Silently skip directories, hidden paths and anything with a
                # double underscore in its name.
                # NOTE(review): the '__' test also excludes files such as
                # __init__.py, not only __pycache__/__MACOSX -- confirm intent.
                if (
                    member.is_dir()
                    or any(part.startswith('.') for part in file_path.parts)
                    or '__' in member.filename
                ):
                    continue

                if file_path.suffix.lower() not in CODE_EXTENSIONS:
                    ignored_files.append(
                        f"{member.filename} (Skipped Extension: {file_path.suffix})"
                    )
                    continue

                try:
                    with zip_ref.open(member) as file:
                        file_bytes = file.read()
                except Exception as read_err:
                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                    continue

                try:
                    content = file_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this fallback cannot fail.
                    content = file_bytes.decode('latin-1')

                code_files[member.filename] = content
                total_chars += len(content)
                file_count += 1

        progress_bar.progress(100)  # Ensure it completes
        status_placeholder.empty()  # Remove progress bar after completion
    except zipfile.BadZipFile:
        status_placeholder.empty()
        st.error("🚨 Invalid or corrupted ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        status_placeholder.empty()
        st.error(f"🚨 Error processing ZIP file: {e}")
        return None, 0, 0, []

    if file_count == 0 and not ignored_files:
        st.warning("No files with recognized code extensions found in the ZIP.")
    elif file_count == 0 and ignored_files:
        st.warning("No files with recognized code extensions found. Some files were skipped.")

    print(f"Cache miss or new file: Processed ZIP {file_id}")  # Debug print
    return code_files, total_chars, file_count, ignored_files


def construct_analysis_prompt(code_files_dict, requested_analyses):
    """Build the Gemini prompt: code listing plus the requested JSON layout.

    Files are added in dictionary order until the next one would push the
    running estimate past ``MAX_PROMPT_TOKENS_ESTIMATE``; the remaining files
    are left out and a warning is shown.

    Args:
        code_files_dict: Mapping of file path to file content.
        requested_analyses: Collection of keys from ``AVAILABLE_ANALYSES``.

    Returns:
        ``(full_prompt, included_files)``, or ``(None, [])`` when not even the
        first file fits within the estimate.
    """
    header = (
        "Analyze the following codebase provided as a collection of file "
        "paths and their content.\n\n"
    )
    current_token_estimate = estimate_token_count(header)
    included_files = []
    code_segments = []

    # Provide feedback for large codebases
    prompt_status = st.empty()
    if len(code_files_dict) > 50:
        prompt_status.write("Constructing prompt (processing files)...")

    for filename, content in code_files_dict.items():
        segment = (
            f"--- START FILE: {filename} ---\n"
            f"{content}\n"
            f"--- END FILE: {filename} ---\n\n"
        )
        segment_token_estimate = estimate_token_count(segment)
        if current_token_estimate + segment_token_estimate > MAX_PROMPT_TOKENS_ESTIMATE:
            st.warning(
                f"⚠️ Codebase may exceed context window estimate "
                f"(~{MAX_PROMPT_TOKENS_ESTIMATE} tokens). Analysis performed only on "
                f"the first {len(included_files)} files ({current_token_estimate:,} tokens)."
            )
            break
        code_segments.append(segment)
        current_token_estimate += segment_token_estimate
        included_files.append(filename)

    prompt_status.empty()  # Clear status message

    if not included_files:
        st.error("🚨 No code files could be included within the estimated token limit.")
        return None, []

    # Describe the expected JSON structure for the selected analyses only.
    structure_parts = [
        snippet
        for analysis_key, snippet in _JSON_STRUCTURE_SNIPPETS.items()
        if analysis_key in requested_analyses
    ]
    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"

    file_list = ", ".join(included_files)
    prompt_footer = (
        "\n**Analysis Task:**\n"
        "Perform the analyses corresponding to the keys present in the JSON "
        "structure below, based *only* on the provided code files "
        f"({file_list}).\n\n"
        "**Output Format:**\n"
        "Respond ONLY with a single, valid JSON object adhering strictly to the "
        "following structure. If no issues/suggestions are found for a category, "
        "provide an empty list `[]`. Do not include explanations outside the "
        "JSON structure.\n\n"
        f"{json_structure_description}\n\n"
        "**JSON Output Only:**\n"
    )

    full_prompt = "".join([header, *code_segments, prompt_footer])
    return full_prompt, included_files


def _extract_json_object(raw_text):
    """Parse the outermost ``{...}`` span of *raw_text*.

    Searching for the first ``{`` and the last ``}`` also skips any Markdown
    code fence the model wrapped around the JSON.

    Returns:
        The parsed object, or ``None`` when no ``{...}`` span exists.

    Raises:
        json.JSONDecodeError: If the span is not valid JSON.
    """
    start = raw_text.find('{')
    end = raw_text.rfind('}')
    if start == -1 or end < start:
        return None
    return json.loads(raw_text[start:end + 1])


def _prompt_block_reason(response):
    """Return ``response.prompt_feedback.block_reason`` if present and truthy."""
    feedback = getattr(response, "prompt_feedback", None)
    return getattr(feedback, "block_reason", None) or None


def _classify_api_error(exc):
    """Map an exception raised by the API call to a user-facing message."""
    # Look at both an optional ``message`` attribute and the string form; the
    # attribute is formatted rather than searched directly so a non-string
    # value cannot raise here.
    detail = f"{getattr(exc, 'message', '')} {exc}"
    if "429" in detail:
        return "API Quota Exceeded or Rate Limit hit."
    if "API key not valid" in detail:
        return "Invalid Gemini API Key."
    if "blocked" in detail.lower() or "block_reason: SAFETY" in detail:
        return "Content blocked due to safety settings."
    return f"API call failed: {exc}"


def _parse_model_response(response):
    """Turn a Gemini response object into ``(insights, error_message)``."""
    try:
        insights = _extract_json_object(response.text.strip())
        if insights is not None:
            return insights, None
        st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in response.")
        return (
            {"raw_response": response.text},
            "AI response did not contain clear JSON object, showing raw text.",
        )
    except json.JSONDecodeError as json_err:
        st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
        st.code(response.text, language='text')
        return None, f"AI response was not valid JSON: {json_err}"
    except AttributeError:
        st.error("🚨 Unexpected API response structure (AttributeError).")
        st.code(f"Response object: {response}", language='text')
        block_reason = _prompt_block_reason(response)
        if block_reason:
            return None, f"Content blocked by API. Reason: {block_reason}"
        return None, "Unexpected response structure from API (AttributeError)."
    except Exception as e:
        # The SDK documents ``response.text`` as raising ValueError when the
        # response carries no usable candidate text, so report a block reason
        # here as well when one is available.
        block_reason = _prompt_block_reason(response)
        if block_reason:
            return None, f"Content blocked by API. Reason: {block_reason}"
        st.error(f"🚨 Unexpected issue processing AI response: {e}")
        try:
            st.code(f"Response object: {response}", language='text')
        except Exception:
            # Rendering the object is best-effort diagnostics only.
            pass
        return None, f"Unexpected response structure: {e}"


def call_gemini_api(prompt):
    """Call the Gemini API, or return mock data when mock mode is enabled.

    Args:
        prompt: The full prompt text; a falsy value is treated as a failure.

    Returns:
        ``(insights, error_message)``. ``insights`` is the parsed JSON report,
        ``{"raw_response": text}`` when no JSON object could be located, or
        ``None`` on failure. ``error_message`` is ``None`` on success.
    """
    if not prompt:
        return None, "Prompt generation failed."

    # MOCK MODE LOGIC
    if st.session_state.mock_api_call:
        st.info("MOCK MODE: Simulating API call...")
        st.write("...")  # Minimal feedback in mock mode
        time.sleep(1)  # Shorter mock delay
        mock_response = {
            "documentation_suggestions": [{
                "file": "mock/core.py",
                "line": 15,
                "suggestion": "def process_data(data):\n \"\"\"Processes the input data using mock logic.\"\"\"",
            }],
            "potential_bugs": [{
                "file": "mock/utils.py",
                "line": 22,
                "description": "Potential division by zero if denominator is not checked.",
                "severity": "Medium",
            }],
            "style_issues": [],
            "module_summaries": [],
            "refactoring_suggestions": [],
        }
        st.success("Mock response generated.")
        return mock_response, None

    # REAL API CALL LOGIC
    if not initialize_gemini_model():
        return None, "Gemini Model Initialization Failed."
    if model is None:
        return None, "Gemini model not available."

    # Created before the try block so the error handler can always clear it.
    api_status = st.empty()
    try:
        token_estimate = estimate_token_count(prompt)
        api_status.info(
            f"📡 Sending request to {GEMINI_MODEL_NAME} "
            f"(Estimated prompt tokens: {token_estimate:,})... "
            "This can take several minutes depending on code size and model load."
        )
        start_time = time.time()
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            ],
        )
        end_time = time.time()
        api_status.success(
            f"✅ Response received from AI in {end_time - start_time:.2f} seconds."
        )
        time.sleep(1)
        api_status.empty()
        return _parse_model_response(response)
    except Exception as e:
        api_status.empty()
        st.error(f"🚨 An error occurred during API call: {e}")
        return None, _classify_api_error(e)


def display_results(results_json, requested_analyses):
    """Render the analysis report, one paginated section per requested analysis.

    Args:
        results_json: The parsed report (a dict keyed by result category), or
            a ``{"raw_response": ...}`` dict when parsing failed.
        requested_analyses: Keys from ``AVAILABLE_ANALYSES`` to render.
    """
    st.header("📊 Analysis Report")

    if not isinstance(results_json, dict):
        st.error("Invalid results format received.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return

    display_config = {
        "generate_docs": {
            "key": "documentation_suggestions",
            "title": AVAILABLE_ANALYSES["generate_docs"],
            "fields": {"file": "File", "line": "Line"},
        },
        "find_bugs": {
            "key": "potential_bugs",
            "title": AVAILABLE_ANALYSES["find_bugs"],
            "fields": {"file": "File", "line": "Line", "severity": "Severity"},
        },
        "check_style": {
            "key": "style_issues",
            "title": AVAILABLE_ANALYSES["check_style"],
            "fields": {"file": "File", "line": "Line"},
        },
        "summarize_modules": {
            "key": "module_summaries",
            "title": AVAILABLE_ANALYSES["summarize_modules"],
            "fields": {"file": "File"},
        },
        "suggest_refactoring": {
            "key": "refactoring_suggestions",
            "title": AVAILABLE_ANALYSES["suggest_refactoring"],
            "fields": {"file": "File", "line": "Line", "area": "Area"},
        },
    }

    any_results_found = False
    for analysis_key in requested_analyses:
        config = display_config.get(analysis_key)
        if config is None:
            continue

        # The model's output is untrusted: tolerate a missing/null category,
        # a non-list value, and entries that are not objects.
        raw_items = results_json.get(config["key"])
        items = (
            [entry for entry in raw_items if isinstance(entry, dict)]
            if isinstance(raw_items, list)
            else []
        )
        total_items = len(items)
        st.subheader(f"{config['title']} ({total_items} found)")

        if items:
            any_results_found = True
            state_key = f"visible_{analysis_key}"
            if state_key not in st.session_state:
                st.session_state[state_key] = RESULTS_PAGE_SIZE
            visible_count = st.session_state[state_key]

            for item in items[:visible_count]:
                details = []
                for field_key, field_label in config["fields"].items():
                    value = item.get(field_key, 'N/A')
                    if value == 'N/A':
                        continue
                    if field_key == 'file':
                        details.append(f"**{field_label}:** `{value}`")
                    else:
                        details.append(f"**{field_label}:** {value}")
                st.markdown("- " + " - ".join(details))

                if 'suggestion' in item:
                    st.code(item['suggestion'], language='text')
                elif 'description' in item:
                    st.markdown(f" > {item['description']}")
                elif 'summary' in item:
                    st.markdown(f" > {item['summary']}")

            if total_items > visible_count:
                remaining = total_items - visible_count
                if st.button(f"Show more ({remaining} remaining)", key=f"more_{analysis_key}"):
                    st.session_state[state_key] += RESULTS_PAGE_SIZE
                    st.rerun()
        else:
            st.markdown("_No items found for this category._")
        st.divider()

    if not any_results_found:
        st.info("No specific findings were identified in the analysis based on your selections.")

    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json",
    )


def _reset_analysis_state():
    """Forget the previous analysis when the uploaded file changes."""
    st.session_state.update(
        analysis_results=None, error_message=None, analysis_requested=False
    )


# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit Assistant")
st.markdown(f"Upload codebase (`.zip`) for analysis via **{GEMINI_MODEL_NAME}**.")

with st.sidebar:
    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode",
        value=st.session_state.mock_api_call,
        help="Use fake data instead of calling Gemini API.",
    )
    st.info("Mock API Mode ACTIVE" if st.session_state.mock_api_call else "Using REAL Gemini API")
    st.divider()
    st.header("🔎 Select Analyses")
    selected_analyses = [
        key
        for key, name in AVAILABLE_ANALYSES.items()
        if st.checkbox(name, value=True, key=f"cb_{key}")
    ]
    st.divider()
    st.header("📄 How To Use")
    st.info(
        "1. Set API Key (if not in Mock Mode).\n"
        "2. Toggle Mock Mode if needed.\n"
        "3. Select analyses.\n"
        "4. Create & Upload a **ZIP** of your code.\n"
        "5. Click 'Analyze Codebase'.\n"
        "6. Review the report."
    )
    st.info(
        "Note: Only common code extensions are supported. Analysis is limited by "
        f"token estimates (~{MAX_PROMPT_TOKENS_ESTIMATE:,} estimated tokens)."
    )
    st.divider()
    st.warning("⚠️ **Privacy:** Code is sent to the Google API if Mock Mode is OFF.")

uploaded_file = st.file_uploader(
    "📁 Upload Codebase ZIP File",
    type=['zip'],
    key="file_uploader",
    on_change=_reset_analysis_state,
)
analysis_button_placeholder = st.empty()  # Placeholder for the button
results_placeholder = st.container()  # Container for results display

if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    uploaded_file_bytes = uploaded_file.getvalue()
    file_id = f"{uploaded_file.name}-{uploaded_file.size}"
    code_files, total_chars, file_count, ignored_files = process_zip_file_cached(
        file_id, uploaded_file.size, uploaded_file_bytes
    )

    if code_files is not None:
        st.info(
            f"Found **{file_count}** relevant code files ({total_chars:,} characters). "
            f"Est. tokens: ~{estimate_token_count(total_chars):,}"
        )
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')

        analyze_button_disabled = (not selected_analyses or file_count == 0)
        analyze_button_label = (
            "Analyze Codebase"
            if not analyze_button_disabled
            else "Select Analyses or Upload Valid Code"
        )
        if analysis_button_placeholder.button(
            analyze_button_label, type="primary", disabled=analyze_button_disabled
        ):
            st.session_state.analysis_requested = True
            st.session_state.analysis_results = None
            st.session_state.error_message = None
            # Start every new report on its first page.
            for analysis_key in AVAILABLE_ANALYSES:
                st.session_state[f"visible_{analysis_key}"] = RESULTS_PAGE_SIZE

            if not selected_analyses:
                st.warning("Please select analysis types.")
            elif file_count == 0:
                st.warning("No relevant code files found.")
            else:
                backend_label = (
                    'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME
                )
                with results_placeholder:
                    with st.spinner(
                        f"🚀 Preparing prompt & contacting AI ({backend_label})... Please wait."
                    ):
                        analysis_prompt, included_files_in_prompt = construct_analysis_prompt(
                            code_files, selected_analyses
                        )
                        if analysis_prompt and included_files_in_prompt:
                            results_json, error_msg = call_gemini_api(analysis_prompt)
                            st.session_state.analysis_results = results_json
                            st.session_state.error_message = error_msg
                        elif not included_files_in_prompt:
                            st.session_state.error_message = (
                                "Could not proceed: No files included (check token limits/errors)."
                            )
                        else:
                            st.session_state.error_message = "Failed to generate analysis prompt."
            # Rerun so the report is rendered from session state on a clean page.
            st.rerun()

if st.session_state.analysis_requested:
    with results_placeholder:
        st.divider()
        if st.session_state.error_message:
            st.error(f"Analysis Failed: {st.session_state.error_message}")
            stored_results = st.session_state.analysis_results
            if isinstance(stored_results, dict) and "raw_response" in stored_results:
                st.subheader("Raw AI Response")
                st.code(stored_results["raw_response"], language='text')
        elif st.session_state.analysis_results:
            display_results(st.session_state.analysis_results, selected_analyses)
        else:
            st.info("Analysis initiated, but no results or errors were stored. Please try again.")
elif not uploaded_file:
    results_placeholder.info("Upload a ZIP file containing your source code to begin.")

results_placeholder.divider()
results_placeholder.markdown("_Assistant powered by Google Gemini._")