"""Streamlit app: upload a codebase as a ZIP archive and have a Gemini model audit it.

Produces documentation suggestions, potential bugs, style issues, module
summaries, and refactoring ideas as a structured JSON report.
"""

import streamlit as st
import google.generativeai as genai
import zipfile
import io
import json
from pathlib import Path
import time
import re
import plotly.express as px
import pandas as pd

# --- Configuration ---
MAX_PROMPT_TOKENS_ESTIMATE = 800000  # Estimated token limit for the prompt
RESULTS_PAGE_SIZE = 25

AVAILABLE_ANALYSES = {
    "generate_docs": "Generate Missing Docstrings/Comments",
    "find_bugs": "Identify Potential Bugs & Anti-patterns",
    "check_style": "Check Style Guide Compliance (General)",
    "summarize_modules": "Summarize Complex Modules/Files",
    "suggest_refactoring": "Suggest Refactoring Opportunities",
}

CODE_EXTENSIONS = {
    '.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb', '.php',
    '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql',
}

# --- Session State Initialization ---
if 'mock_api_call' not in st.session_state:
    st.session_state.mock_api_call = False
if 'analysis_results' not in st.session_state:
    st.session_state.analysis_results = None
if 'error_message' not in st.session_state:
    st.session_state.error_message = None
if 'analysis_requested' not in st.session_state:
    st.session_state.analysis_requested = False
if 'selected_model_name' not in st.session_state:
    st.session_state.selected_model_name = None  # Holds the internal model name
if 'available_models_dict' not in st.session_state:
    st.session_state.available_models_dict = {}  # Mapping: display_name -> internal name

# --- Gemini API Setup & Model Discovery ---
model = None  # Global variable for the initialized model instance


@st.cache_data(ttl=3600)
def get_available_models():
    """Lists models supporting 'generateContent' using the API key."""
    model_dict = {}
    try:
        if 'GEMINI_API_KEY' not in st.secrets:
            print("API key not found in secrets during model listing attempt.")
            return {}
        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
        print("Listing available models via API...")
        for m in genai.list_models():
            if 'generateContent' in m.supported_generation_methods:
                model_dict[m.display_name] = m.name
        print(f"Found {len(model_dict)} compatible models.")
        return model_dict
    except Exception as e:
        st.error(f"🚨 Error listing available models: {e}")
        return {}


def initialize_gemini_model():
    """Initializes the Gemini model matching the currently selected name."""
    global model
    selected_name = st.session_state.get('selected_model_name')
    if selected_name and model is None and not st.session_state.mock_api_call:
        try:
            if 'GEMINI_API_KEY' not in st.secrets:
                st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
                st.stop()
            genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
            print(f"Initializing Gemini Model: {selected_name}")
            model = genai.GenerativeModel(model_name=selected_name)
            print(f"Gemini Model Initialized ({selected_name}).")
            return True
        except Exception as e:
            st.error(f"🚨 Error initializing selected Gemini model '{selected_name}': {e}")
            st.session_state.selected_model_name = None
            st.stop()
            return False
    elif st.session_state.mock_api_call:
        return True  # Mock mode needs no real model
    elif model is not None and model.model_name == selected_name:
        return True  # Correct model already initialized
    elif model is not None and model.model_name != selected_name:
        print("Model changed. Re-initializing...")
        model = None
        return initialize_gemini_model()
    return False  # No model selected and not in mock mode


# --- Helper Functions ---
def estimate_token_count(text):
    """
    Estimates the token count (~3 characters per token).

    If a string is provided, the estimate is based on its length.
    If an integer (a total character count) is provided, it is used directly.
    """
    if isinstance(text, int):
        return text // 3
    return len(text) // 3
@st.cache_data(max_entries=5)
def process_zip_file_cached(file_id, file_size, file_content_bytes):
    """
    Processes a ZIP file and extracts code files.

    Returns (code_files dict, total_chars, file_count, ignored_files list).
    """
    code_files = {}
    total_chars = 0
    file_count = 0
    ignored_files = []
    status_placeholder = st.empty()
    progress_bar = status_placeholder.progress(0)
    try:
        with zipfile.ZipFile(io.BytesIO(file_content_bytes), 'r') as zip_ref:
            members = zip_ref.infolist()
            total_members = len(members)
            for i, member in enumerate(members):
                if i % 10 == 0:
                    progress_bar.progress(int((i / total_members) * 100))
                # Skip directories, hidden paths, and dunder paths (e.g. __pycache__).
                if member.is_dir() or any(p.startswith('.') for p in Path(member.filename).parts) or '__' in member.filename:
                    continue
                file_path = Path(member.filename)
                if file_path.suffix.lower() in CODE_EXTENSIONS:
                    try:
                        with zip_ref.open(member) as file:
                            file_bytes = file.read()
                            try:
                                content = file_bytes.decode('utf-8')
                            except UnicodeDecodeError:
                                # Fall back to latin-1 for non-UTF-8 files.
                                try:
                                    content = file_bytes.decode('latin-1')
                                except Exception as decode_err:
                                    ignored_files.append(f"{member.filename} (Decode Error: {decode_err})")
                                    continue
                            code_files[member.filename] = content
                            total_chars += len(content)
                            file_count += 1
                    except Exception as read_err:
                        ignored_files.append(f"{member.filename} (Read Error: {read_err})")
                else:
                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
            progress_bar.progress(100)
            status_placeholder.empty()
    except zipfile.BadZipFile:
        status_placeholder.empty()
        st.error("🚨 Invalid ZIP file.")
        return None, 0, 0, []
    except Exception as e:
        status_placeholder.empty()
        st.error(f"🚨 ZIP processing error: {e}")
        return None, 0, 0, []
    if file_count == 0:
        if not ignored_files:
            st.warning("No code files found in the archive.")
        else:
            st.warning("No code files found; some entries were skipped.")
    return code_files, total_chars, file_count, ignored_files


def construct_analysis_prompt(code_files_dict, requested_analyses):
    """
    Constructs the analysis prompt from the code files and structured instructions.

    The prompt requests detailed feedback, including line references, severity,
    and recommended fixes. Returns the full prompt and the list of included files.
    """
    prompt_parts = [
        "You are a highly skilled code auditor. Analyze the following codebase in detail.\n",
        "For each issue, provide:\n",
        " - A short summary with line references (or approximate line references).\n",
        " - A severity level (Low, Medium, High).\n",
        " - A recommended fix or code snippet if applicable.\n\n",
        "Here is the code:\n\n"
    ]
    current_token_estimate = estimate_token_count("".join(prompt_parts))
    included_files = []
    code_segments = []
    for filename, content in code_files_dict.items():
        segment = f"--- START FILE: {filename} ---\n{content}\n--- END FILE: {filename} ---\n\n"
        segment_token_estimate = estimate_token_count(segment)
        if current_token_estimate + segment_token_estimate <= MAX_PROMPT_TOKENS_ESTIMATE:
            code_segments.append(segment)
            current_token_estimate += segment_token_estimate
            included_files.append(filename)
        else:
            st.warning(f"⚠️ Exceeded context limit after {len(included_files)} files.")
            break
    if not included_files:
        st.error("🚨 No code files could be included in the prompt.")
        return None, []
    prompt_parts.append("".join(code_segments))
    prompt_parts.append("\n\nYour tasks are:\n")
    if "generate_docs" in requested_analyses:
        prompt_parts.append(
            "1) Generate missing docstrings/comments using PEP 257 style. "
            "Provide recommended text and line references.\n"
        )
    if "find_bugs" in requested_analyses:
        prompt_parts.append(
            "2) Identify potential bugs & anti-patterns. For each, include severity, "
            "line references, and a recommended fix.\n"
        )
    if "check_style" in requested_analyses:
        prompt_parts.append(
            "3) Check style guide compliance (PEP 8 or similar). Include line references, "
            "severity, and suggested changes.\n"
        )
    if "summarize_modules" in requested_analyses:
        prompt_parts.append(
            "4) Summarize each module/file by describing its primary responsibilities.\n"
        )
    if "suggest_refactoring" in requested_analyses:
        prompt_parts.append(
            "5) Suggest refactoring opportunities with code snippets and justification, "
            "including line references.\n"
        )
    prompt_parts.append(
        "\nFormat your response in valid JSON with the following structure:\n"
        "{\n"
        "  \"documentation_suggestions\": [ {\"file\": \"...\", \"line\": \"...\", \"summary\": \"...\", \"severity\": \"Low|Medium|High\", \"suggestion\": \"...\"}, ... ],\n"
        "  \"potential_bugs\": [ {\"file\": \"...\", \"line\": \"...\", \"summary\": \"...\", \"severity\": \"Low|Medium|High\", \"suggestion\": \"...\"}, ... ],\n"
        "  \"style_issues\": [ ... ],\n"
        "  \"module_summaries\": [ {\"file\": \"...\", \"summary\": \"...\"}, ... ],\n"
        "  \"refactoring_suggestions\": [ {\"file\": \"...\", \"line\": \"...\", \"area\": \"...\", \"summary\": \"...\", \"suggestion\": \"...\"}, ... ]\n"
        "}\n"
        "Only output valid JSON (no markdown formatting)!\n"
    )
    return "".join(prompt_parts), included_files


def extract_json_from_text(text):
    """
    Attempts to extract a balanced JSON object from the given text.

    Looks for the first '{' and returns the substring up to the point where
    the braces are balanced again.
    """
    start = text.find('{')
    if start == -1:
        return None
    count = 0
    for i in range(start, len(text)):
        if text[i] == '{':
            count += 1
        elif text[i] == '}':
            count -= 1
            if count == 0:
                return text[start:i + 1]
    return None


def fix_json(text):
    """
    Attempts to fix common JSON formatting issues: trailing commas and
    missing commas between adjacent objects.
    """
    # Remove trailing commas before a closing brace or bracket.
    text = re.sub(r',\s*([}\]])', r'\1', text)
    # Insert commas between adjacent objects if missing.
    text = re.sub(r'}\s*{', '},{', text)
    return text
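# Worked examples of the two JSON repair helpers above:
#   extract_json_from_text('Report: {"k": [1, 2]} -- end')  ->  '{"k": [1, 2]}'
#   fix_json('{"a": 1,}')                                   ->  '{"a": 1}'
#   fix_json('[{"a": 1} {"b": 2}]')                         ->  '[{"a": 1},{"b": 2}]'
# Caveat: extract_json_from_text counts braces inside string values too, so a
# literal '{' or '}' inside a JSON string can throw off the balance.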
def call_gemini_api(prompt):
    """
    Calls the Gemini API with the provided prompt.

    Returns (parsed JSON insights, None) on success, or (fallback, error message).
    """
    if not prompt:
        return None, "Prompt generation failed."
    if st.session_state.mock_api_call:
        st.info("MOCK MODE: Simulating API call...")
        time.sleep(1)
        mock_json_response = json.dumps({
            "documentation_suggestions": [],
            "potential_bugs": [],
            "style_issues": [],
            "module_summaries": [],
            "refactoring_suggestions": []
        })
        st.success("Mock response generated.")
        return json.loads(mock_json_response), None

    if not initialize_gemini_model():
        return None, "Gemini Model Initialization Failed."
    if model is None:
        return None, "Gemini model not selected or available."
    try:
        api_status = st.empty()
        api_status.info(
            f"📡 Sending request to {model.model_name} "
            f"(Est. prompt tokens: {estimate_token_count(prompt):,})... Please wait."
        )
        start_time = time.time()
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.2),
            safety_settings=[
                {"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
                for c in [
                    "HARM_CATEGORY_HARASSMENT",
                    "HARM_CATEGORY_HATE_SPEECH",
                    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                    "HARM_CATEGORY_DANGEROUS_CONTENT",
                ]
            ]
        )
        end_time = time.time()
        api_status.success(f"✅ Response received from AI ({model.model_name}) in {end_time - start_time:.2f}s.")
        time.sleep(1)
        api_status.empty()
        # Remove markdown code fences if present.
        json_response_text = response.text.strip().replace("```json", "").replace("```", "")
        # First, attempt regex extraction.
        match = re.search(r'({.*})', json_response_text, re.DOTALL)
        if match:
            final_json_text = match.group(1)
        else:
            # Fallback: extract using balanced braces.
            final_json_text = extract_json_from_text(json_response_text)
        if not final_json_text:
            st.warning("⚠️ Could not extract a valid JSON object.")
            return {"raw_response": response.text}, "AI response did not contain a clear JSON object."
        try:
            insights = json.loads(final_json_text)
            return insights, None
        except json.JSONDecodeError:
            # Attempt to repair the JSON string and parse again.
            fixed_text = fix_json(final_json_text)
            try:
                insights = json.loads(fixed_text)
                return insights, None
            except json.JSONDecodeError as json_err:
                st.error(f"🚨 Error parsing JSON after fix attempt: {json_err}")
                st.code(response.text, language='text')
                return None, f"AI response not valid JSON: {json_err}"
    except AttributeError:
        st.error("🚨 Unexpected API response structure (AttributeError).")
        st.code(f"Response object: {response}", language='text')
        return None, "Unexpected response structure (AttributeError)."
    except Exception as e:
        st.error(f"🚨 Unexpected issue processing response: {e}")
        try:
            st.code(f"Response object: {response}", language='text')
        except Exception:
            pass
        return None, f"Unexpected response structure: {e}"


def display_results(results_json, requested_analyses):
    """
    Displays the analysis results with pagination and a JSON download option.
    """
    st.header("📊 Analysis Report")
    if not isinstance(results_json, dict):
        st.error("Invalid results format.")
        st.json(results_json)
        return
    if "raw_response" in results_json:
        st.subheader("Raw AI Response (JSON Parsing Failed)")
        st.code(results_json["raw_response"], language='text')
        return
    display_config = {
        "generate_docs": {
            "key": "documentation_suggestions",
            "title": AVAILABLE_ANALYSES["generate_docs"],
            "fields": {"file": "File", "line": "Line"}
        },
        "find_bugs": {
            "key": "potential_bugs",
            "title": AVAILABLE_ANALYSES["find_bugs"],
            "fields": {"file": "File", "line": "Line", "severity": "Severity"}
        },
        "check_style": {
            "key": "style_issues",
            "title": AVAILABLE_ANALYSES["check_style"],
            "fields": {"file": "File", "line": "Line"}
        },
        "summarize_modules": {
            "key": "module_summaries",
            "title": AVAILABLE_ANALYSES["summarize_modules"],
            "fields": {"file": "File"}
        },
        "suggest_refactoring": {
            "key": "refactoring_suggestions",
            "title": AVAILABLE_ANALYSES["suggest_refactoring"],
            "fields": {"file": "File", "line": "Line", "area": "Area"}
        },
    }
    any_results_found = False
    for analysis_key in requested_analyses:
        if analysis_key not in display_config:
            continue
        config = display_config[analysis_key]
        items = results_json.get(config["key"], [])
        total_items = len(items)
        st.subheader(f"{config['title']} ({total_items} found)")
        if items:
            any_results_found = True
            # Track how many items of this category are currently visible.
            state_key = f"visible_{analysis_key}"
            if state_key not in st.session_state:
                st.session_state[state_key] = RESULTS_PAGE_SIZE
            visible_count = st.session_state[state_key]
            for item in items[:visible_count]:
                details = [
                    f"**{field_label}:** `{item.get(field_key, 'N/A')}`" if field_key == 'file'
                    else f"**{field_label}:** {item.get(field_key, 'N/A')}"
                    for field_key, field_label in config["fields"].items()
                    if item.get(field_key, 'N/A') != 'N/A'
                ]
                st.markdown("- " + " - ".join(details))
                if 'suggestion' in item:
                    st.code(item['suggestion'], language='text')
                elif 'description' in item:
                    st.markdown(f"  > {item['description']}")
                elif 'summary' in item:
                    st.markdown(f"  > {item['summary']}")
            if total_items > visible_count:
                if st.button(f"Show more ({total_items - visible_count} remaining)", key=f"more_{analysis_key}"):
                    st.session_state[state_key] += RESULTS_PAGE_SIZE
                    st.rerun()
        else:
            st.markdown("_No items found for this category._")
        st.divider()
    if not any_results_found:
        st.info("No specific findings were identified.")
    st.download_button(
        label="Download Full Report (JSON)",
        data=json.dumps(results_json, indent=4),
        file_name="code_audit_report.json",
        mime="application/json"
    )
# --- Streamlit App Main Interface ---
st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")
st.title("🤖 Codebase Audit & Documentation Assistant")

# --- Sidebar ---
with st.sidebar:
    # Dark mode toggle. The CSS below is a minimal placeholder override
    # (the original stylesheet was not included in this file).
    dark_mode = st.checkbox("Enable Dark Mode", value=False)
    if dark_mode:
        st.markdown(
            """
            <style>
            .stApp { background-color: #0e1117; color: #fafafa; }
            </style>
            """,
            unsafe_allow_html=True
        )
    st.header("⚙️ Analysis Controls")
    st.session_state.mock_api_call = st.toggle(
        "🧪 Enable Mock API Mode",
        value=st.session_state.mock_api_call,
        help="Use fake data instead of calling the Gemini API."
    )
    st.divider()
    st.header("♊ Select Model")
    if not st.session_state.mock_api_call:
        st.session_state.available_models_dict = get_available_models()
        model_display_names = list(st.session_state.available_models_dict.keys())
        if model_display_names:
            # Restore the previously selected model, if it is still listed.
            current_model_display_name = None
            if st.session_state.selected_model_name:
                for disp_name, internal_name in st.session_state.available_models_dict.items():
                    if internal_name == st.session_state.selected_model_name:
                        current_model_display_name = disp_name
                        break
            selected_index = (
                model_display_names.index(current_model_display_name)
                if current_model_display_name in model_display_names else 0
            )
            selected_display_name = st.selectbox(
                "Choose Gemini model:",
                options=model_display_names,
                index=selected_index,
                key="model_selector",
                help="Select the Gemini model to use for analysis."
            )
            st.session_state.selected_model_name = st.session_state.available_models_dict.get(selected_display_name)
            st.info(f"Using REAL Gemini API ({st.session_state.selected_model_name})")
        elif 'GEMINI_API_KEY' in st.secrets:
            st.warning("No compatible models found or error listing models. Check API key permissions.")
            st.session_state.selected_model_name = None
        else:
            st.warning("Add GEMINI_API_KEY to secrets to list models.")
            st.session_state.selected_model_name = None
    else:
        st.info("Mock API Mode ACTIVE")
        st.session_state.selected_model_name = "mock_model"
    st.divider()
    st.header("🔎 Select Analyses")
    selected_analyses = [
        key for key, name in AVAILABLE_ANALYSES.items()
        if st.checkbox(name, value=True, key=f"cb_{key}")
    ]
    st.divider()
    st.header("📄 How To Use")
    st.info(
        "1. Set API Key.\n"
        "2. Toggle Mock Mode if needed.\n"
        "3. Select Model (if not Mock).\n"
        "4. Select analyses.\n"
        "5. Upload ZIP.\n"
        "6. Click 'Analyze'.\n"
        "7. Review report."
    )
    st.info(f"Note: Prompt size is limited to ~{MAX_PROMPT_TOKENS_ESTIMATE:,} estimated tokens.")
    st.divider()
    st.warning("⚠️ **Privacy:** Code is sent to the Google API when Mock Mode is OFF.")

# Update the subtitle dynamically based on the selected model.
if st.session_state.selected_model_name and not st.session_state.mock_api_call:
    st.markdown(f"Upload codebase (`.zip`) for analysis via **{st.session_state.selected_model_name}**.")
elif st.session_state.mock_api_call:
    st.markdown("Upload codebase (`.zip`) for analysis (Using **Mock Data**).")
else:
    st.markdown("Upload codebase (`.zip`) for analysis.")

# --- Main Content Area ---
uploaded_file = st.file_uploader(
    "📁 Upload Codebase ZIP File",
    type=['zip'],
    key="file_uploader",
    # Reset any previous results whenever a new file is chosen.
    on_change=lambda: st.session_state.update(
        analysis_results=None,
        error_message=None,
        analysis_requested=False
    )
)
analysis_button_placeholder = st.empty()
results_placeholder = st.container()
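# Streamlit re-executes this script top-to-bottom on every interaction; the
# session-state flags set above (analysis_requested, analysis_results,
# error_message) carry results across those reruns, and the file_uploader's
# on_change callback clears them whenever a new ZIP is chosen.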
if uploaded_file:
    st.success(f"✅ File '{uploaded_file.name}' uploaded.")
    uploaded_file_bytes = uploaded_file.getvalue()
    file_id = f"{uploaded_file.name}-{uploaded_file.size}"
    code_files, total_chars, file_count, ignored_files = process_zip_file_cached(
        file_id, uploaded_file.size, uploaded_file_bytes
    )
    if code_files is not None:
        st.info(
            f"Found **{file_count}** code files ({total_chars:,} chars). "
            f"Est. tokens: ~{estimate_token_count(total_chars):,}"
        )
        # --- Interactive Metrics Visualization ---
        metrics = {
            "Metric": ["Files Analyzed", "Total Characters", "Token Estimate", "Ignored Files"],
            "Value": [file_count, total_chars, estimate_token_count(total_chars), len(ignored_files)]
        }
        df_metrics = pd.DataFrame(metrics)
        fig = px.bar(df_metrics, x="Metric", y="Value", title="Upload Summary Metrics")
        st.plotly_chart(fig)
        if ignored_files:
            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
                st.code("\n".join(ignored_files), language='text')

        model_ready = bool(st.session_state.selected_model_name) or st.session_state.mock_api_call
        analyze_button_disabled = (not selected_analyses or file_count == 0 or not model_ready)
        analyze_button_label = "Analyze Codebase"
        if not model_ready:
            analyze_button_label = "Select Model First"
        elif analyze_button_disabled:
            analyze_button_label = "Select Analyses or Upload Valid Code"

        if analysis_button_placeholder.button(
            analyze_button_label,
            type="primary",
            disabled=analyze_button_disabled
        ):
            st.session_state.analysis_requested = True
            st.session_state.analysis_results = None
            st.session_state.error_message = None
            if not selected_analyses:
                st.warning("Please select analysis types.")
            elif file_count == 0:
                st.warning("No relevant code files found.")
            elif not model_ready:
                st.warning("Please select a Gemini model from the sidebar.")
            else:
                with results_placeholder:
                    spinner_model_name = (
                        st.session_state.selected_model_name
                        if not st.session_state.mock_api_call else "Mock Mode"
                    )
                    spinner_msg = f"🚀 Preparing prompt & contacting AI ({spinner_model_name})... Please wait."
                    with st.spinner(spinner_msg):
                        analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)
                        if analysis_prompt and included_files_in_prompt:
                            results_json, error_msg = call_gemini_api(analysis_prompt)
                            st.session_state.analysis_results = results_json
                            st.session_state.error_message = error_msg
                        elif not included_files_in_prompt:
                            st.session_state.error_message = "Could not proceed: No files included."
                        else:
                            st.session_state.error_message = "Failed to generate analysis prompt."
            st.rerun()

if st.session_state.analysis_requested:
    with results_placeholder:
        st.divider()
        if st.session_state.error_message:
            st.error(f"Analysis Failed: {st.session_state.error_message}")
            if isinstance(st.session_state.analysis_results, dict) and "raw_response" in st.session_state.analysis_results:
                st.subheader("Raw AI Response")
                st.code(st.session_state.analysis_results["raw_response"], language='text')
        elif st.session_state.analysis_results:
            display_results(st.session_state.analysis_results, selected_analyses)
        else:
            st.info("Analysis initiated, but no results/errors stored.")
elif not uploaded_file:
    results_placeholder.info("Upload a ZIP file to begin.")

results_placeholder.divider()
results_placeholder.markdown("_Assistant powered by Google Gemini._")
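# --- Usage (assumed local setup) ---
# Dependencies:  pip install streamlit google-generativeai plotly pandas
# API key:       put GEMINI_API_KEY = "..." in .streamlit/secrets.toml
# Run the app:   streamlit run app.py   (replace app.py with this file's name)
# For a dry run without network calls, enable "Mock API Mode" in the sidebar.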