Spaces:

mgbam
/

AuditXCodeInsights

Sleeping

App Files Files Community

AuditXCodeInsights / app.py

mgbam

Update app.py

3a80282 verified 3 months ago

raw

history blame

23.7 kB

	import streamlit as st
	import google.generativeai as genai
	import zipfile
	import io
	import json
	import os
	from pathlib import Path
	import time # Added for simulating mock delay

	# --- Configuration ---
	# Identifier handed to genai.GenerativeModel when the real API is used.
	GEMINI_MODEL_NAME = "gemini-2.5-pro-preview-03-25"
	# Ceiling on the *estimated* prompt size; files past it are left out of the prompt.
	MAX_PROMPT_TOKENS_ESTIMATE = 800_000  # Adjust as needed

	# Analysis key -> label shown next to the matching sidebar checkbox.
	AVAILABLE_ANALYSES = dict(
	    generate_docs="Generate Missing Docstrings/Comments",
	    find_bugs="Identify Potential Bugs & Anti-patterns",
	    check_style="Check Style Guide Compliance (General)",
	    summarize_modules="Summarize Complex Modules/Files",
	    suggest_refactoring="Suggest Refactoring Opportunities",
	)

	# File suffixes (lower-case, with the dot) that are treated as source code.
	CODE_EXTENSIONS = {
	    '.py', '.js', '.java', '.c', '.cpp', '.h', '.cs', '.go', '.rb',
	    '.php', '.swift', '.kt', '.ts', '.html', '.css', '.scss', '.sql',
	}

	# --- Session State Initialization ---
	# Seed the mock-mode flag only when it is missing, so an existing value is kept.
	if "mock_api_call" not in st.session_state:
	    st.session_state["mock_api_call"] = False  # Default to using the real API

	# --- Gemini API Setup ---
	# Created lazily so that mock mode can run without touching the SDK or the API key.
	model = None


	def initialize_gemini_model():
	    """Ensure the module-level Gemini ``model`` is ready for use.

	    Returns:
	        True when mock mode is active (no model is required), when the model
	        already exists, or when this call created it. On failure the error is
	        reported with ``st.error`` and ``st.stop()`` is called; the
	        ``return False`` after it only matters if ``st.stop()`` returns.
	    """
	    global model

	    # Mock mode wins over everything else: nothing to initialize.
	    if st.session_state.mock_api_call:
	        print("Running in Mock Mode. Skipping Gemini initialization.")
	        return True  # Allow proceeding in mock mode

	    if model is not None:
	        print("Gemini Model already initialized.")
	        return True

	    # Real API requested and no model yet: configure the SDK from secrets.
	    try:
	        if 'GEMINI_API_KEY' not in st.secrets:
	            st.error("🚨 Gemini API Key not found. Add it to `.streamlit/secrets.toml`.")
	            st.stop()
	        genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
	        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
	        print("Gemini Model Initialized.")
	        return True
	    except Exception as e:
	        st.error(f"🚨 Error initializing Gemini SDK: {e}")
	        st.stop()
	        return False


	# --- Helper Functions ---

	def estimate_token_count(text):
	    """Roughly estimate a token count, assuming about three characters per token.

	    Args:
	        text: The text to measure, or an already-known character count (``int``)
	            for callers that only track a total length.

	    Returns:
	        The character count floor-divided by 3.
	    """
	    # Accept a plain character count too; calling len() on an int raises TypeError.
	    char_count = text if isinstance(text, int) else len(text)
	    return char_count // 3

	def process_zip_file(uploaded_file):
	    """Extract the text of recognised code files from an uploaded ZIP archive.

	    Directory entries, entries with any path component starting with ``.`` and
	    entries whose name contains ``__`` are skipped silently. Remaining entries
	    whose suffix is not in ``CODE_EXTENSIONS`` are recorded as ignored.

	    Args:
	        uploaded_file: Object whose ``getvalue()`` returns the archive bytes.

	    Returns:
	        ``(code_files, total_chars, file_count, ignored_files)`` where
	        ``code_files`` maps archive member names to decoded text and
	        ``ignored_files`` is a list of human-readable skip reasons. When the
	        archive cannot be processed, an error is shown with ``st.error`` and
	        ``(None, 0, 0, [])`` is returned.
	    """
	    code_files = {}
	    total_chars = 0
	    file_count = 0
	    ignored_files = []

	    try:
	        with zipfile.ZipFile(io.BytesIO(uploaded_file.getvalue()), 'r') as zip_ref:
	            for member in zip_ref.infolist():
	                file_path = Path(member.filename)
	                is_hidden = any(part.startswith('.') for part in file_path.parts)
	                # NOTE(review): the '__' rule also drops files such as
	                # __init__.py, not only __pycache__/__MACOSX — confirm intent.
	                if member.is_dir() or is_hidden or '__' in member.filename:
	                    continue

	                if file_path.suffix.lower() not in CODE_EXTENSIONS:
	                    ignored_files.append(f"{member.filename} (Skipped Extension: {file_path.suffix})")
	                    continue

	                try:
	                    # Read the bytes exactly once: a second read() on the same
	                    # stream returns b"", which made the fallback decode yield "".
	                    raw_bytes = zip_ref.read(member)
	                except Exception as read_err:
	                    ignored_files.append(f"{member.filename} (Read Error: {read_err})")
	                    continue

	                try:
	                    content = raw_bytes.decode('utf-8')
	                except UnicodeDecodeError:
	                    # latin-1 maps every byte value, so this fallback cannot fail.
	                    content = raw_bytes.decode('latin-1')

	                code_files[member.filename] = content
	                total_chars += len(content)
	                file_count += 1

	    except zipfile.BadZipFile:
	        st.error("🚨 Invalid or corrupted ZIP file.")
	        return None, 0, 0, []
	    except Exception as e:
	        st.error(f"🚨 Error processing ZIP file: {e}")
	        return None, 0, 0, []

	    return code_files, total_chars, file_count, ignored_files

	def construct_analysis_prompt(code_files_dict, requested_analyses):
	    """Assemble the Gemini prompt: intro, file dump, task statement and JSON schema.

	    Files are appended in dictionary order until the next one would push the
	    running token estimate past ``MAX_PROMPT_TOKENS_ESTIMATE``; at that point a
	    warning is shown and the remaining files are left out.

	    Args:
	        code_files_dict: Mapping of file path to file content.
	        requested_analyses: Collection of analysis keys; decides which sections
	            appear in the requested JSON structure.

	    Returns:
	        ``(full_prompt, included_files)``, or ``(None, [])`` when not a single
	        file fits (an error is shown in that case).
	    """
	    intro = "Analyze the following codebase provided as a collection of file paths and their content.\n\n"
	    tokens_so_far = estimate_token_count(intro)
	    included_files = []
	    segments = []

	    for filename, content in code_files_dict.items():
	        segment = (
	            f"--- START FILE: {filename} ---\n"
	            f"{content}\n"
	            f"--- END FILE: {filename} ---\n\n"
	        )
	        segment_tokens = estimate_token_count(segment)

	        # Stop at the first file that does not fit; later files are not tried.
	        if tokens_so_far + segment_tokens > MAX_PROMPT_TOKENS_ESTIMATE:
	            st.warning(f"⚠️ Codebase may exceed context window estimate (~{MAX_PROMPT_TOKENS_ESTIMATE} tokens). Analysis performed only on the first {len(included_files)} files ({tokens_so_far} tokens).")
	            break

	        segments.append(segment)
	        tokens_so_far += segment_tokens
	        included_files.append(filename)

	    if not included_files:
	        st.error("🚨 No code files could be included within the estimated token limit.")
	        return None, []

	    # Schema fragments in their fixed output order, keyed by analysis name.
	    schema_fragments = (
	        ("generate_docs", ' "documentation_suggestions": [{"file": "path/to/file", "line": number, "suggestion": "Suggested docstring/comment"}]'),
	        ("find_bugs", ' "potential_bugs": [{"file": "path/to/file", "line": number, "description": "Description of potential bug/anti-pattern", "severity": "High/Medium/Low"}]'),
	        ("check_style", ' "style_issues": [{"file": "path/to/file", "line": number, "description": "Description of style deviation"}]'),
	        ("summarize_modules", ' "module_summaries": [{"file": "path/to/file", "summary": "One-paragraph summary of the file purpose/functionality"}]'),
	        ("suggest_refactoring", ' "refactoring_suggestions": [{"file": "path/to/file", "line": number, "area": "e.g., function name, class name", "suggestion": "Description of refactoring suggestion"}]'),
	    )
	    structure_parts = [
	        fragment for analysis_key, fragment in schema_fragments
	        if analysis_key in requested_analyses
	    ]
	    json_structure_description = "{\n" + ",\n".join(structure_parts) + "\n}"

	    prompt_footer = f"""
	Analysis Task:
	Perform the analyses corresponding to the keys present in the JSON structure below, based only on the provided code files ({', '.join(included_files)}).

	Output Format:
	Respond ONLY with a single, valid JSON object adhering strictly to the following structure. If no issues/suggestions are found for a category, provide an empty list `[]`. Do not include explanations outside the JSON structure.

	{json_structure_description}

	JSON Output Only:
	"""
	    full_prompt = intro + "".join(segments) + prompt_footer
	    return full_prompt, included_files


	def _extract_json_object_text(raw_text):
	    """Return the span from the first ``{`` to the last ``}`` of *raw_text*, or None.

	    Markdown code fences (```` ```json ```` / ```` ``` ````) contain no braces, so
	    they always fall outside this span and need no separate stripping.
	    """
	    start = raw_text.find('{')
	    end = raw_text.rfind('}')
	    if start == -1 or end < start:
	        return None
	    return raw_text[start:end + 1]


	def _describe_api_error(exc):
	    """Translate an exception raised during the API call into a user-facing message."""
	    safety_msg = "Content blocked due to safety settings. Review input code or adjust safety settings if appropriate."
	    message = getattr(exc, 'message', None)  # Present on google.api_core exceptions
	    # Only inspect ``message`` when it is text; a non-string value would make the
	    # ``in`` checks raise from inside the error handler.
	    if isinstance(message, str):
	        if "429" in message:
	            return "API Quota Exceeded or Rate Limit hit. Check your Google Cloud/AI Studio dashboard."
	        if "API key not valid" in message:
	            return "Invalid Gemini API Key. Please check `.streamlit/secrets.toml`."
	        if "blocked" in message.lower():  # General check for safety blocks
	            return safety_msg
	    # Fallback check, applied whenever nothing above matched.
	    if "block_reason: SAFETY" in str(exc):
	        return safety_msg
	    return f"API call failed: {exc}"


	def call_gemini_api(prompt):
	    """Call the Gemini API, or return canned data when mock mode is enabled.

	    Args:
	        prompt: Full prompt text; a falsy value is treated as a failed prompt build.

	    Returns:
	        ``(insights, error_message)``. On success ``insights`` is the parsed JSON
	        object and ``error_message`` is None. When the response contains no
	        ``{...}`` span, ``insights`` is ``{"raw_response": <text>}`` together with
	        an explanatory message. On any other failure ``insights`` is None.
	    """
	    if not prompt:
	        return None, "Prompt generation failed."

	    # --- MOCK MODE LOGIC ---
	    if st.session_state.mock_api_call:
	        st.info(" MOCK MODE: Simulating API call...")
	        time.sleep(2)  # Simulate network/processing delay

	        # Canned successful response. To exercise other paths during testing,
	        # return (None, "<error text>"), ({"raw_response": "<text>"}, "<message>"),
	        # or a dict whose five lists are all empty.
	        mock_insights = {
	            "documentation_suggestions": [{"file": "mock/core.py", "line": 15, "suggestion": "def process_data(data):\n \"\"\"Processes the input data using mock logic.\"\"\""}],
	            "potential_bugs": [{"file": "mock/utils.py", "line": 22, "description": "Potential division by zero if denominator is not checked.", "severity": "Medium"}],
	            "style_issues": [{"file": "mock/core.py", "line": 5, "description": "Variable 'varName' does not follow snake_case convention."}],
	            "module_summaries": [{"file": "mock/core.py", "summary": "This file contains the core mock processing logic."}, {"file": "mock/utils.py", "summary": "Utility functions for mocking."}],
	            "refactoring_suggestions": [{"file": "mock/utils.py", "line": 30, "area": "calculate_metrics function", "suggestion": "Function is too long (> 50 lines), consider breaking it down."}],
	        }
	        st.success("Mock response generated successfully.")
	        return mock_insights, None  # Return insights, no error
	    # --- END MOCK MODE LOGIC ---

	    # --- REAL API CALL LOGIC ---
	    if not initialize_gemini_model():  # Ensure model is ready
	        return None, "Gemini Model Initialization Failed."
	    if model is None:  # Should not happen if initialize check passed, but safeguard
	        return None, "Gemini model not available."

	    try:
	        st.write(f"📡 Sending request to {GEMINI_MODEL_NAME}...")
	        response = model.generate_content(
	            prompt,
	            generation_config=genai.types.GenerationConfig(temperature=0.2),
	            safety_settings=[
	                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
	                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
	                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
	                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
	            ],
	        )
	        st.write("✅ Response received from AI.")

	        try:
	            json_text = _extract_json_object_text(response.text)
	            if json_text is None:
	                st.warning("⚠️ Could not find valid JSON object boundaries ({...}) in response. Displaying raw text.")
	                return {"raw_response": response.text}, "AI response did not contain clear JSON object, showing raw text."
	            return json.loads(json_text), None

	        except json.JSONDecodeError as json_err:
	            st.error(f"🚨 Error parsing JSON response from AI: {json_err}")
	            st.error("Raw AI Response:")
	            st.code(response.text, language='text')
	            return None, f"AI response was not valid JSON: {json_err}"
	        except AttributeError:
	            # Handle cases where response structure might be different (e.g. blocked)
	            st.error("🚨 Unexpected API response structure.")
	            st.code(f"Response object: {response}", language='text')  # Log the problematic response
	            # Try to get blocked reason if available
	            try:
	                block_reason = response.prompt_feedback.block_reason
	                if block_reason:
	                    return None, f"Content blocked by API. Reason: {block_reason}"
	            except Exception:
	                pass  # Best effort: feedback structure isn't as expected
	            return None, "Unexpected response structure from API."
	        except Exception as e:
	            st.error(f"🚨 Unexpected issue processing AI response: {e}")
	            try:
	                st.code(f"Response object: {response}", language='text')
	            except Exception:
	                pass  # Best effort: showing the response is only a debugging aid
	            return None, f"Unexpected response structure: {e}"

	    except Exception as e:
	        st.error(f"🚨 An error occurred during API call: {e}")
	        return None, _describe_api_error(e)


	def display_results(results_json, requested_analyses):
	    """Render the analysis report in Streamlit.

	    Args:
	        results_json: Parsed model output. Anything that is not a dict is shown
	            as an error; a dict carrying ``"raw_response"`` is shown as raw text.
	        requested_analyses: Analysis keys selected by the user; one section is
	            rendered per key that has a display configuration.
	    """
	    st.header("📊 Analysis Report")

	    if not isinstance(results_json, dict):
	        st.error("Invalid results format received.")
	        st.json(results_json)
	        return

	    if "raw_response" in results_json:
	        st.subheader("Raw AI Response (JSON Parsing Failed)")
	        st.code(results_json["raw_response"], language='text')
	        return

	    def display_list_items(items, fields):
	        """Render one category: a bullet per item followed by its main text."""
	        # The model is asked for lists of objects but may not comply; wrap a
	        # lone value so it is shown once instead of being iterated element-wise.
	        if items and not isinstance(items, list):
	            items = [items]
	        if items:
	            for item in items:
	                if not isinstance(item, dict):
	                    # Off-schema entry: show it verbatim rather than crash on .get().
	                    st.markdown(f"- {item}")
	                    continue
	                details = []
	                for field_key, field_label in fields.items():
	                    value = item.get(field_key, 'N/A')
	                    if value != 'N/A':  # Only show if value exists
	                        details.append(f"{field_label}: {value}")
	                st.markdown("- " + " - ".join(details))
	                # Handle specific multi-line outputs like suggestions/summaries
	                if 'suggestion' in item:
	                    st.code(item['suggestion'], language='text')
	                elif 'description' in item:
	                    st.markdown(f" > {item['description']}")  # Indent description
	                elif 'summary' in item:
	                    st.markdown(f" > {item['summary']}")  # Indent summary
	        else:
	            st.markdown("_No items found for this category._")
	        st.divider()

	    # Map analysis keys to the result key, section title and inline fields.
	    display_config = {
	        "generate_docs": {
	            "key": "documentation_suggestions", "title": AVAILABLE_ANALYSES["generate_docs"],
	            "fields": {"file": "File", "line": "Line"},  # Suggestion shown by st.code
	        },
	        "find_bugs": {
	            "key": "potential_bugs", "title": AVAILABLE_ANALYSES["find_bugs"],
	            "fields": {"file": "File", "line": "Line", "severity": "Severity"},  # Description shown separately
	        },
	        "check_style": {
	            "key": "style_issues", "title": AVAILABLE_ANALYSES["check_style"],
	            "fields": {"file": "File", "line": "Line"},  # Description shown separately
	        },
	        "summarize_modules": {
	            "key": "module_summaries", "title": AVAILABLE_ANALYSES["summarize_modules"],
	            "fields": {"file": "File"},  # Summary shown separately
	        },
	        "suggest_refactoring": {
	            "key": "refactoring_suggestions", "title": AVAILABLE_ANALYSES["suggest_refactoring"],
	            "fields": {"file": "File", "line": "Line", "area": "Area"},  # Suggestion shown separately
	        },
	    }

	    # Iterate and display selected sections
	    any_results = False
	    for analysis_key in requested_analyses:
	        if analysis_key in display_config:
	            config = display_config[analysis_key]
	            st.subheader(config["title"])
	            items = results_json.get(config["key"], [])
	            display_list_items(items, config["fields"])
	            if items:
	                any_results = True

	    if not any_results:
	        st.info("No specific findings were identified in the analysis based on your selections.")

	    # Download button
	    st.download_button(
	        label="Download Full Report (JSON)",
	        data=json.dumps(results_json, indent=4),
	        file_name="code_audit_report.json",
	        mime="application/json",
	    )

	# --- Streamlit App Main Interface ---
	# Top-level script: draws the sidebar controls, accepts a ZIP upload, runs the
	# analysis when the button is clicked and renders the outcome.
	st.set_page_config(page_title="Codebase Audit Assistant", layout="wide")

	st.title("🤖 Codebase Audit & Documentation Assistant")
	st.markdown(f"Upload your codebase (`.zip`) for analysis using {GEMINI_MODEL_NAME}.")

	# Sidebar controls
	with st.sidebar:
	    st.header("⚙️ Analysis Controls")
	    # Mock Mode Toggle
	    st.session_state.mock_api_call = st.toggle(
	        "🧪 Enable Mock API Mode (for Testing)",
	        value=st.session_state.mock_api_call,
	        help="If enabled, uses fake data instead of calling the real Gemini API. Saves cost during testing.",
	    )
	    if st.session_state.mock_api_call:
	        st.info("Mock API Mode ACTIVE")
	    else:
	        st.info("Using REAL Gemini API")

	    st.divider()
	    st.header("🔎 Select Analyses")
	    selected_analyses = []
	    for key, name in AVAILABLE_ANALYSES.items():
	        if st.checkbox(name, value=True, key=f"cb_{key}"):
	            selected_analyses.append(key)

	    st.divider()
	    st.header("📄 How To Use")
	    st.info(
	        "1. Set API Key in `.streamlit/secrets.toml` (if not using Mock Mode).\n"
	        "2. Toggle Mock Mode if needed.\n"
	        "3. Select desired analyses.\n"
	        "4. Create a ZIP archive of your codebase.\n"
	        "5. Upload the `.zip` file.\n"
	        "6. Click 'Analyze Codebase'.\n"
	        "7. Review the report."
	    )
	    # Sorted so the list reads the same on every run (set order is not stable).
	    st.info(f"Note: Only files with common code extensions ({', '.join(sorted(CODE_EXTENSIONS))}) are processed. Analysis might be limited (~{MAX_PROMPT_TOKENS_ESTIMATE:,} est. tokens).")

	    st.divider()
	    st.warning("⚠️ Privacy: Code content is sent to the Google Gemini API if Mock Mode is OFF. Do not upload sensitive code if uncomfortable.")


	# Main content area
	uploaded_file = st.file_uploader("📁 Upload Codebase ZIP File", type=['zip'], key="file_uploader")

	analysis_triggered = False
	results_cache = None  # (results_json, error_message) for the current run only

	if uploaded_file:
	    st.success(f"✅ File '{uploaded_file.name}' uploaded.")

	    with st.spinner("Inspecting ZIP file..."):
	        code_files, total_chars, file_count, ignored_files = process_zip_file(uploaded_file)

	    # code_files is None when ZIP processing failed; the error was already shown.
	    if code_files is not None:
	        # Estimate from the file contents: estimate_token_count expects text,
	        # and total_chars is only a character count.
	        estimated_tokens = sum(estimate_token_count(content) for content in code_files.values())
	        st.info(f"Found {file_count} relevant code files ({total_chars:,} characters). Est. tokens: ~{estimated_tokens:,}")
	        if ignored_files:
	            with st.expander(f"View {len(ignored_files)} Skipped/Ignored Files"):
	                # Use st.code for better formatting of list
	                st.code("\n".join(ignored_files), language='text')

	        analyze_button_disabled = (not selected_analyses or file_count == 0)
	        analyze_button_label = "Analyze Codebase" if not analyze_button_disabled else "Select Analyses or Upload Valid Code"
	        if st.button(analyze_button_label, type="primary", disabled=analyze_button_disabled):
	            analysis_triggered = True

	            # The button is disabled in exactly these two cases, so the
	            # warnings below only act as a safeguard.
	            if not selected_analyses:
	                st.warning("Please select at least one analysis type from the sidebar.")
	            elif file_count == 0:
	                st.warning("No relevant code files found in the ZIP archive to analyze.")
	            else:
	                st.divider()
	                with st.spinner(f"🚀 Preparing prompt & contacting AI ({'Mock Mode' if st.session_state.mock_api_call else GEMINI_MODEL_NAME})... This may take time."):
	                    # 1. Construct Prompt
	                    analysis_prompt, included_files_in_prompt = construct_analysis_prompt(code_files, selected_analyses)

	                    if analysis_prompt and included_files_in_prompt:
	                        st.write(f"Analyzing {len(included_files_in_prompt)} files...")
	                        # 2. Call API (Real or Mock)
	                        results_json, error_message = call_gemini_api(analysis_prompt)
	                        results_cache = (results_json, error_message)  # Store results
	                    elif not included_files_in_prompt:
	                        results_cache = (None, "Could not proceed: No files included in prompt (check token limits/errors).")
	                    else:
	                        results_cache = (None, "Failed to generate analysis prompt.")

	    # Display results outside the button click block if analysis was triggered
	    if analysis_triggered and results_cache:
	        results_json, error_message = results_cache
	        st.divider()
	        if error_message:
	            st.error(f"Analysis Failed: {error_message}")
	            # Display partial results if available (e.g., raw response on JSON error)
	            if results_json and isinstance(results_json, dict) and "raw_response" in results_json:
	                st.subheader("Raw AI Response")
	                st.code(results_json["raw_response"], language='text')

	        elif results_json:
	            display_results(results_json, selected_analyses)
	        else:
	            st.error("Analysis did not return results or an unknown error occurred.")

	else:
	    st.info("Upload a ZIP file containing your source code to begin.")

	st.divider()
	st.markdown("_Assistant powered by Google Gemini._")