diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,4059 +1,1139 @@
-import gradio as gr
-import os
-import aiohttp
-import asyncio
-from git import Repo, GitCommandError, InvalidGitRepositoryError, NoSuchPathError
-from pathlib import Path
-from datetime import datetime, timedelta, timezone
-import shutil
-import json
-import logging
-import re
-from typing import Dict, List, Optional, Tuple, Any
-import subprocess
-import plotly.express as px
-import plotly.graph_objects as go
-import time
-import random
-import pandas as pd
+from flask import Flask, render_template_string, request, jsonify
+import requests
 from collections import Counter
-import string
-from concurrent.futures import ThreadPoolExecutor
-from hdbscan import HDBSCAN
-# --- Required Imports ---
-import hashlib
-from functools import lru_cache
-import threading
-from http.server import HTTPServer, BaseHTTPRequestHandler
-import markdown2
-import websockets
-from websockets.server import WebSocketServerProtocol
-from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionAbortedError, ConnectionResetError, WebSocketException
-import signal # For graceful shutdown
-# ---------------------
-
-# Assuming code_editor is available, e.g., installed via pip or included locally
-try:
-    from code_editor import code_editor
-except ImportError:
-    logging.error("The 'code_editor' Gradio component is not installed or available.")
-    logging.error("Please install it, e.g., 'pip install gradio_code_editor'")
-    def code_editor(*args, **kwargs):
-        logging.warning("Using dummy code_editor. Code editing and collaboration will not function.")
-        # Create a dummy component that looks like a Textbox but is non-interactive
-        return gr.Textbox(label=kwargs.get('label', 'Code Editor (Unavailable)'), interactive=False, value="Error: Code editor component not found. Install 'gradio_code_editor'.", lines=10)
-
-
-# ========== Configuration ==========
-WORKSPACE = Path("./issue_workspace")
-WORKSPACE.mkdir(exist_ok=True)
-GITHUB_API = "https://api.github.com/repos"
-HF_INFERENCE_API = "https://api-inference.huggingface.co/models"
-WEBHOOK_PORT = int(os.environ.get("WEBHOOK_PORT", 8000))
-WS_PORT = int(os.environ.get("WS_PORT", 8001))
-GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Set a higher logging level for libraries that are too verbose, e.g.:
-logging.getLogger("websockets").setLevel(logging.WARNING)
-logging.getLogger("aiohttp").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING) # Might be used by git or other libs
-logging.getLogger("git").setLevel(logging.WARNING)
-logging.getLogger("hdbscan").setLevel(logging.WARNING) # HDBSCAN can be chatty
-
-
-# Use ThreadPoolExecutor for any synchronous blocking operations if needed,
-# but most heavy lifting (API calls, git) is now async.
-executor = ThreadPoolExecutor(max_workers=4)
-
-
-# Example HF models (replace with your actual models)
-# Ensure these models are suitable for the tasks (text generation, embeddings)
-HF_MODELS = {
-    "Mistral-8x7B": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-    "Llama-2-7B-chat": "huggingface/llama-2-7b-chat-hf",
-    "CodeLlama-34B": "codellama/CodeLlama-34b-Instruct-hf",
-    "Gemma-7B-it": "google/gemma-7b-it", # Added another option
-}
-# Embedding model ID - fixed, not user selectable
-HF_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
-
-# Select default models defensively
-DEFAULT_MODEL_KEY = "Mistral-8x7B" if "Mistral-8x7B" in HF_MODELS else (list(HF_MODELS.keys())[0] if HF_MODELS else None)
-DEFAULT_MODEL_ID = HF_MODELS.get(DEFAULT_MODEL_KEY, None)
-
-DEFAULT_IDLE_MODEL_KEY = "Gemma-7B-it" if "Gemma-7B-it" in HF_MODELS else DEFAULT_MODEL_KEY # Prefer a smaller one if available
-DEFAULT_IDLE_MODEL_ID = HF_MODELS.get(DEFAULT_IDLE_MODEL_KEY, DEFAULT_MODEL_ID)
-
-if not HF_MODELS:
-    logger.critical("No HF models configured! AI features will be disabled.")
-elif DEFAULT_MODEL_ID is None:
-    logger.critical(f"Default model key '{DEFAULT_MODEL_KEY}' not found in configured models. AI features may be limited.")
-if DEFAULT_IDLE_MODEL_ID is None:
-    logger.warning(f"Idle model key '{DEFAULT_IDLE_MODEL_KEY}' not found or no models configured. Idle tasks may be disabled or use the default model if available.")
-
-
-# --- Idle State Configuration ---
-STALE_ISSUE_THRESHOLD_DAYS = 30
-MAX_SUMMARY_COMPUTATIONS_PER_CYCLE = 2
-MAX_CONTEXT_COMPUTATIONS_PER_CYCLE = 3
-MAX_MISSING_INFO_COMPUTATIONS_PER_CYCLE = 1
-MAX_ANALYSIS_COMPUTATIONS_PER_CYCLE = 1
-RECLUSTER_THRESHOLD = 5 # Number of significant webhook changes before re-clustering is flagged
-IDLE_PROCESSING_INTERVAL_SECONDS = 60.0 # How often the idle task loop runs
-
-# ========== Placeholder OTCodeEditor Class ==========
-# WARNING: This is a placeholder and DOES NOT implement Operational Transformation.
-# Concurrent edits WILL lead to data loss or inconsistencies.
-class OTCodeEditor:
-    """
-    Placeholder for an Operational Transformation (OT) enabled code editor backend.
-    This implementation is NOT thread-safe and does NOT handle concurrent edits correctly.
-    It merely logs received deltas and maintains a basic revision counter.
-    The actual document state is held client-side by the Gradio code_editor component.
-    A real collaborative editor would require a robust OT backend to manage
-    the authoritative document state and transform operations.
-    """
-    def __init__(self, initial_value: Dict[str, str]):
-        # In a real OT system, this would initialize the document state
-        # For this placeholder, we just store the initial files dict
-        self.files: Dict[str, str] = initial_value.copy()
-        self.revision = 0 # Basic revision counter, not used for OT logic
-        logger.debug(f"OTCodeEditor initialized with files: {list(self.files.keys())}")
-
-    def apply_delta(self, delta: Dict[str, Any]):
-        # VERY basic placeholder: This logs the delta but does NOT perform OT.
-        # It does NOT handle concurrent edits safely.
-        # In a real OT system, this would transform the delta against the current state
-        # and apply it, incrementing the revision based on successful application.
-        logger.warning(f"Placeholder apply_delta called. Delta: {str(delta)[:200]}. "
-                       "WARNING: Full Operational Transformation is NOT implemented. Concurrent edits are UNSAFE.")
-        # The Gradio component holds the actual text state client-side.
- # A real OT backend would use the delta to update its authoritative state. - # We only increment revision for basic tracking visibility if needed. - self.revision += 1 - - def get_content(self) -> Dict[str, str]: - # This is not used by the current Gradio code_editor integration, - # as the component holds the state client-side. - # In a real OT system, this would return the current authoritative document state. - return self.files.copy() - -# ========== Modern Theme ========== -try: - theme = gr.themes.Soft( - primary_hue="violet", - secondary_hue="emerald", - radius_size="lg", - font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"] - ).set( - button_primary_background_fill="linear-gradient(90deg, #8B5CF6 0%, #EC4899 100%)", - button_primary_text_color="white", - block_label_text_size="lg", - block_label_text_weight="600", - block_title_text_size="lg", - block_title_text_weight="800", - panel_background_fill="white", - block_shadow="*shadow_drop_lg", - # Add some more modern touches - input_background_fill="#f9fafb", - input_border_color="#e5e7eb", - input_border_radius="md", - button_secondary_background_fill="#f3f4f6", - button_secondary_text_color="#374151", - button_secondary_border_color="#d1d5db", - table_border_color="#e5e7eb", - table_row_background_even="#f9fafb", - table_row_background_odd="#ffffff", - # Use slightly softer colors for plots if default is too bright - # (This might need specific Plotly config instead of theme) - ) -except AttributeError as e: - logger.warning(f"Could not apply all theme settings (might be Gradio version difference): {e}. Using default Soft theme.") - theme = gr.themes.Soft() - -# Additional UI/UX Enhancements -custom_css = """ -/* Smooth transitions for buttons and inputs */ -button, input, select, textarea { - transition: background-color 0.3s ease, color 0.3s ease, border-color 0.3s ease; -} - -/* Hover effects for buttons */ -button:hover { - filter: brightness(1.1); -} - -/* Focus styles for inputs */ -input:focus, select:focus, textarea:focus { - outline: none; - border-color: #8B5CF6; - box-shadow: 0 0 5px rgba(139, 92, 246, 0.5); -} - -/* Scrollbar styling for scrollable divs */ -#issue_preview_div::-webkit-scrollbar, -#ai_output_md::-webkit-scrollbar { - width: 8px; -} - -#issue_preview_div::-webkit-scrollbar-thumb, -#ai_output_md::-webkit-scrollbar-thumb { - background-color: #8B5CF6; - border-radius: 4px; -} - -/* Responsive layout improvements */ -@media (max-width: 768px) { - #config-panel { - flex-direction: column !important; - } - #issue_list_df { - font-size: 0.9em !important; - } -} - -/* Enhanced table styles */ -.gradio-dataframe-container table { - border-collapse: separate !important; - border-spacing: 0 8px !important; -} - -.gradio-dataframe-container table tr { - background-color: #f9fafb !important; - border-radius: 8px !important; - box-shadow: 0 1px 3px rgba(0,0,0,0.1); -} - -.gradio-dataframe-container table tr:hover { - background-color: #ede9fe !important; - cursor: pointer; -} - -/* Button styles */ -.gr-button { - border-radius: 8px !important; - font-weight: 600 !important; -} - -/* Accordion header styles */ -#ai_tools_accordion > .label { - font-weight: 700 !important; - font-size: 1.1em !important; - color: #6b21a8 !important; -} - -/* Status bar improvements */ -#status_output_txt textarea { - background-color: #f3f4f6 !important; - border-radius: 6px !important; - font-family: 'Fira Code', monospace !important; - font-size: 0.95em !important; - color: #4b5563 !important; -} - -/* Code editor 
improvements */ -#code_editor_component { - border: 1px solid #ddd !important; - border-radius: 8px !important; - box-shadow: 0 2px 6px rgba(139, 92, 246, 0.15); -} - -/* Plot improvements */ -#stats_plot_viz, #analytics_severity_plot { - border-radius: 8px; - box-shadow: 0 2px 8px rgba(0,0,0,0.1); - background-color: white; -} -""" - - -# ========== Enhanced Webhook Handler ========== -class WebhookHandler(BaseHTTPRequestHandler): - manager_instance: Optional['IssueManager'] = None - main_loop: Optional[asyncio.AbstractEventLoop] = None # Store reference to the main asyncio loop - - def do_POST(self): - content_length = int(self.headers.get('Content-Length', 0)) - if content_length == 0: - self.send_response(400) - self.send_header("Content-type", "text/plain") - self.end_headers() - self.wfile.write(b"Empty payload") - logger.warning("Received empty webhook payload.") - return - - try: - payload_bytes = self.rfile.read(content_length) - payload = json.loads(payload_bytes.decode('utf-8')) - except json.JSONDecodeError: - logger.error(f"Invalid JSON payload received: {payload_bytes[:500]}") - self.send_response(400) - self.send_header("Content-type", "text/plain") - self.end_headers() - self.wfile.write(b"Invalid JSON payload") - return - except Exception as e: - logger.error(f"Error reading webhook payload: {e}") - self.send_response(500) - self.end_headers() - return - - event = self.headers.get('X-GitHub-Event') - delivery_id = self.headers.get('X-GitHub-Delivery') - logger.info(f"Received GitHub webhook event: {event} (Delivery ID: {delivery_id})") - - if event == 'issues' and WebhookHandler.manager_instance and WebhookHandler.main_loop: - action = payload.get('action') - logger.info(f"Issue action: {action}") - # Handle common actions that affect issue state or content - if action in ['opened', 'reopened', 'closed', 'assigned', 'unassigned', 'edited', 'labeled', 'unlabeled', 'milestoned', 'demilestoned']: - # Check if the loop is running before scheduling - if WebhookHandler.main_loop.is_running(): - # Schedule the async handler to run in the main event loop - # Use run_coroutine_threadsafe as this is called from a different thread - asyncio.run_coroutine_threadsafe( - WebhookHandler.manager_instance.handle_webhook_event(event, action, payload), - WebhookHandler.main_loop - ) - logger.debug(f"Scheduled webhook processing for action '{action}' in main loop.") - else: - logger.error("Asyncio event loop is not running in the target thread for webhook. 
Cannot process event.") - # Respond with an error as processing couldn't be scheduled - self.send_response(500) - self.send_header("Content-type", "text/plain") - self.end_headers() - self.wfile.write(b"Async processing loop not available.") - return - else: - logger.info(f"Webhook action '{action}' received but not actively handled by current logic.") - elif event == 'ping': - logger.info("Received GitHub webhook ping.") - else: - logger.warning(f"Unhandled event type: {event} or manager/loop not initialized.") - - # Always respond 200 OK for successful receipt, even if action is ignored - self.send_response(200) - self.send_header("Content-type", "text/plain") - self.end_headers() - self.wfile.write(b"OK") - -# ========== AI-Powered Issue Manager ========== -class IssueManager: - def __init__(self): - self.issues: Dict[int, dict] = {} - self.repo_url: Optional[str] = None - self.repo_owner: Optional[str] = None - self.repo_name: Optional[str] = None - self.repo_local_path: Optional[Path] = None - self.repo: Optional[Repo] = None - self.github_token: Optional[str] = None - self.hf_token: Optional[str] = None - self.collaborators: Dict[str, dict] = {} # {client_id: {name: str, status: str}} - self.points: int = 0 # Placeholder for potential gamification - self.severity_rules: Dict[str, List[str]] = { - "Critical": ["critical", "urgent", "security", "crash", "blocker", "p0", "s0"], - "High": ["high", "important", "error", "regression", "major", "p1", "s1"], - "Medium": ["medium", "bug", "performance", "minor", "p2", "s2"], - "Low": ["low", "documentation", "enhancement", "trivial", "feature", "p3", "s3", "chore", "refactor", "question", "help wanted"] - } - self.issue_clusters: Dict[int, List[int]] = {} # {cluster_id: [issue_index_in_list, ...]} - self.issue_list_for_clustering: List[dict] = [] # List of issue dicts used for the last clustering run - self.ws_clients: List[WebSocketServerProtocol] = [] - self.code_editors: Dict[int, OTCodeEditor] = {} # {issue_id: OTCodeEditor_instance} - # Get or create the loop in the thread where the manager is initialized (main thread) - try: - self.main_loop = asyncio.get_running_loop() - logger.debug(f"IssueManager found running asyncio loop: {id(self.main_loop)}") - except RuntimeError: - self.main_loop = asyncio.new_event_loop() - asyncio.set_event_loop(self.main_loop) - logger.debug(f"IssueManager created and set new asyncio loop: {id(self.main_loop)}") - - self.broadcast_task: Optional[asyncio.Task] = None - self.idle_task: Optional[asyncio.Task] = None - - # --- State for Idle Processing Results --- - self.precomputed_context: Dict[int, Dict[str, Any]] = {} # {issue_id: {content: str, files: list, error: str, timestamp: float}} - self.precomputed_summaries: Dict[int, Dict[str, Any]] = {} # {issue_id: {summary: str, error: str, timestamp: float}} - self.precomputed_missing_info: Dict[int, Dict[str, Any]] = {} # {issue_id: {info_needed: str, error: str, timestamp: float}} - self.precomputed_analysis: Dict[int, Dict[str, Any]] = {} # {issue_id: {hypothesis: str, error: str, timestamp: float}} - # self.code_embeddings: Dict[str, List[float]] = {} # Not currently used after clustering - self.potential_duplicates: Dict[int, List[int]] = {} # {issue_id: [duplicate_issue_id, ...]} - self.stale_issues: List[int] = [] # [issue_id, ...] - self.high_priority_candidates: List[int] = [] # [issue_id, ...] 
- self.last_webhook_time: float = time.time() # Track last webhook for potential future use - self.needs_recluster: bool = False - self._webhook_change_count = 0 - - # --- Configuration for Idle Tasks --- - self.idle_processing_interval = IDLE_PROCESSING_INTERVAL_SECONDS - self.max_context_computations_per_cycle = MAX_CONTEXT_COMPUTATIONS_PER_CYCLE - self.max_summary_computations_per_cycle = MAX_SUMMARY_COMPUTATIONS_PER_CYCLE - self.max_missing_info_computations_per_cycle = MAX_MISSING_INFO_COMPUTATIONS_PER_CYCLE - self.max_analysis_computations_per_cycle = MAX_ANALYSIS_COMPUTATIONS_PER_CYCLE - self.stale_issue_threshold_days = STALE_ISSUE_THRESHOLD_DAYS - self.recluster_threshold = RECLUSTER_THRESHOLD - - # Shutdown signals (placeholders, set by main execution block) - self.stop_ws_server = None - self.stop_webhook_server = None - - def start_broadcast_loop(self): - """Starts the periodic broadcast task.""" - # Ensure task is created in the correct loop - if not self.main_loop.is_running(): - logger.error("Cannot start broadcast loop: Main event loop is not running.") - return - - if not self.broadcast_task or self.broadcast_task.done(): - self.broadcast_task = self.main_loop.create_task(self.broadcast_collaboration_status()) - logger.info("Started collaboration status broadcast loop.") - else: - logger.debug("Broadcast loop already running.") - - - def stop_broadcast_loop(self): - """Stops the periodic broadcast task.""" - if self.broadcast_task and not self.broadcast_task.done(): - logger.info("Stopping collaboration status broadcast loop...") - self.broadcast_task.cancel() - # Await the task to finish cancellation in an async context if needed, - # but cancelling is sufficient to signal it to stop. - self.broadcast_task = None # Clear the reference - - - def _get_issue_hash(self, issue_data: Optional[dict]) -> str: - """Generates a hash based on key issue content for caching AI suggestions.""" - if not issue_data: - return "empty_issue_hash" # Handle cases where issue_data is None - - content = f"{issue_data.get('title', '')}{issue_data.get('body', '')}{','.join(issue_data.get('labels',[]))}" - return hashlib.md5(content.encode('utf-8', errors='ignore')).hexdigest() # Use utf-8 and ignore errors - - - @lru_cache(maxsize=100) - async def cached_suggestion(self, issue_hash: str, model_key: str) -> str: - """Retrieves or generates an AI suggestion, using an LRU cache based on issue content hash.""" - logger.debug(f"Checking cache for suggestion: hash={issue_hash}, model={model_key}") - # The cache decorator handles the cache hit/miss logic. - # If it's a miss, the decorated function body is executed. - - # Find the issue data corresponding to the hash - found_issue = None - # This linear scan is inefficient for many issues, but hashes are only for cache keys. - # A dict mapping hashes to issue IDs could be more efficient if this becomes a bottleneck. - # However, issues dict is not huge usually, so this is likely fine. - for issue in self.issues.values(): - if self._get_issue_hash(issue) == issue_hash: - found_issue = issue - break - - if not found_issue: - logger.error(f"Could not find issue data for hash {issue_hash} in current state. Suggestion cannot be generated.") - # Corrected: Return error message here if issue data is missing - return "Error: Issue data for this suggestion request (hash) not found in current state. The issue might have been updated or closed. Please re-select the issue." 
- - if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None: - logger.error(f"Invalid or unconfigured model key requested: {model_key}") - return f"Error: Invalid or unconfigured model key: {model_key}" - - logger.info(f"Cache miss or first request for issue hash {issue_hash}. Requesting suggestion from {model_key}.") - # Call the actual suggestion generation function - return await self.suggest_resolution(found_issue, model_key) - - async def handle_webhook_event(self, event: str, action: str, payload: dict): - """Processes incoming webhook events to update the issue state.""" - logger.info(f"Processing webhook event: {event}, action: {action}") - issue_data = payload.get('issue') - repo_data = payload.get('repository') - - if not issue_data or not repo_data: - logger.warning("Webhook payload missing 'issue' or 'repository' data.") - return - - event_repo_url = repo_data.get('html_url') - # Only process events for the currently loaded repository - # Use .rstrip("/") on both sides for robust comparison - if self.repo_url is None or event_repo_url is None or event_repo_url.rstrip("/") != self.repo_url.rstrip("/"): - logger.info(f"Ignoring webhook event for different repository: {event_repo_url} (Current: {self.repo_url})") - return - - issue_number = issue_data.get('number') - if issue_number is None: # Check explicitly for None - logger.warning("Webhook issue data missing 'number'.") - return - - needs_ui_update = False - significant_change = False # Flag for changes affecting clustering/content/AI caches - - if action == 'closed': - logger.info(f"Webhook: Removing closed issue {issue_number} from active list.") - if issue_number in self.issues: - self.issues.pop(issue_number) - needs_ui_update = True - significant_change = True # Closing is a significant change - # Clean up associated cached/computed data - self.precomputed_context.pop(issue_number, None) - self.precomputed_summaries.pop(issue_number, None) - self.precomputed_missing_info.pop(issue_number, None) - self.precomputed_analysis.pop(issue_number, None) - self.potential_duplicates.pop(issue_number, None) - # Remove from lists if present (use list comprehension for safe removal) - self.stale_issues = [i for i in self.stale_issues if i != issue_number] - self.high_priority_candidates = [i for i in self.high_priority_candidates if i != issue_number] - # Remove the code editor instance for the closed issue - self.code_editors.pop(issue_number, None) - logger.debug(f"Cleaned up state for closed issue {issue_number}.") - else: - logger.debug(f"Webhook: Issue {issue_number} closed, but not found in current active list. 
No state change needed.") - - - elif action in ['opened', 'reopened', 'edited', 'assigned', 'unassigned', 'labeled', 'unlabeled', 'milestoned', 'demilestoned']: - logger.info(f"Webhook: Adding/Updating issue {issue_number} (action: {action}).") - processed_data = self._process_issue_data(issue_data) - - old_issue = self.issues.get(issue_number) - # Check for changes that impact AI suggestions or clustering - if not old_issue or \ - old_issue.get('body') != processed_data.get('body') or \ - old_issue.get('title') != processed_data.get('title') or \ - set(old_issue.get('labels', [])) != set(processed_data.get('labels', [])): - significant_change = True - logger.info(f"Significant change detected for issue {issue_number} (content/labels).") - # Invalidate ALL precomputed AI state on significant edit - self.precomputed_context.pop(issue_number, None) - self.precomputed_summaries.pop(issue_number, None) - self.precomputed_missing_info.pop(issue_number, None) - self.precomputed_analysis.pop(issue_number, None) - # Clear the entire suggestion cache on significant change - self.cached_suggestion.cache_clear() - logger.debug("Cleared suggestion cache due to significant issue change.") - - # Check if state-related fields changed (affecting idle processing lists) - # This check is for logging/debugging, the idle loop re-evaluates lists anyway - if not old_issue or \ - old_issue.get('updated_at') != processed_data.get('updated_at') or \ - old_issue.get('assignee') != processed_data.get('assignee') or \ - set(old_issue.get('labels', [])) != set(processed_data.get('labels', [])) or \ - old_issue.get('state') != processed_data.get('state'): # State change (open/reopened) - logger.debug(f"State-related change detected for issue {issue_number} (update time, assignee, labels, state). Idle loop will re-evaluate.") - - self.issues[issue_number] = processed_data - needs_ui_update = True - else: - logger.info(f"Ignoring webhook action '{action}' for issue {issue_number} (already filtered).") - - # --- Track changes for idle processing --- - if needs_ui_update: - self.last_webhook_time = time.time() - if significant_change: - self._increment_change_counter() - # Rebuild the list used for clustering immediately if a significant change occurred - # This list is a snapshot used by the async clustering task - self.issue_list_for_clustering = list(self.issues.values()) - logger.info("Issue list for clustering updated due to significant webhook change.") - # Broadcast UI update notification - # Schedule this in the main loop using call_soon_threadsafe - if self.main_loop.is_running(): - self.main_loop.call_soon_threadsafe(asyncio.create_task, self.broadcast_issue_update()) - logger.debug("Scheduled issue update broadcast.") - else: - logger.warning("Main loop not running, cannot broadcast issue update.") - - - def _increment_change_counter(self): - """Increments change counter and sets recluster flag if threshold reached.""" - self._webhook_change_count += 1 - logger.debug(f"Significant change detected. Change count: {self._webhook_change_count}/{self.recluster_threshold}") - if self._webhook_change_count >= self.recluster_threshold: - self.needs_recluster = True - logger.info(f"Change threshold ({self.recluster_threshold}) reached. 
Flagging for re-clustering.") - - def _process_issue_data(self, issue_data: dict) -> dict: - """Helper to structure issue data consistently.""" - return { - "id": issue_data.get('number'), # Use .get for safety - "title": issue_data.get('title', 'No Title Provided'), - "body": issue_data.get('body', ''), - "state": issue_data.get('state', 'unknown'), - "labels": sorted([label.get('name', '') for label in issue_data.get('labels', []) if isinstance(label, dict) and label.get('name')]), # Ensure labels are dicts and have name - "assignee": issue_data.get('assignee', {}).get('login') if issue_data.get('assignee') and isinstance(issue_data.get('assignee'), dict) else None, - "url": issue_data.get('html_url', '#'), - "created_at": issue_data.get('created_at'), - "updated_at": issue_data.get('updated_at'), - } - - async def crawl_issues(self, repo_url: str, github_token: Optional[str], hf_token: Optional[str]) -> Tuple[List[List], go.Figure, str, go.Figure]: - """ - Crawls issues, resets state, clones repo, clusters, starts background tasks. - Returns dataframe data, stats plot, status message, and analytics plot. - """ - # Strip whitespace from inputs - repo_url = repo_url.strip() if repo_url else None - github_token = github_token.strip() if github_token else None - hf_token = hf_token.strip() if hf_token else None - - # Define a default empty plot for consistent return type - def get_empty_plot(title="Plot"): - fig = go.Figure() - fig.update_layout(title=title, xaxis={"visible": False}, yaxis={"visible": False}, - annotations=[{"text": "Scan needed.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}], - plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)') - return fig - - if not repo_url or not hf_token: - logger.error("Repository URL and Hugging Face Token are required.") - empty_fig = get_empty_plot("Issue Severity Distribution") - return [], empty_fig, "Error: Repository URL and Hugging Face Token are required.", empty_fig - - logger.info(f"Starting new issue crawl and setup for {repo_url}") - - # --- Reset Manager State --- - # Stop background tasks first - self.stop_idle_processing() - self.stop_broadcast_loop() - - self.issues = {} - # Clear code_editors instances - self.code_editors = {} - self.issue_clusters = {} - self.issue_list_for_clustering = [] - self.cached_suggestion.cache_clear() # Clear AI suggestion cache - self.precomputed_context = {} - self.precomputed_summaries = {} - self.precomputed_missing_info = {} - self.precomputed_analysis = {} - # self.code_embeddings = {} # Not used - self.potential_duplicates = {} - self.stale_issues = [] - self.high_priority_candidates = [] - self.needs_recluster = False - self._webhook_change_count = 0 - self.last_webhook_time = time.time() - self.repo = None # Clear the repo object - self.repo_url = repo_url - self.github_token = github_token - self.hf_token = hf_token - logger.info("Internal state reset for new crawl.") - - # --- Repository Cloning/Updating --- - match = re.match(r"https?://github\.com/([^/]+)/([^/]+)", self.repo_url) - if not match: - logger.error(f"Invalid GitHub URL format: {self.repo_url}") - empty_fig = get_empty_plot("Issue Severity Distribution") - return [], empty_fig, "Error: Invalid GitHub URL format. 
Use https://github.com/owner/repo", empty_fig - self.repo_owner, self.repo_name = match.groups() - self.repo_local_path = WORKSPACE / f"{self.repo_owner}_{self.repo_name}" - - try: - if self.repo_local_path.exists(): - logger.info(f"Attempting to update existing repository clone at {self.repo_local_path}") - try: - self.repo = Repo(self.repo_local_path) - # Ensure the origin remote matches the requested URL - if not self.repo.remotes or 'origin' not in self.repo.remotes: - logger.warning(f"Existing repo at {self.repo_local_path} has no 'origin' remote. Re-cloning.") - if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path) - self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}")) - else: - origin = self.repo.remotes.origin - remote_url = next((u for u in origin.urls), None) # Get first URL - expected_urls = {self.repo_url, self.repo_url + ".git"} - if remote_url not in expected_urls: - logger.warning(f"Existing repo path {self.repo_local_path} has different remote URL ('{remote_url}' vs '{self.repo_url}'). Removing and re-cloning.") - # Remove the directory entirely before re-cloning - if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path) - self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}")) - else: - logger.info("Pulling latest changes...") - # Use a timeout for pull operations - try: - # Fetch first to get latest refs - origin.fetch(progress=lambda op, cur, tot, msg: logger.debug(f"Fetch progress: {msg}"), timeout=120) - # Then pull - origin.pull(progress=lambda op, cur, tot, msg: logger.debug(f"Pull progress: {msg}"), timeout=120) - # Unshallow if necessary - if self.repo.git.rev_parse('--is-shallow-repository').strip() == 'true': - logger.info("Repository is shallow, unshallowing...") - # Use a timeout for unshallow - self.repo.git.fetch('--unshallow', timeout=300) - except GitCommandError as pull_err: - logger.error(f"Git pull/fetch error: {pull_err}. Proceeding with potentially stale local copy.") - except Exception as pull_err: - logger.exception(f"Unexpected error during git pull/fetch: {pull_err}. Proceeding with potentially stale local copy.") - - - except (InvalidGitRepositoryError, NoSuchPathError): - logger.warning(f"Invalid or missing Git repository at {self.repo_local_path}. Re-cloning.") - # Ensure directory is clean before re-cloning - if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path) - self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}")) - except GitCommandError as git_err: - logger.error(f"Git operation error during update: {git_err}. 
Trying to proceed with existing copy, but it might be stale.") - if not self.repo: # If repo object wasn't successfully created before the error - try: self.repo = Repo(self.repo_local_path) - except Exception: logger.error("Failed to even load existing repo after git error.") - except Exception as e: - logger.exception(f"An unexpected error occurred during repository update check: {e}") - # If repo object wasn't successfully created before the error - if not self.repo: - try: self.repo = Repo(self.repo_local_path) - except Exception: logger.error("Failed to even load existing repo after update error.") - - else: - logger.info(f"Cloning repository {self.repo_url} to {self.repo_local_path}") - # Use a timeout for the initial clone - self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}"), timeout=300) - - - logger.info("Repository clone/update process finished.") - if not self.repo: - raise Exception("Repository object could not be initialized after cloning/update.") - - except GitCommandError as e: - logger.error(f"Failed to clone/update repository: {e}") - empty_fig = get_empty_plot("Issue Severity Distribution") - empty_fig.update_layout(annotations=[{"text": "Repo Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, f"Error cloning/updating repository: {e}. Check URL, permissions, and network.", empty_fig - except Exception as e: - logger.exception(f"An unexpected error occurred during repository handling: {e}") - empty_fig = get_empty_plot("Issue Severity Distribution") - empty_fig.update_layout(annotations=[{"text": "Repo Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, f"An unexpected error occurred during repo setup: {e}", empty_fig - - - # --- Issue Fetching --- - api_url = f"{GITHUB_API}/{self.repo_owner}/{self.repo_name}/issues?state=open&per_page=100" - headers = {"Accept": "application/vnd.github.v3+json"} - if github_token: - headers["Authorization"] = f"token {github_token}" - - try: - all_issues_data = [] - page = 1 - logger.info(f"Fetching open issues from GitHub API (repo: {self.repo_owner}/{self.repo_name})...") - async with aiohttp.ClientSession(headers=headers) as session: - while True: - paginated_url = api_url - logger.debug(f"Fetching URL: {paginated_url}") - # Use a timeout for API requests - try: - async with session.get(paginated_url, timeout=30) as response: - rate_limit_remaining = response.headers.get('X-RateLimit-Remaining') - rate_limit_reset = response.headers.get('X-RateLimit-Reset') - logger.debug(f"GitHub API Response Status: {response.status}, RateLimit Remaining: {rate_limit_remaining}, Reset: {rate_limit_reset}") - - if response.status == 403 and rate_limit_remaining == '0': - reset_time = int(rate_limit_reset) if rate_limit_reset else time.time() + 60 - wait_time = max(reset_time - time.time() + 5, 0) # Wait until reset time + a buffer - logger.warning(f"GitHub API rate limit exceeded. 
Waiting until {datetime.fromtimestamp(reset_time).strftime('%H:%M:%S')} ({wait_time:.0f}s)") - await asyncio.sleep(wait_time) - continue # Retry the same page - - response.raise_for_status() # Raise for other 4xx/5xx errors - - issues_page_data = await response.json() - if not issues_page_data: break # No more issues on this page - - logger.info(f"Fetched page {page} with {len(issues_page_data)} items.") - all_issues_data.extend(issues_page_data) - - link_header = response.headers.get('Link') - if link_header and 'rel="next"' in link_header: - # Simple parsing for the next link, more robust parsing might be needed for complex headers - next_url_match = re.search(r'<([^>]+)>;\s*rel="next"', link_header) - if next_url_match: - # The next URL is provided directly, use it - api_url = next_url_match.group(1) - page += 1 # Increment page counter for logging, though not strictly needed for the loop logic now - logger.debug(f"Found next page link: {api_url}") - else: - logger.warning("Link header contains 'rel=\"next\"' but could not parse the URL. Stopping pagination.") - break - else: - logger.debug("No 'next' link found in Link header. Assuming last page.") - break - await asyncio.sleep(0.1) # Small delay between requests - - except asyncio.TimeoutError: - logger.warning(f"GitHub API request timed out for page {page}. Stopping pagination early.") - break # Stop pagination on timeout - except aiohttp.ClientResponseError as e: - # Re-raise client response errors so the outer handler catches them - raise e - except Exception as e: - logger.exception(f"An unexpected error occurred during GitHub API pagination for page {page}. Stopping pagination.") - break # Stop pagination on unexpected error - - - logger.info(f"Total items fetched (including potential PRs): {len(all_issues_data)}") - # Filter out pull requests (issues with 'pull_request' key) - self.issues = { - issue_data['number']: self._process_issue_data(issue_data) - for issue_data in all_issues_data - if 'pull_request' not in issue_data and issue_data.get('number') is not None # Ensure number exists - } +import re - logger.info(f"Filtered out pull requests, {len(self.issues)} actual open issues remaining.") - - empty_fig = get_empty_plot("Issue Severity Distribution") - if not self.issues: - logger.warning("No open issues found for this repository.") - empty_fig.update_layout(annotations=[{"text": "No issues found.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, "No open issues found in the repository.", empty_fig - - # --- Clustering and UI Data Prep --- - self.issue_list_for_clustering = list(self.issues.values()) - logger.info("Clustering issues...") - await self._cluster_similar_issues() - - # --- Initial Idle Task Prep (Run synchronously after load) --- - logger.info("Identifying potential duplicates based on initial clusters...") - self._identify_potential_duplicates() - logger.info("Identifying potentially stale issues...") - self._identify_stale_issues() - logger.info("Identifying high priority candidates...") - self._identify_high_priority_candidates() - - # --- Prepare Dataframe Output & Stats --- - dataframe_data = [] - severity_counts = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0, "Unknown": 0} - index_to_cluster_id: Dict[int, int] = {} - # Map issue index in the clustering list back to its cluster ID - for cluster_id, indices in self.issue_clusters.items(): - for index in indices: - if 0 <= index < len(self.issue_list_for_clustering): - index_to_cluster_id[index] = 
cluster_id - else: - logger.warning(f"Clustering returned invalid index {index} for list of length {len(self.issue_list_for_clustering)}") - - for i, issue in enumerate(self.issue_list_for_clustering): - severity = self._determine_severity(issue.get('labels', [])) # Use get for safety - severity_counts[severity] += 1 - # Get cluster ID using the index from the clustering list - cluster_id = index_to_cluster_id.get(i, -1) - dataframe_data.append([ - issue.get('id', 'N/A'), # Use get for safety - issue.get('title', 'No Title'), # Use get for safety - severity, - cluster_id if cluster_id != -1 else "N/A" # Display "N/A" for noise (-1) - ]) - - logger.info("Generating statistics plot...") - stats_fig = self._generate_stats_plot(severity_counts) - - # --- Start Background Tasks --- - # Ensure tasks are created in the manager's loop - self.start_broadcast_loop() - self.start_idle_processing() - - success_msg = f"Found {len(self.issues)} open issues. Clustered into {len(self.issue_clusters)} groups. Repo ready. Background analysis started." - logger.info(success_msg) - # Return both plots (stats and analytics severity are the same initially) - return dataframe_data, stats_fig, success_msg, stats_fig - - except aiohttp.ClientResponseError as e: - logger.error(f"GitHub API request failed: Status={e.status}, Message='{e.message}', URL='{e.request_info.url}'") - error_msg = f"Error fetching issues: {e.status} - {e.message}. Check token/URL." - if e.status == 404: error_msg = f"Error: Repository not found at {self.repo_url}." - elif e.status == 401: error_msg = "Error: Invalid GitHub token or insufficient permissions for this repository." - elif e.status == 403: - rate_limit_remaining = e.headers.get('X-RateLimit-Remaining') # FIX: Access rate_limit_remaining from error headers - rate_limit_reset = e.headers.get('X-RateLimit-Reset') - reset_time_str = "unknown" - if rate_limit_reset: - try: reset_time_str = datetime.fromtimestamp(int(rate_limit_reset), timezone.utc).strftime('%Y-%m-%d %H:%M:%S %Z') - except ValueError: pass - error_msg = f"Error: GitHub API rate limit likely exceeded or access forbidden (Remaining: {rate_limit_remaining}). Reset time: {reset_time_str}. Check token or wait." 
- self.stop_idle_processing() - self.stop_broadcast_loop() - empty_fig = get_empty_plot("Issue Severity Distribution") - empty_fig.update_layout(annotations=[{"text": "API Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, error_msg, empty_fig - except asyncio.TimeoutError: - logger.error("GitHub API request timed out.") - self.stop_idle_processing() - self.stop_broadcast_loop() - empty_fig = get_empty_plot("Issue Severity Distribution") - empty_fig.update_layout(annotations=[{"text": "API Timeout.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, "Error: GitHub API request timed out.", empty_fig - except Exception as e: - self.stop_idle_processing() - self.stop_broadcast_loop() - logger.exception(f"An unexpected error occurred during issue crawl: {e}") - empty_fig = get_empty_plot("Issue Severity Distribution") - empty_fig.update_layout(annotations=[{"text": "Unexpected Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}]) - return [], empty_fig, f"An unexpected error occurred: {e}", empty_fig - - def _determine_severity(self, labels: List[str]) -> str: - """Determines issue severity based on labels using predefined rules.""" - labels_lower = {label.lower().strip() for label in labels} - for severity, keywords in self.severity_rules.items(): - if any(keyword in label for label in labels_lower for keyword in keywords): - return severity - return "Unknown" - - def _generate_stats_plot(self, severity_counts: Dict[str, int]) -> go.Figure: - """Generates a Plotly bar chart for issue severity distribution.""" - filtered_counts = {k: v for k, v in severity_counts.items() if v > 0} - if not filtered_counts: - fig = go.Figure() - fig.update_layout(title="Issue Severity Distribution", xaxis={"visible": False}, yaxis={"visible": False}, - # FIX: Corrected dictionary syntax for font size - annotations=[{"text": "No issues to display.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}], - plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)') - return fig # Return the empty figure here - - severities = list(filtered_counts.keys()) - counts = list(filtered_counts.values()) - order = ['Critical', 'High', 'Medium', 'Low', 'Unknown'] - # Sort severities based on the predefined order - severities_sorted = sorted(severities, key=lambda x: order.index(x) if x in order else len(order)) - counts_sorted = [filtered_counts[s] for s in severities_sorted] - - fig = px.bar(x=severities_sorted, y=counts_sorted, title="Issue Severity Distribution", - labels={'x': 'Severity', 'y': 'Number of Issues'}, color=severities_sorted, - color_discrete_map={'Critical': '#DC2626', 'High': '#F97316', 'Medium': '#FACC15', 'Low': '#84CC16', 'Unknown': '#6B7280'}, - text=counts_sorted) - fig.update_layout(xaxis_title=None, yaxis_title="Number of Issues", plot_bgcolor='rgba(0,0,0,0)', - paper_bgcolor='rgba(0,0,0,0)', showlegend=False, - xaxis={'categoryorder':'array', 'categoryarray': order}, - yaxis={'rangemode': 'tozero'}) # Ensure y-axis starts at 0 - fig.update_traces(textposition='outside') - return fig - - async def _cluster_similar_issues(self): - """Generates embeddings and clusters issues using HDBSCAN. 
Uses self.issue_list_for_clustering.""" - if not self.issue_list_for_clustering: - logger.warning("Cannot cluster issues: No issues loaded or list is empty.") - self.issue_clusters = {} - self._webhook_change_count = 0 # Reset on empty list - self.needs_recluster = False - return - if not self.hf_token or HF_EMBEDDING_MODEL is None: - logger.error("Cannot cluster issues: Hugging Face token or embedding model missing.") - self.issue_clusters = {} - self._webhook_change_count = 0 # Reset on missing token/model - self.needs_recluster = False - return - - num_issues = len(self.issue_list_for_clustering) - logger.info(f"Generating embeddings for {num_issues} issues for clustering using {HF_EMBEDDING_MODEL}...") - try: - # Use title + a snippet of the body for embedding - texts_to_embed = [ - f"Title: {i.get('title','')} Body: {i.get('body','')[:500]}" # Limit body length - for i in self.issue_list_for_clustering - ] - embeddings = await self._generate_embeddings(texts_to_embed) - - if embeddings is None or not isinstance(embeddings, list) or len(embeddings) != num_issues: - logger.error(f"Failed to generate valid embeddings for clustering. Expected {num_issues}, got {type(embeddings)} len {len(embeddings) if embeddings else 'N/A'}.") - self.issue_clusters = {} - self._webhook_change_count = 0 # Reset on embedding failure - self.needs_recluster = False - return - - logger.info(f"Generated {len(embeddings)} embeddings. Running HDBSCAN clustering...") - # Adjust min_cluster_size dynamically based on issue count? - min_cluster_size = max(2, min(5, num_issues // 10)) # Example: min 2, max 5, or 10% of issues - clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric='cosine', allow_single_cluster=True, gen_min_span_tree=True) - clusters = clusterer.fit_predict(embeddings) - - new_issue_clusters: Dict[int, List[int]] = {} - noise_count = 0 - for i, cluster_id in enumerate(clusters): - cluster_id_int = int(cluster_id) - if cluster_id_int == -1: - noise_count += 1 - continue - if cluster_id_int not in new_issue_clusters: - new_issue_clusters[cluster_id_int] = [] - new_issue_clusters[cluster_id_int].append(i) - - self.issue_clusters = new_issue_clusters - logger.info(f"Clustering complete. Found {len(self.issue_clusters)} clusters (min size {min_cluster_size}) with {noise_count} noise points.") - - # Reset the change counter and flag after successful clustering - self._webhook_change_count = 0 - self.needs_recluster = False - logger.debug("Reset webhook change counter and recluster flag after clustering.") - - except Exception as e: - logger.exception(f"Error during issue clustering: {e}") - self.issue_clusters = {} - self._webhook_change_count = 0 # Reset on clustering failure - self.needs_recluster = False - - - def _identify_potential_duplicates(self): - """Populates self.potential_duplicates based on self.issue_clusters and self.issue_list_for_clustering.""" - self.potential_duplicates = {} - if not self.issue_clusters or not self.issue_list_for_clustering: - logger.debug("Skipping duplicate identification: No clusters or issue list.") - return - - index_to_id = {} - try: - for i, issue in enumerate(self.issue_list_for_clustering): - issue_id = issue.get('id') - if issue_id is None: - logger.warning(f"Issue at index {i} in clustering list is missing an ID.") - continue - index_to_id[i] = issue_id - except Exception as e: - logger.error(f"Error creating index-to-ID map for duplicate check: {e}. 
Issue list might be inconsistent.") - return - - for cluster_id, indices in self.issue_clusters.items(): - if len(indices) > 1: - # Get issue IDs for indices in this cluster, skipping any invalid indices - cluster_issue_ids = [index_to_id[i] for i in indices if i in index_to_id] - if len(cluster_issue_ids) > 1: # Ensure there's more than one valid issue ID in the cluster - for issue_id in cluster_issue_ids: - # For each issue in the cluster, list all *other* issues in the same cluster as potential duplicates - self.potential_duplicates[issue_id] = [other_id for other_id in cluster_issue_ids if other_id != issue_id] - - logger.info(f"Identified potential duplicates for {len(self.potential_duplicates)} issues based on clustering.") - - async def _generate_embeddings(self, texts: List[str]): - """Generates sentence embeddings using Hugging Face Inference API.""" - if not self.hf_token: - logger.error("Hugging Face token is not set. Cannot generate embeddings.") - return None - if not texts: - logger.warning("Embedding generation requested with empty text list.") - return [] - if HF_EMBEDDING_MODEL is None: - logger.error("HF Embedding model is not configured.") - return None - - model_id = HF_EMBEDDING_MODEL # Use the fixed embedding model - api_url = f"{HF_INFERENCE_API}/{model_id}" - headers = {"Authorization": f"Bearer {self.hf_token}"} - timeout = aiohttp.ClientTimeout(total=180) # Increased timeout for embedding large batches - - logger.info(f"Requesting embeddings from {api_url} for {len(texts)} texts.") - # HF Inference API has a limit on the number of inputs per request (often 512 or 1024) - # Batching is recommended for large lists of texts. - batch_size = 500 # Example batch size, adjust based on model limits if known - all_embeddings = [] - - for i in range(0, len(texts), batch_size): - batch_texts = texts[i:i + batch_size] - payload = {"inputs": batch_texts, "options": {"wait_for_model": True}} - logger.debug(f"Processing embedding batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1} ({len(batch_texts)} texts)") - - # Implement retry logic for batches - retries = 3 - for attempt in range(retries): - try: - async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session: - async with session.post(api_url, json=payload) as response: - rate_limit_remaining = response.headers.get('X-Ratelimit-Remaining') - logger.debug(f"HF Embedding API Response Status: {response.status}, RateLimit Remaining: {rate_limit_remaining}") - - if response.status == 429: # Too Many Requests - retry_after = int(response.headers.get('Retry-After', 10)) - logger.warning(f"HF Embedding API rate limited. 
Waiting for {retry_after} seconds before retry {attempt + 1}/{retries}.") - await asyncio.sleep(retry_after) - continue # Retry the same batch - response.raise_for_status() # Raise for other 4xx/5xx errors - - result = await response.json() - - if isinstance(result, list) and all(isinstance(emb, list) and all(isinstance(f, float) for f in emb) for emb in result): - if len(result) == len(batch_texts): - all_embeddings.extend(result) - logger.debug(f"Successfully received {len(result)} embeddings for batch.") - break # Batch successful, move to next batch - else: - logger.error(f"HF Embedding API returned wrong number of embeddings for batch: Got {len(result)}, expected {len(batch_texts)}.") - return None # Indicate failure - elif isinstance(result, dict) and 'error' in result: - error_msg = result['error'] - estimated_time = result.get('estimated_time') - logger.error(f"HF Inference API embedding error on batch: {error_msg}" + (f" (Estimated time: {estimated_time}s)" if estimated_time else "")) - return None # Indicate failure - else: - logger.error(f"Unexpected embedding format received on batch: Type={type(result)}. Response: {str(result)[:500]}") - return None # Indicate failure - - except asyncio.TimeoutError: - logger.warning(f"HF Inference API embedding request timed out after {timeout.total} seconds for batch. Retry {attempt + 1}/{retries}.") - if attempt < retries - 1: - await asyncio.sleep(5) # Wait a bit before retrying timeout - continue - else: - logger.error("Max retries reached for embedding batch timeout.") - return None # Indicate failure - except aiohttp.ClientResponseError as e: - error_body = await e.text() - logger.error(f"HF Inference API embedding request failed on batch: Status={e.status}, Message='{e.message}'. Body: {error_body[:500]}. Retry {attempt + 1}/{retries}.") - if attempt < retries - 1 and e.status in [500, 502, 503, 504]: # Retry on server errors - await asyncio.sleep(5) - continue - else: - logger.error("Max retries reached or non-retryable error for embedding batch.") - return None # Indicate failure - except Exception as e: - logger.exception(f"Unexpected error during embedding generation on batch: {e}. Retry {attempt + 1}/{retries}.") - if attempt < retries - 1: - await asyncio.sleep(5) - continue - else: - logger.error("Max retries reached for unexpected error during embedding batch.") - return None # Indicate failure - else: # This else block executes if the inner loop completes without a 'break' (i.e., all retries failed) - logger.error(f"Failed to process embedding batch after {retries} retries.") - return None # Indicate failure - - await asyncio.sleep(0.1) # Small delay between batches - - if len(all_embeddings) == len(texts): - logger.info(f"Successfully generated embeddings for all {len(all_embeddings)} texts.") - return all_embeddings - else: - logger.error(f"Embedding generation failed partway through. 
Expected {len(texts)}, got {len(all_embeddings)}.") - return None # Indicate overall failure - - - async def generate_code_patch(self, issue_number: int, model_key: str) -> dict: - """Generates a code patch suggestion using a selected AI model.""" - if issue_number not in self.issues: - return {"error": f"Issue {issue_number} not found."} - if not self.hf_token: - return {"error": "Hugging Face token not set."} - if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None: - return {"error": f"Invalid or unconfigured model key: {model_key}"} - if not self.repo_local_path or not self.repo: - return {"error": "Repository not cloned/available locally. Please scan the repository first."} - - issue = self.issues[issue_number] - model_id = HF_MODELS[model_key] - logger.info(f"Generating patch for issue {issue_number} ('{issue.get('title', 'N/A')[:50]}...') using model {model_id}") - - # --- Context Gathering --- - context_str = "Context gathering failed or not available." - context_source = "Error" - start_time_context = time.time() - context_data = self.precomputed_context.get(issue_number) # Use .get for safety - - if context_data: - timestamp = context_data.get('timestamp', 0) - timestamp_str = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S') - if context_data.get("error"): - context_str = f"Pre-computed context retrieval failed: {context_data['error']}" - context_source = f"Pre-computed (Failed @ {timestamp_str})" - elif context_data.get("content"): - context_str = context_data["content"] - num_files = len(context_data.get('files',[])) - context_source = f"Pre-computed ({num_files} files @ {timestamp_str})" - else: - context_str = "Pre-computed context was empty or unavailable." - context_source = f"Pre-computed (Empty @ {timestamp_str})" - logger.info(f"Using pre-computed context for issue {issue_number} (Source: {context_source})") - else: - logger.info(f"No pre-computed context found for issue {issue_number}, computing now.") - context_source = "Computed On-Demand" - # Compute context on demand and store it - context_result = await self._get_code_context(issue) - self.precomputed_context[issue_number] = { - "content": context_result.get("content"), - "files": context_result.get("files", []), - "error": context_result.get("error"), - "timestamp": time.time() +app = Flask(__name__) + +# HTML template (simplified version of your provided HTML) +HTML_TEMPLATE = ''' + + +
+ + +Collaborative Issue Resolution Powered by AI
+Collaborative Issue Resolution Powered by AI
+                            <th>ID</th>
+                            <th>Title</th>
+                            <th>Severity</th>
+                            <th>Cluster</th>
+                        </tr>
Select an issue from the 'Issue Board' tab.
", elem_id="issue_preview_div") - - with gr.Accordion("🛠️ AI Assistance Tools", open=True, elem_id="ai_tools_accordion"): - suggest_btn = gr.Button("🧠 Suggest Resolution Steps", icon="💡", elem_id="suggest_btn", interactive=bool(HF_MODELS)) - patch_btn = gr.Button("📝 Generate Code Patch", icon="🩹", elem_id="patch_btn", interactive=bool(HF_MODELS)) - - gr.Markdown("### AI Output") - # Use a Markdown component with scrollability - ai_output_display = gr.Markdown(value="*AI suggestions and patches will appear here...*", elem_id="ai_output_md") - with gr.Row(): - copy_patch_btn = gr.Button("📋 Copy Patch", elem_id="copy_patch_btn", visible=False) # Initially hidden - clear_ai_output_btn = gr.Button("🧹 Clear Output", elem_id="clear_ai_output_btn") - - - with gr.Column(scale=2, min_width=600): - gr.Markdown("### Collaborative Code Editor (Context-Aware)") - gr.Markdown("⚠️ Warning: Real-time collaborative editing is experimental and may lose data with simultaneous edits. Use with caution and save work frequently!
") - # Initialize with placeholder content - code_edit_component = code_editor( - label="Code Context / Editor", - language="python", # Default language, can be changed based on file extension in JS - interactive=True, - elem_id="code_editor_component", - value={"placeholder.txt": "# Select an issue to load relevant code context."} - ) - - with gr.Tab("📈 Analytics", id="analytics", elem_id="tab-analytics"): - gr.Markdown("### Repository Analytics") - with gr.Row(equal_height=False): - with gr.Column(scale=1): - gr.Markdown("#### Issue Severity Distribution") - # Initialize with empty plot, will be updated after crawl - analytics_severity_plot = gr.Plot(label="Severity Distribution (Analytics)", elem_id="analytics_severity_plot", value=manager._generate_stats_plot({})) - with gr.Column(scale=1): - gr.Markdown("#### Issue Cluster Analysis (Top Clusters)") - cluster_info_df = gr.Dataframe( - headers=["Cluster ID", "Issue Count", "Top Keywords (Example)"], - datatype=["number", "number", "str"], - value=[["Scan a repository to see cluster data.", 0, ""]], # Initial placeholder data - label="Issue Clusters", elem_id="cluster_info_df", - interactive=False, # Make this dataframe non-interactive - row_count=(5, "dynamic"), - col_count=(3, "fixed"), - wrap=True - ) - gr.Markdown("*(Analytics update after scanning the repository. More detailed analytics could be added.)*") - - # --- Event Handlers --- - # The crawl button updates the issue list, stats plot, status, and analytics plot - crawl_btn.click( - fn=manager.crawl_issues, - inputs=[repo_url, github_token, hf_token], - outputs=[issue_list, stats_plot, status_output, analytics_severity_plot], - api_name="crawl_issues", - show_progress="full" - ).then( - # After crawl_issues completes, update the cluster analytics dataframe - fn=lambda: update_cluster_analytics(manager), - inputs=[], - outputs=[cluster_info_df] - ) - - # The issue list selection updates the preview, code editor, AI output area, and hidden state - issue_list.select( - fn=handle_issue_select, - # Pass the event data, which contains the selected row's value (including ID) - inputs=[gr.SelectData()], - outputs=[selected_issue_id_hidden, issue_preview_html, code_edit_component, ai_output_display, copy_patch_btn], - show_progress="minimal", - # Trigger the JS function to update the tracked issue ID and editor listeners - # This is handled by the MutationObserver and reading the hidden input value in JS - ) - - # AI Suggestion button - suggest_btn.click( - fn=get_ai_suggestion_wrapper, - inputs=[selected_issue_id_hidden, model_select], # Read selected issue ID from hidden state - outputs=[ai_output_display], - api_name="suggest_resolution", - show_progress="full" - ).then( - # After getting suggestion, hide the copy patch button - fn=lambda: gr.update(visible=False), - inputs=[], - outputs=[copy_patch_btn] - ) - - # AI Patch button - patch_btn.click( - fn=get_ai_patch_wrapper, - inputs=[selected_issue_id_hidden, model_select], # Read selected issue ID from hidden state - outputs=[ai_output_display], - api_name="generate_patch", - show_progress="full" - ).then( - # After getting patch, check if it contains a diff block and show the copy button if so - # Use a lambda to check the output text - fn=lambda output_text: gr.update(visible="```diff" in output_text), - inputs=[ai_output_display], - outputs=[copy_patch_btn] - ) - - # Clear AI Output button - clear_ai_output_btn.click( - fn=lambda: ["*AI suggestions and patches will appear here...*", gr.update(visible=False)], - inputs=[], - 
outputs=[ai_output_display, copy_patch_btn] - ) - - # --- JavaScript for WebSocket Communication and UI Interaction --- - def web_socket_js(ws_port, gradio_port): - # Generate a unique client ID on page load - # Note: This JS is generated *once* when the Gradio app is created. - # The actual client ID is generated and logged when the JS runs in the browser. - # The Python-side logging here is just for context during app startup. - temp_client_id_placeholder = f"client_{hashlib.sha1(os.urandom(16)).hexdigest()[:8]}" - logger.info(f"Generating JS with placeholder Client ID: {temp_client_id_placeholder}") - - return f""" - - """ - # The _js parameter injects the JavaScript code into the Gradio page - demo_app.load(_js=web_socket_js(WS_PORT, GRADIO_PORT), fn=None, inputs=None, outputs=None) - - return demo_app - -# ========== WebSocket Server Logic ========== -async def handle_ws_connection(websocket: WebSocketServerProtocol, path: str, manager: IssueManager): - """Handles incoming WebSocket connections and messages for collaboration.""" - # Generate a client ID and attach it to the websocket object - client_id = f"client_{hashlib.sha1(os.urandom(16)).hexdigest()[:8]}" - setattr(websocket, 'client_id', client_id) - remote_addr = websocket.remote_address - logger.info(f"WebSocket client connected: {remote_addr} assigned ID {client_id}") - - # Add the new client to the list of active clients - manager.ws_clients.append(websocket) - logger.info(f"Client list size: {len(manager.ws_clients)}") +Select an issue from the 'Issue Board' tab.
+        <div>AI suggestions and patches will appear here...</div>
+        <p>Real-time collaborative editing is experimental and may lose data with simultaneous edits. Use with caution and save work frequently!</p>
+        <pre># Select an issue to load relevant code context.</pre>
+        <table>
+            <tr><th>Cluster ID</th><th>Issue Count</th><th>Top Keywords</th></tr>
+            <tr><td>1</td><td>15</td><td>authentication, login, session</td></tr>
+            <tr><td>2</td><td>8</td><td>performance, slow, response</td></tr>
+            <tr><td>3</td><td>5</td><td>documentation, api, update</td></tr>
+        </table>
+        <p><em>Analytics update after scanning the repository. More detailed analytics could be added.</em></p>
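+        <!-- Illustrative sketch, not part of the original change: the table rows above are
+             hard-coded mock data. Because this page is rendered with render_template_string
+             (Jinja2), rows like the ones below could instead be generated from a hypothetical
+             `clusters` list passed in by the view. The snippet is wrapped in a Jinja raw block
+             and an HTML comment so it is never evaluated here. -->
+        <!-- {% raw %}
+        {% for cluster in clusters %}
+            <tr><td>{{ cluster.id }}</td><td>{{ cluster.count }}</td><td>{{ cluster.keywords }}</td></tr>
+        {% endfor %}
+        {% endraw %} -->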
+1. Address Missing Information: Request steps to reproduce from the user and ask for server error logs.
+2. Refine Understanding: The issue #{issue_id} appears to be related to authentication flow problems.
+3. Identify Relevant Code Areas: Check authentication modules and session management code.
+4. Implementation Steps:
+5. Testing Recommendations: Test with different browsers and network conditions.
+ ''' + + return jsonify({'suggestions': suggestions}) + except Exception as e: - logger.exception(f"Unexpected error in Webhook server thread: {e}") - # If an unexpected error occurs, signal the main loop to stop - if main_loop.is_running(): - main_loop.call_soon_threadsafe(main_loop.stop) - finally: - if httpd: - logger.info("Shutting down Webhook HTTP server...") - # This method stops the serve_forever() loop - # It needs to be called from a different thread than the one running serve_forever() - # The manager.stop_webhook_server lambda is designed for this. - # If this finally block is reached due to an exception *within* serve_forever(), - # httpd.shutdown() might not be needed or might fail, but calling it is safer. - try: - httpd.shutdown() # Signal the server to stop accepting new connections and finish current ones - except Exception as e: - logger.warning(f"Error during httpd.shutdown(): {e}") - try: - httpd.server_close() # Close the server socket - except Exception as e: - logger.warning(f"Error during httpd.server_close(): {e}") - - logger.info("Webhook server thread finished.") - - -# ========== Main Execution ========== -if __name__ == "__main__": - print("--- Acknowledging potential TensorFlow/CUDA warnings ---") - print("If you see warnings like 'Could not load dynamic library...' or related to CUDA/GPU,") - print("they are often harmless if you are not using a local GPU-accelerated model.") - print("Hugging Face Inference API calls run remotely and do not require local GPU setup.") - print("--- Starting Application ---") - - # Get or create the event loop for the main thread - # This loop will run the async tasks (WS server, idle tasks, broadcast) - try: - loop = asyncio.get_event_loop() - logger.info(f"Using existing asyncio loop in main thread: {id(loop)}") - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - logger.info(f"Created and set new asyncio loop in main thread: {id(loop)}") - - manager = IssueManager() - - # Start the webhook server in a separate thread - webhook_thread = threading.Thread( - target=run_webhook_server, - args=(manager, WEBHOOK_PORT, loop), # Pass the main loop to the webhook thread - name="WebhookServerThread", - daemon=True # Allow the main program to exit even if this thread is running - ) - webhook_thread.start() - - # Create the Gradio UI - app = create_ui(manager) - - # Start the WebSocket server as an asyncio task in the main loop - ws_task = loop.create_task(start_websocket_server(manager, WS_PORT)) - - # Start the manager's background tasks (idle processing, broadcast) - # These tasks are created in the manager's loop (which is the main loop here) - manager.start_idle_processing() - manager.start_broadcast_loop() - - - def shutdown_handler(signum, frame): - """Graceful shutdown logic.""" - logger.info(f"Signal {signum} received. 
Initiating shutdown...") - # Signal the WebSocket server to stop - if hasattr(manager, 'stop_ws_server') and manager.stop_ws_server: - manager.stop_ws_server() - # Signal the webhook server to stop - if hasattr(manager, 'stop_webhook_server') and manager.stop_webhook_server: - manager.stop_webhook_server() - # Stop manager's internal async tasks - manager.stop_idle_processing() - manager.stop_broadcast_loop() - - # Stop the main asyncio loop - if loop.is_running(): - logger.info("Stopping main asyncio loop...") - # Use call_soon_threadsafe because signal handlers run in a different context - loop.call_soon_threadsafe(loop.stop) - else: - logger.warning("Main asyncio loop not running during shutdown handler.") - + return jsonify({'error': str(e)}), 500 - # Add signal handlers for graceful shutdown (e.g., Ctrl+C) - try: - # For Unix-like systems - loop.add_signal_handler(signal.SIGINT, shutdown_handler, signal.SIGINT, None) - loop.add_signal_handler(signal.SIGTERM, shutdown_handler, signal.SIGTERM, None) - logger.info("Added signal handlers for SIGINT and SIGTERM.") - except NotImplementedError: - # Signal handlers are not available on Windows - logger.warning("Signal handlers not available on this platform (likely Windows). Ctrl+C may not be graceful.") - # On Windows, KeyboardInterrupt is usually caught directly in the main thread - - - # Launch Gradio app - # Use prevent_thread_lock=True to run Gradio server in a separate thread, - # freeing the main thread to run the asyncio loop. +@app.route('/ai_patch', methods=['POST']) +def ai_patch(): try: - logger.info(f"Launching Gradio app on port {GRADIO_PORT}...") - app.launch( - server_name="0.0.0.0", - server_port=GRADIO_PORT, - share=False, # Set to True to share publicly (requires auth token usually) - debug=True, # Keep debug=True for development logs - inbrowser=True, - prevent_thread_lock=True # Run Gradio server in a separate thread - ) - logger.info("Gradio app launched in separate thread.") - - # Run the main asyncio loop indefinitely to keep async tasks running - logger.info("Running main asyncio loop...") - loop.run_forever() # This blocks the main thread until loop.stop() is called - - except KeyboardInterrupt: - logger.info("KeyboardInterrupt received in main thread.") - # If signal handlers are not implemented (e.g., Windows), KeyboardInterrupt - # is caught here. Trigger shutdown manually. - # Check if signal handlers were likely not registered by checking if signal.SIGINT exists - if not hasattr(signal, 'SIGINT'): - shutdown_handler(None, None) # Call shutdown logic - - except SystemExit as e: - logger.error(f"SystemExit received: {e}") - # SystemExit might be raised by port binding errors etc. - # The source of SystemExit might have already triggered loop.stop() or shutdown. - # Ensure cleanup happens if not already started. - # Check if the loop is still running; if so, stop it. - if loop.is_running(): - logger.info("SystemExit caught before loop stopped, triggering shutdown.") - shutdown_handler(None, None) - else: - logger.info("SystemExit caught, but loop already stopped. 
Proceeding with final cleanup.") - - + data = request.get_json() + issue_id = data.get('issue_id') + hf_token = data.get('hf_token') + model = data.get('model') + + if not hf_token: + return jsonify({'error': 'Hugging Face token is required'}), 400 + + # In a real implementation, you would call the Hugging Face API here + # For this example, we'll return a mock patch + patch = f''' +Explanation: This patch addresses issue #{issue_id} by improving error handling in the authentication flow.
+Patch:
++--- a/src/auth/auth_service.py ++++ b/src/auth/auth_service.py +@@ -42,6 +42,10 @@ + user = self.get_user(credentials.username) + if not user: + raise AuthenticationError("User not found") ++ ++ # Validate account status ++ if not user.is_active: ++ raise AuthenticationError("Account is deactivated") + + if not self.check_password(credentials.password, user.password_hash): + raise AuthenticationError("Invalid credentials") ++ ''' + + return jsonify({'patch': patch}) + except Exception as e: - logger.exception(f"An unexpected error occurred in the main thread: {e}") - # Trigger graceful shutdown on unexpected error - shutdown_handler(None, None) - - finally: - logger.info("Main thread exiting finally block.") - # Wait for background tasks/threads to complete shutdown... - logger.info("Running loop briefly to complete pending tasks (e.g., cancellations)...") - try: - # Gather remaining tasks (like ws_task cancellation, idle_task cancellation) - # Exclude the current task if inside a task (though we are in the main thread here) - # asyncio.all_tasks() returns tasks for the *current* thread's loop - pending_tasks = [task for task in asyncio.all_tasks(loop=loop) if not task.done()] - if pending_tasks: - # Use a timeout for waiting for tasks to finish - try: - # Wait for pending tasks, ignoring exceptions during shutdown - loop.run_until_complete(asyncio.wait(pending_tasks, timeout=5, return_when=asyncio.ALL_COMPLETED)) - logger.info(f"Completed pending tasks.") - except asyncio.TimeoutError: - logger.warning(f"Timed out waiting for {len(pending_tasks)} pending tasks to complete.") - except Exception as e: - logger.error(f"Error during final loop run_until_complete: {e}") - else: - logger.info("No pending tasks to complete.") - except Exception as e: - logger.error(f"Error checking pending tasks: {e}") - - - # Wait for the webhook thread to join (if it's not daemon, or if shutdown() needs time) - # Since it's daemon, it will be killed, but explicit shutdown is better. - # Add a small timeout for join in case shutdown() hangs. - # Check if the thread is still alive before joining - if webhook_thread.is_alive(): - logger.info("Waiting for webhook thread to join...") - webhook_thread.join(timeout=2) # Wait up to 2 seconds - - # Close the loop - if not loop.is_closed(): - logger.info("Closing asyncio loop.") - loop.close() - - logger.info("Application shutdown complete.") \ No newline at end of file + return jsonify({'error': str(e)}), 500 + +if __name__ == '__main__': + app.run(debug=True) \ No newline at end of file
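+
+# --- Illustrative sketch (not part of the original change) ---
+# A minimal way the mock responses in ai_suggestions/ai_patch could be replaced with a
+# real Hugging Face Inference API call, using the `requests` import already at the top
+# of app.py. `HF_INFERENCE_API_URL` and `query_hf_model` are names introduced here for
+# illustration only; in a real module this helper would be defined above the routes
+# that call it, not after the __main__ guard.
+HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
+
+def query_hf_model(model_id, prompt, hf_token, timeout=60):
+    """Send a text-generation request to the HF Inference API and return the generated text."""
+    response = requests.post(
+        f"{HF_INFERENCE_API_URL}/{model_id}",
+        headers={"Authorization": f"Bearer {hf_token}"},
+        json={"inputs": prompt, "parameters": {"max_new_tokens": 512}},
+        timeout=timeout,
+    )
+    response.raise_for_status()
+    data = response.json()
+    # Text-generation models typically return a list like [{"generated_text": "..."}].
+    if isinstance(data, list) and data and "generated_text" in data[0]:
+        return data[0]["generated_text"]
+    return str(data)
+
+# Hypothetical usage inside ai_patch, replacing the mock string:
+#   patch = query_hf_model(model, f"Generate a patch for issue #{issue_id}", hf_token)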