import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import pandas as pd
import os
import logging
from datetime import datetime
from datasets import Dataset, load_dataset

from src.core.evaluation import EvaluationManager, EvaluationRequest
from src.logging_config import setup_logging
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    ModelType,
    WeightType,
    Precision,
)
from src.envs import (
    API,
    CACHE_PATH,
    EVAL_REQUESTS_PATH,
    EVAL_RESULTS_PATH,
    QUEUE_REPO,
    REPO_ID,
    RESULTS_REPO,
    TOKEN,
)
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import initialize_queue_repo, initialize_results_repo

# Setup logging
setup_logging(log_dir="logs")
logger = logging.getLogger('web')

# Initialize evaluation manager
evaluation_manager = EvaluationManager(
    results_dir=EVAL_RESULTS_PATH,
    backup_dir=os.path.join(CACHE_PATH, "eval-backups"),
)


def restart_space():
    """Restart the Hugging Face space."""
    logger.info("Restarting space")
    API.restart_space(repo_id=REPO_ID)


def initialize_space():
    """Initialize the space by downloading required data."""
    logger.info("Initializing space")

    try:
        logger.info(f"Downloading queue data from {QUEUE_REPO}")
        # Initialize queue repository if needed
        if not initialize_queue_repo():
            logger.error("Failed to initialize queue repository")
            restart_space()
            return
        snapshot_download(
            repo_id=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as e:
        logger.error(f"Failed to download queue data: {str(e)}")
        restart_space()

    try:
        logger.info(f"Downloading results data from {RESULTS_REPO}")
        # Initialize results repository if needed
        if not initialize_results_repo():
            logger.error("Failed to initialize results repository")
            restart_space()
            return
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as e:
        logger.error(f"Failed to download results data: {str(e)}")
        restart_space()


# Initialize space
initialize_space()

LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_COLS)


def process_evaluation_queue():
    """Process pending evaluation requests."""
    logger.info("Processing evaluation queue")

    # Fetch pending requests from the Hugging Face queue repository
    _, _, pending_requests = get_evaluation_queue_df(EVAL_COLS + ['model_raw', 'timestamp'])

    for _, request in pending_requests.iterrows():
        # Resolve the model name before the try block so the failure handler below can log it
        model_name = request.get('model_raw', 'unknown')
        try:
            logger.info(f"Processing request for model: {model_name}")

            # Update status to RUNNING
            update_request_status(model_name, "RUNNING")

            # Convert queue request to evaluation request
            eval_request = EvaluationRequest(
                model=model_name,
                revision=request['revision'],
                precision=request['precision'],
                weight_type=request['weight_type'],
                submitted_time=request['timestamp'],  # Use the actual timestamp field
                model_type=request.get('model_type', ''),
            )

            # Run evaluation
            results = evaluation_manager.run_evaluation(eval_request)
            logger.info(f"Evaluation complete for {model_name}")

            # Save results to stacklok/results
            save_results_to_repo(results, RESULTS_REPO)

            # Update request status in stacklok/requests
            update_request_status(model_name, "FINISHED")

            # Update leaderboard
            update_leaderboard()

        except Exception as e:
            logger.error(f"Evaluation failed for {model_name}: {str(e)}", exc_info=True)
            # Update request status to indicate failure
            update_request_status(model_name, "FAILED")
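
# Request lifecycle implemented by this module: handle_submission() records a row
# with status "PENDING" in the queue dataset, process_evaluation_queue() moves it
# to "RUNNING" while evaluation_manager runs the evaluation, and
# update_request_status() persists the terminal "FINISHED" or "FAILED" state back
# to QUEUE_REPO.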

def update_request_status(model_name, status):
    """Update the status of a request in the Hugging Face repository."""
    try:
        # Load the current dataset
        dataset = load_dataset(QUEUE_REPO, split="train")

        # Convert to dictionary for easier manipulation
        data_dict = dataset.to_dict()

        # Find the most recent request for this model
        indices = [i for i, m in enumerate(data_dict["model_raw"]) if m == model_name]
        if not indices:
            logger.error(f"No request found for model {model_name}")
            return

        # Get the most recent request (last index)
        latest_index = indices[-1]

        # Update the status for the found request
        data_dict["status"][latest_index] = status

        # Create new dataset with updated status
        updated_dataset = Dataset.from_dict(data_dict)

        # Push the updated dataset back to the hub with a descriptive commit message
        updated_dataset.push_to_hub(
            QUEUE_REPO,
            split="train",
            commit_message=f"Update status to {status} for {model_name}",
        )

        logger.info(f"Updated status for {model_name} to {status}")
    except Exception as e:
        logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True)


def save_results_to_repo(results, repo):
    """Save evaluation results to the specified repository."""
    try:
        model_id = results.get('model', '')
        if not model_id:
            raise ValueError("Model ID not found in results")

        # Convert all values to lists if they aren't already
        dataset_dict = {
            k: [v] if not isinstance(v, list) else v
            for k, v in results.items()
        }

        # Create a Dataset object from the results
        dataset = Dataset.from_dict(dataset_dict)

        # Push the dataset to the Hugging Face Hub
        dataset.push_to_hub(repo, split="train")
        logger.info(f"Saved results for {model_id} to {repo}")
    except Exception as e:
        logger.error(f"Failed to save results to {repo}: {str(e)}", exc_info=True)


def update_leaderboard():
    """Update the leaderboard with the latest evaluation results."""
    global LEADERBOARD_DF
    LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
    return LEADERBOARD_DF


def init_leaderboard(df):
    """Initialize the leaderboard with the given DataFrame."""
    if df is None or df.empty:
        df = pd.DataFrame(columns=COLS)
        logger.info("Creating empty leaderboard - no evaluations completed yet")
    else:
        logger.info(f"Initializing leaderboard with {len(df)} rows")

    # Ensure all required columns exist
    for col in COLS:
        if col not in df.columns:
            logger.warning(f"Column {col} not found in DataFrame, adding with None values")
            df[col] = None

    # Map dataset columns to display columns
    column_mapping = {
        "model_id": "Model",
        "security_score": "Security Score ⬆️",
        "safetensors_compliant": "Safetensors",
        "precision": "Precision",
    }
    for src, dst in column_mapping.items():
        if src in df.columns:
            df[dst] = df[src]
            logger.debug(f"Mapped column {src} to {dst}")

    # Sort by Security Score if available
    if "Security Score ⬆️" in df.columns:
        df = df.sort_values(by="Security Score ⬆️", ascending=False)
        logger.info("Sorted leaderboard by Security Score")

    # Select only the columns we want to display
    df = df[COLS]
    logger.info(f"Final leaderboard columns: {df.columns.tolist()}")
    logger.debug(f"Leaderboard data:\n{df}")

    # Create the leaderboard using gradio_leaderboard
    return Leaderboard(
        value=df,
        datatype=[
            "html" if col == "Model"
            else "number" if col == "Security Score ⬆️"
            else "bool" if col == "Safetensors"
            else "str"
            for col in COLS
        ],
        select_columns=SelectColumns(
            default_selection=COLS,
            cant_deselect=["Model", "Security Score ⬆️", "Safetensors"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            ColumnFilter("Safetensors", type="boolean", label="Show only Safetensors models"),
            ColumnFilter("Security Score ⬆️", type="slider", min=0, max=1, label="Minimum Security Score"),
        ],
        interactive=False,
    )
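
# Illustrative sketch (not authoritative) of the per-model results record that the
# column mapping in init_leaderboard() expects. Keys are taken from column_mapping
# above, values are made up, and real records from run_evaluation() may carry
# additional fields such as "model":
#
#     {
#         "model_id": "org/model-name",
#         "security_score": 0.87,
#         "safetensors_compliant": True,
#         "precision": "float16",
#     }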

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔒 Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit Model", elem_id="submit-tab", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

            with gr.Row():
                gr.Markdown("# 🔒 Submit Your Model for Security Evaluation", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name (organization/model-name)",
                        placeholder="huggingface/model-name",
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision commit",
                        placeholder="main",
                    )
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weight Format",
                        multiselect=False,
                        value="Safetensors",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(
                        label="Base model (for delta or adapter weights)",
                        placeholder="Optional: base model path",
                    )

            with gr.Row():
                gr.Markdown(
                    """
                    ### Security Requirements:
                    1. Model weights must be in safetensors format
                    2. Model card must include security considerations
                    3. Model will be evaluated on secure coding capabilities
                    """,
                    elem_classes="markdown-text",
                )

            submit_button = gr.Button("Submit for Security Evaluation")
            submission_result = gr.Markdown()

            def handle_submission(model, base_model, revision, precision, weight_type, model_type):
                """Handle a new model submission."""
                try:
                    logger.info(f"New submission received for {model}")

                    # Prepare request data as a dataset-compatible dictionary (all values must be lists)
                    request_data = {
                        "model": [model],
                        "model_raw": [model],  # Store raw model name for processing
                        "base_model": [base_model if base_model else ""],
                        "revision": [revision if revision else "main"],
                        "precision": [precision],
                        "weight_type": [weight_type],
                        "model_type": [model_type],
                        "status": ["PENDING"],
                        "timestamp": [datetime.now().isoformat()],
                    }

                    # Convert to dataset and push to hub
                    dataset = Dataset.from_dict(request_data)
                    dataset.push_to_hub(
                        QUEUE_REPO,
                        config_name=model.replace("/", "_"),
                        split="train",
                    )
                    logger.info(f"Added request for {model} to {QUEUE_REPO}")

                    # Get updated pending evaluations
                    _, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)

                    # Start processing the queue in the background
                    scheduler.add_job(process_evaluation_queue, id='process_queue_job', replace_existing=True)

                    return (
                        "Submission successful! Your model has been added to the evaluation queue. "
                        "Please check the 'Pending Evaluation Queue' for status updates.",
                        pending_eval_queue_df,
                    )
                except Exception as e:
                    logger.error(f"Submission failed: {str(e)}", exc_info=True)
                    return f"Error: {str(e)}", None

            submit_button.click(
                handle_submission,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                [submission_result, pending_eval_table],
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Update evaluation tables periodically
    def update_evaluation_tables():
        finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)
        return finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

    # Update evaluation tables every 60 seconds
    demo.load(
        update_evaluation_tables,
        outputs=[finished_eval_table, running_eval_table, pending_eval_table],
        every=60,
    )
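
# For reference, each submission pushed by handle_submission() above becomes one row
# in the queue dataset. A sketch of that row, with illustrative values only:
#
#     {
#         "model": "org/model-name",
#         "model_raw": "org/model-name",
#         "base_model": "",
#         "revision": "main",
#         "precision": "float16",
#         "weight_type": "Safetensors",
#         "model_type": "<model type label>",
#         "status": "PENDING",
#         "timestamp": "2024-01-01T12:00:00",
#     }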

# Setup schedulers
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(process_evaluation_queue, "interval", seconds=300)  # Process queue every 5 minutes
scheduler.start()

logger.info("Application startup complete")
demo.queue(default_concurrency_limit=40).launch()