import gradio as gr from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import snapshot_download, HfApi import pandas as pd import os import logging import json from datetime import datetime from src.core.evaluation import EvaluationManager, EvaluationRequest from src.logging_config import setup_logging from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.display.utils import ( BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision ) from src.envs import ( API, CACHE_PATH, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN ) from src.populate import get_evaluation_queue_df, get_leaderboard_df from src.submission.submit import add_new_eval # Setup logging setup_logging(log_dir="logs") logger = logging.getLogger('web') # Initialize evaluation manager evaluation_manager = EvaluationManager( results_dir=EVAL_RESULTS_PATH, backup_dir=os.path.join(CACHE_PATH, "eval-backups") ) def restart_space(): """Restart the Hugging Face space.""" logger.info("Restarting space") API.restart_space(repo_id=REPO_ID) def initialize_space(): """Initialize the space by downloading required data.""" logger.info("Initializing space") try: logger.info(f"Downloading queue data from {QUEUE_REPO}") snapshot_download( repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception as e: logger.error(f"Failed to download queue data: {str(e)}") restart_space() try: logger.info(f"Downloading results data from {RESULTS_REPO}") snapshot_download( repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN ) except Exception as e: logger.error(f"Failed to download results data: {str(e)}") restart_space() # Initialize space initialize_space() LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS) ( finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, ) = get_evaluation_queue_df(EVAL_COLS) # Function to update the leaderboard def update_leaderboard(): global LEADERBOARD_DF LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS) return LEADERBOARD_DF def process_evaluation_queue(): """Process pending evaluation requests.""" logger.info("Processing evaluation queue") # Fetch pending requests from Hugging Face repository _, _, pending_requests = get_evaluation_queue_df(EVAL_COLS + ['model_raw']) for _, request in pending_requests.iterrows(): try: model_name = request['model_raw'] logger.info(f"Processing request for model: {model_name}") # Convert queue request to evaluation request eval_request = EvaluationRequest( model=model_name, revision=request['revision'], precision=request['precision'], weight_type=request['weight_type'], submitted_time=request['status'], # Assuming 'status' column contains timestamp model_type=request.get('model_type', '') ) # Run evaluation results = evaluation_manager.run_evaluation(eval_request) logger.info(f"Evaluation complete for {model_name}") # Save results to stacklok/results save_results_to_repo(results, RESULTS_REPO) # Update request status in stacklok/requests update_request_status(model_name, "FINISHED") # Update leaderboard update_leaderboard() except Exception as e: logger.error(f"Evaluation failed for {model_name}: {str(e)}", exc_info=True) # Update request status to indicate failure update_request_status(model_name, "FAILED") def update_request_status(model_name, status): """Update the status of a request in the Hugging Face repository.""" try: api = HfApi() filename = f"{model_name.replace('/', '_')}_request.json" # Fetch the current request data file_content = api.hf_hub_download(repo_id=QUEUE_REPO, filename=filename, repo_type="dataset") with open(file_content, 'r') as f: request_data = json.load(f) # Update the status request_data['status'] = status # Upload the updated file api.upload_file( path_or_fileobj=json.dumps(request_data).encode(), path_in_repo=filename, repo_id=QUEUE_REPO, repo_type="dataset", token=TOKEN ) logger.info(f"Updated status for {model_name} to {status}") except Exception as e: logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True) # Remove the extract_model_name function as it's no longer needed from huggingface_hub import HfApi def save_results_to_repo(results, repo): """Save evaluation results to the specified repository.""" try: api = HfApi() model_id = results['model_id'].replace('/', '_') filename = f"{model_id}_results.json" # Convert results to JSON string json_results = json.dumps(results, indent=2) # Save results to the repository api.upload_file( path_or_fileobj=json_results.encode(), path_in_repo=filename, repo_id=repo, repo_type="dataset", token=TOKEN ) logger.info(f"Saved results for {results['model_id']} to {repo}/{filename}") except Exception as e: logger.error(f"Failed to save results to {repo}: {str(e)}", exc_info=True) def update_leaderboard(): """Update the leaderboard with latest evaluation results.""" global LEADERBOARD_DF LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) return LEADERBOARD_DF def init_leaderboard(df): """Initialize the leaderboard with the given DataFrame.""" if df is None or df.empty: df = pd.DataFrame(columns=COLS) logger.info("Creating empty leaderboard - no evaluations completed yet") else: logger.info(f"Initializing leaderboard with {len(df)} rows") # Ensure "Security Score ⬆️" and "Safetensors" columns are present if "Security Score ⬆️" not in df.columns: logger.warning("Security Score column not found in DataFrame") df["Security Score ⬆️"] = None if "Safetensors" not in df.columns: logger.warning("Safetensors column not found in DataFrame") df["Safetensors"] = None # Sort by Security Score if available if "Security Score ⬆️" in df.columns: df = df.sort_values(by="Security Score ⬆️", ascending=False) logger.info("Sorted leaderboard by Security Score") # Ensure all required columns are present for col in COLS: if col not in df.columns: logger.warning(f"Column {col} not found in DataFrame, adding with None values") df[col] = None # Select only the columns we want to display df = df[COLS] logger.info(f"Final leaderboard columns: {df.columns.tolist()}") logger.debug(f"Leaderboard data:\n{df}") # Create the leaderboard return gr.Dataframe( headers=COLS, datatype=["str"] * len(COLS), row_count=10, col_count=(len(COLS), "fixed"), value=df, wrap=True, column_widths=[50] + [None] * (len(COLS) - 1), type="pandas", ) demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("🔒 Security Leaderboard", elem_id="security-leaderboard-tab", id=0): leaderboard = init_leaderboard(LEADERBOARD_DF) with gr.TabItem("📝 About", elem_id="about-tab", id=2): gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") with gr.TabItem("🚀 Submit Model", elem_id="submit-tab", id=3): with gr.Column(): with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Column(): with gr.Accordion( f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False, ): with gr.Row(): finished_eval_table = gr.components.Dataframe( value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Accordion( f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False, ): with gr.Row(): running_eval_table = gr.components.Dataframe( value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Accordion( f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False, ): with gr.Row(): pending_eval_table = gr.components.Dataframe( value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5, ) with gr.Row(): gr.Markdown("# 🔒 Submit Your Model for Security Evaluation", elem_classes="markdown-text") with gr.Row(): with gr.Column(): model_name_textbox = gr.Textbox( label="Model name (organization/model-name)", placeholder="huggingface/model-name" ) revision_name_textbox = gr.Textbox( label="Revision commit", placeholder="main" ) model_type = gr.Dropdown( choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], label="Model type", multiselect=False, value=None, interactive=True, ) with gr.Column(): precision = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) weight_type = gr.Dropdown( choices=[i.value.name for i in WeightType], label="Weight Format", multiselect=False, value="Safetensors", interactive=True, ) base_model_name_textbox = gr.Textbox( label="Base model (for delta or adapter weights)", placeholder="Optional: base model path" ) with gr.Row(): gr.Markdown( """ ### Security Requirements: 1. Model weights must be in safetensors format 2. Model card must include security considerations 3. Model will be evaluated on secure coding capabilities """, elem_classes="markdown-text" ) submit_button = gr.Button("Submit for Security Evaluation") submission_result = gr.Markdown() def handle_submission(model, base_model, revision, precision, weight_type, model_type): """Handle new model submission.""" try: logger.info(f"New submission received for {model}") # Prepare request data request_data = { "model": model, "base_model": base_model, "revision": revision if revision else "main", "precision": precision, "weight_type": weight_type, "model_type": model_type, "status": "PENDING", "timestamp": datetime.now().isoformat() } # Add to queue in QUEUE_REPO api = HfApi() filename = f"{model.replace('/', '_')}_request.json" api.upload_file( path_or_fileobj=json.dumps(request_data).encode(), path_in_repo=filename, repo_id=QUEUE_REPO, repo_type="dataset", token=TOKEN ) logger.info(f"Added request for {model} to {QUEUE_REPO}") # Get updated pending evaluations _, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS) # Start processing queue in background scheduler.add_job(process_evaluation_queue, id='process_queue_job', replace_existing=True) return gr.Markdown("Submission successful! Your model has been added to the evaluation queue. Please check the 'Pending Evaluation Queue' for status updates."), pending_eval_queue_df except Exception as e: logger.error(f"Submission failed: {str(e)}", exc_info=True) return gr.Markdown(f"Error: {str(e)}"), None # Remove the queue_manager initialization # queue_manager = QueueManager(queue_dir=os.path.join(CACHE_PATH, "eval-queue")) submit_button.click( handle_submission, [ model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type, ], submission_result, ) with gr.Row(): with gr.Accordion("📙 Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) # Update evaluation tables periodically def update_evaluation_tables(): finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS) return finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df # Setup schedulers scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.add_job(process_evaluation_queue, "interval", seconds=300) # Process queue every 5 minutes scheduler.start() logger.info("Application startup complete") demo.queue(default_concurrency_limit=40).launch() # Update evaluation tables every 60 seconds demo.load(update_evaluation_tables, outputs=[finished_eval_table, running_eval_table, pending_eval_table], every=60)