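"""Gradio Space for the LLM security leaderboard.

Builds the leaderboard table from local evaluation results, shows the
finished / running / pending evaluation queues, and exposes a submission
form that feeds new models into the evaluation queue via add_new_eval.
"""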
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.leaderboard.security_eval import check_safetensors
# Hugging Face Hub downloads are skipped for local testing; the leaderboard and
# queue data are read from the local EVAL_RESULTS_PATH / EVAL_REQUESTS_PATH.
print("Creating leaderboard DataFrame...")
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
print(f"LEADERBOARD_DF shape: {LEADERBOARD_DF.shape}")
print(f"LEADERBOARD_DF columns: {LEADERBOARD_DF.columns.tolist()}")
print(f"LEADERBOARD_DF data:\n{LEADERBOARD_DF}")
print("\nGetting evaluation queue DataFrames...")
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
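# The three queue DataFrames back the accordion tables in the "Submit Model"
# tab and are refreshed by update_evaluation_tables() further below.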
def get_field_mapping():
"""Create a mapping from display names to field names."""
auto_eval_fields = fields(AutoEvalColumn)
return {f.name: f for f in auto_eval_fields}
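# The mapping is keyed by display/column name; each value is the column
# descriptor whose .type, .displayed_by_default, .never_hidden and .hidden
# attributes drive the Leaderboard configuration in init_leaderboard().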
def create_empty_dataframe(field_mapping):
"""Create an empty DataFrame with the correct columns."""
import pandas as pd
return pd.DataFrame(columns=[f.name for f in field_mapping.values()])
def verify_columns(dataframe, field_mapping):
"""Verify all required columns are present."""
for col in dataframe.columns:
if col not in field_mapping:
print(f"Warning: Column {col} not found in field mapping")
def init_leaderboard(dataframe):
print(f"Initializing leaderboard with DataFrame shape: {dataframe.shape}")
field_mapping = get_field_mapping()
print(f"Field mapping: {field_mapping}")
if dataframe is None or len(dataframe) == 0:
dataframe = create_empty_dataframe(field_mapping)
print("Created empty DataFrame with correct columns")
verify_columns(dataframe, field_mapping)
return Leaderboard(
value=dataframe,
datatype=["str" if col not in field_mapping else field_mapping[col].type for col in dataframe.columns],
select_columns=SelectColumns(
default_selection=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].displayed_by_default],
cant_deselect=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].never_hidden],
label="Select Columns to Display:",
),
search_columns=["Model", "Hub License"],
hide_columns=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].hidden],
filter_columns=[
ColumnFilter("Type", type="checkboxgroup", label="Model types"),
ColumnFilter("Weight Format", type="checkboxgroup", label="Weight Format"),
ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
ColumnFilter(
"#Params (B)",
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
"Available on Hub", type="boolean", label="Deleted/incomplete", default=True
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
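# Gradio UI: three tabs (leaderboard, about, submission) plus a citation
# accordion, assembled inside a single Blocks context.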
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ”’ Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submit Model", elem_id="submit-tab", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
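            # Submission form: model identity and type on the left; precision,
            # weight format and base model on the right, followed by the
            # security requirements and the submit button.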
with gr.Row():
gr.Markdown("# πŸ”’ Submit Your Model for Security Evaluation", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(
label="Model name (organization/model-name)",
placeholder="huggingface/model-name"
)
revision_name_textbox = gr.Textbox(
label="Revision commit",
placeholder="main"
)
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weight Format",
multiselect=False,
value="Safetensors",
interactive=True,
)
base_model_name_textbox = gr.Textbox(
label="Base model (for delta or adapter weights)",
placeholder="Optional: base model path"
)
with gr.Row():
gr.Markdown(
"""
### Security Requirements:
1. Model weights must be in safetensors format
2. Model card must include security considerations
3. Model will be evaluated on secure coding capabilities
""",
elem_classes="markdown-text"
)
submit_button = gr.Button("Submit for Security Evaluation")
submission_result = gr.Markdown()
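            # handle_submission wraps add_new_eval and then re-reads the pending
            # queue so the refreshed table is shown alongside the status message;
            # it is wired up via submit_button.click below.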
def handle_submission(model, base_model, revision, precision, weight_type, model_type):
"""Handle new model submission."""
try:
print(f"New submission received for {model}")
# Add to queue
result = add_new_eval(model, base_model, revision, precision, weight_type, model_type)
# Update pending evaluations table
global pending_eval_queue_df
_, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
return [
gr.Markdown("Submission successful! Your model has been added to the evaluation queue. Please check the 'Pending Evaluation Queue' for status updates."),
gr.Dataframe(value=pending_eval_queue_df)
]
except Exception as e:
print(f"Submission failed: {str(e)}")
return [gr.Markdown(f"Error: {str(e)}"), gr.Dataframe(value=pending_eval_queue_df)]
            # Refresh the evaluation queue tables from disk.
            def update_evaluation_tables():
                global finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
                finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
                    EVAL_REQUESTS_PATH, EVAL_COLS
                )
                # Return plain DataFrames: the per-component .update() method is
                # not available in recent Gradio releases, and plain values work
                # as event-callback outputs across versions.
                return [finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df]
submit_button.click(
handle_submission,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
[submission_result, pending_eval_table],
)
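            # Inputs are passed to handle_submission positionally, so the
            # component order above must match its (model, base_model, revision,
            # precision, weight_type, model_type) signature.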
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# Periodically refresh the queue DataFrames in a background thread.
# Note: demo.queue() only configures Gradio's request queue; it cannot invoke a
# callback, so update_evaluation_tables() is called directly. Pushing the fresh
# values to already-connected browsers would additionally need a Gradio event
# (e.g. a timer) registered inside the Blocks context.
import threading
import time

def periodic_update():
    while True:
        time.sleep(60)  # Refresh every 60 seconds
        update_evaluation_tables()

update_thread = threading.Thread(target=periodic_update, daemon=True)
update_thread.start()
demo.queue(default_concurrency_limit=40).launch()
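# queue() enables Gradio's request queue; default_concurrency_limit=40 sets the
# default per-event concurrency limit, so up to 40 runs of any one event can
# execute at once before further requests wait in the queue.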