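"""Gradio Space for the LLM security leaderboard.

Builds the leaderboard table from local evaluation results, shows the
finished / running / pending evaluation queues, and exposes a submission
form that feeds new models into the evaluation queue via add_new_eval.
"""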
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.leaderboard.security_eval import check_safetensors
# Hugging Face Hub downloads are skipped for local testing; the leaderboard and
# queue data are read from the local EVAL_RESULTS_PATH / EVAL_REQUESTS_PATH.
print("Creating leaderboard DataFrame...")
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
print(f"LEADERBOARD_DF shape: {LEADERBOARD_DF.shape}")
print(f"LEADERBOARD_DF columns: {LEADERBOARD_DF.columns.tolist()}")
print(f"LEADERBOARD_DF data:\n{LEADERBOARD_DF}")
print("\nGetting evaluation queue DataFrames...")
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
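# The three queue DataFrames back the accordion tables in the "Submit Model"
# tab and are refreshed by update_evaluation_tables() further below.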
def get_field_mapping():
"""Create a mapping from display names to field names."""
auto_eval_fields = fields(AutoEvalColumn)
return {f.name: f for f in auto_eval_fields}
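# The mapping is keyed by display/column name; each value is the column
# descriptor whose .type, .displayed_by_default, .never_hidden and .hidden
# attributes drive the Leaderboard configuration in init_leaderboard().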
def create_empty_dataframe(field_mapping):
"""Create an empty DataFrame with the correct columns."""
import pandas as pd
return pd.DataFrame(columns=[f.name for f in field_mapping.values()])
def verify_columns(dataframe, field_mapping):
"""Verify all required columns are present."""
for col in dataframe.columns:
if col not in field_mapping:
print(f"Warning: Column {col} not found in field mapping")
def init_leaderboard(dataframe):
print(f"Initializing leaderboard with DataFrame shape: {dataframe.shape}")
field_mapping = get_field_mapping()
print(f"Field mapping: {field_mapping}")
if dataframe is None or len(dataframe) == 0:
dataframe = create_empty_dataframe(field_mapping)
print("Created empty DataFrame with correct columns")
verify_columns(dataframe, field_mapping)
return Leaderboard(
value=dataframe,
datatype=["str" if col not in field_mapping else field_mapping[col].type for col in dataframe.columns],
select_columns=SelectColumns(
default_selection=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].displayed_by_default],
cant_deselect=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].never_hidden],
label="Select Columns to Display:",
),
search_columns=["Model", "Hub License"],
hide_columns=[col for col in dataframe.columns if col in field_mapping and field_mapping[col].hidden],
filter_columns=[
ColumnFilter("Type", type="checkboxgroup", label="Model types"),
ColumnFilter("Weight Format", type="checkboxgroup", label="Weight Format"),
ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
ColumnFilter(
"#Params (B)",
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
"Available on Hub", type="boolean", label="Deleted/incomplete", default=True
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
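# Gradio UI: three tabs (leaderboard, about, submission) plus a citation
# accordion, assembled inside a single Blocks context.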
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ”’ Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submit Model", elem_id="submit-tab", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
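            # Submission form: model identity and type on the left; precision,
            # weight format and base model on the right, followed by the
            # security requirements and the submit button.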
with gr.Row():
gr.Markdown("# πŸ”’ Submit Your Model for Security Evaluation", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(
label="Model name (organization/model-name)",
placeholder="huggingface/model-name"
)
revision_name_textbox = gr.Textbox(
label="Revision commit",
placeholder="main"
)
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weight Format",
multiselect=False,
value="Safetensors",
interactive=True,
)
base_model_name_textbox = gr.Textbox(
label="Base model (for delta or adapter weights)",
placeholder="Optional: base model path"
)
with gr.Row():
gr.Markdown(
"""
### Security Requirements:
1. Model weights must be in safetensors format
2. Model card must include security considerations
3. Model will be evaluated on secure coding capabilities
""",
elem_classes="markdown-text"
)
submit_button = gr.Button("Submit for Security Evaluation")
submission_result = gr.Markdown()
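            # handle_submission wraps add_new_eval and then re-reads the pending
            # queue so the refreshed table is shown alongside the status message;
            # it is wired up via submit_button.click below.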
def handle_submission(model, base_model, revision, precision, weight_type, model_type):
"""Handle new model submission."""
try:
print(f"New submission received for {model}")
# Add to queue
result = add_new_eval(model, base_model, revision, precision, weight_type, model_type)
# Update pending evaluations table
global pending_eval_queue_df
_, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
return [
gr.Markdown("Submission successful! Your model has been added to the evaluation queue. Please check the 'Pending Evaluation Queue' for status updates."),
gr.Dataframe(value=pending_eval_queue_df)
]
except Exception as e:
print(f"Submission failed: {str(e)}")
return [gr.Markdown(f"Error: {str(e)}"), gr.Dataframe(value=pending_eval_queue_df)]
            # Refresh the evaluation queue tables from disk.
            def update_evaluation_tables():
                global finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
                finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
                    EVAL_REQUESTS_PATH, EVAL_COLS
                )
                # Return plain DataFrames: the per-component .update() method is
                # not available in recent Gradio releases, and plain values work
                # as event-callback outputs across versions.
                return [finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df]
submit_button.click(
handle_submission,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
[submission_result, pending_eval_table],
)
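            # Inputs are passed to handle_submission positionally, so the
            # component order above must match its (model, base_model, revision,
            # precision, weight_type, model_type) signature.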
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# Periodically refresh the queue DataFrames in a background thread.
# Note: demo.queue() only configures Gradio's request queue; it cannot invoke a
# callback, so update_evaluation_tables() is called directly. Pushing the fresh
# values to already-connected browsers would additionally need a Gradio event
# (e.g. a timer) registered inside the Blocks context.
import threading
import time

def periodic_update():
    while True:
        time.sleep(60)  # Refresh every 60 seconds
        update_evaluation_tables()

update_thread = threading.Thread(target=periodic_update, daemon=True)
update_thread.start()
demo.queue(default_concurrency_limit=40).launch()
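# queue() enables Gradio's request queue; default_concurrency_limit=40 sets the
# default per-event concurrency limit, so up to 40 runs of any one event can
# execute at once before further requests wait in the queue.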