import os
import json
import numpy as np
import pandas as pd
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download
OWNER = "inceptionai"
DATASET_REPO_ID = f"{OWNER}/requests-dataset"
HEADER = """
<center>
<br/>
<h1>Arabic Leaderboards</h1>
<h2>Comprehensive Evaluation of Arabic Large Language Models</h2>
<br/>
<br/>
</center>
"""
ABOUT_SECTION = """
## About
In our `12-24` release, we introduced the `AraGen Benchmark` along with the `3C3H` evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H [here](https://huggingface.co/blog/leaderboard-3c3h-aragen), and the first version of the benchmark, `AraGen-12-24`, [here](https://huggingface.co/datasets/inceptionai/AraGen). Building on that foundation, and as part of this new release, we have expanded this space to incorporate additional tasks and evaluation metrics.
In this release, we present two leaderboards:
**AraGen-03-25 (v2):**
- The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region (history, politics, cuisine, etc.). By leveraging **3C3H** as an evaluation metric—which assesses a model's output across six dimensions: Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness—the leaderboard offers a comprehensive and holistic evaluation of a model's chat capabilities and its ability to generate human-like and ethically responsible content.
**Instruction Following:**
- We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.
### Why Focus on Chat Models?
Our evaluations are conducted in a generative mode, meaning that we expect models to produce complete, context-rich responses rather than simply predicting the next token as base models do. This approach not only yields results that are more explainable and nuanced compared to logit-based measurements, but it also captures elements like creativity, coherence, and ethical considerations—providing deeper insights into overall model performance.
### Contact
For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
"""
BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
CITATION_BUTTON_TEXT = """
@misc{Arabic-Leaderboards,
author = {El Filali, Ali and Albarri, Sarah and Abouelseoud, Arwa and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav},
title = {Arabic-Leaderboards: Comprehensive Evaluation of Arabic Large Language Models},
year = {2025},
publisher = {Inception},
howpublished = "\\url{https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards}"
}
"""
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite the results from all Arabic Leaderboards in this Space.
"""
def load_results():
"""
Loads the AraGen v2 results from aragen_v2_results.json and returns two dataframes:
1) df_3c3h with columns for 3C3H scores
2) df_tasks with columns for tasks scores
"""
current_dir = os.path.dirname(os.path.abspath(__file__))
results_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
with open(results_file, 'r') as f:
data = json.load(f)
# Filter out any entries that only contain '_last_sync_timestamp'
filtered_data = []
for entry in data:
if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
continue
filtered_data.append(entry)
data = filtered_data
data_3c3h = []
data_tasks = []
for model_data in data:
meta = model_data.get('Meta', {})
model_name = meta.get('Model Name', 'UNK')
revision = meta.get('Revision', 'UNK')
precision = meta.get('Precision', 'UNK')
params = meta.get('Params', 'UNK')
try:
model_size_numeric = float(params)
except (ValueError, TypeError):
model_size_numeric = np.inf
scores_data = model_data.get('claude-3.5-sonnet Scores', {})
scores_3c3h = scores_data.get('3C3H Scores', {})
scores_tasks = scores_data.get('Tasks Scores', {})
formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
data_entry_3c3h = {
'Model Name': model_name,
'Revision': revision,
'License': meta.get('License', 'UNK'),
'Precision': precision,
'Model Size': model_size_numeric,
'3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
'Conciseness': formatted_scores_3c3h.get("Conciseness", np.nan),
'Helpfulness': formatted_scores_3c3h.get("Helpfulness", np.nan),
'Honesty': formatted_scores_3c3h.get("Honesty", np.nan),
'Harmlessness': formatted_scores_3c3h.get("Harmlessness", np.nan),
}
data_3c3h.append(data_entry_3c3h)
data_entry_tasks = {
'Model Name': model_name,
'Revision': revision,
'License': meta.get('License', 'UNK'),
'Precision': precision,
'Model Size': model_size_numeric,
**formatted_scores_tasks
}
data_tasks.append(data_entry_tasks)
df_3c3h = pd.DataFrame(data_3c3h)
df_tasks = pd.DataFrame(data_tasks)
score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
max_model_size_value = 1000
df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
if '3C3H Score' in df_3c3h.columns:
df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
else:
df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
if task_columns:
df_tasks[task_columns] = df_tasks[task_columns].round(4)
df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
if task_columns:
first_task = task_columns[0]
df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
else:
df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
return df_3c3h, df_tasks, task_columns
def load_if_data():
"""
Loads the instruction-following data from ifeval_results.jsonl
and returns a dataframe with relevant columns,
converting decimal values to percentage format.
"""
current_dir = os.path.dirname(os.path.abspath(__file__))
results_file = os.path.join(current_dir, "assets", "results", "ifeval_results.jsonl")
data = []
with open(results_file, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
data.append(json.loads(line))
df = pd.DataFrame(data)
# Convert numeric columns
numeric_cols = ["En Prompt-lvl", "En Instruction-lvl", "Ar Prompt-lvl", "Ar Instruction-lvl"]
for col in numeric_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
# Compute average accuracy for En and Ar
df["Average Accuracy (En)"] = (df["En Prompt-lvl"] + df["En Instruction-lvl"]) / 2
df["Average Accuracy (Ar)"] = (df["Ar Prompt-lvl"] + df["Ar Instruction-lvl"]) / 2
# Convert them to percentage format (e.g., 0.871 -> 87.1)
for col in numeric_cols:
df[col] = (df[col] * 100).round(1)
df["Average Accuracy (En)"] = (df["Average Accuracy (En)"] * 100).round(1)
df["Average Accuracy (Ar)"] = (df["Average Accuracy (Ar)"] * 100).round(1)
# Handle size as numeric
    def parse_size(x):
        try:
            return float(x)
        except (ValueError, TypeError):
            return np.inf
df["Model Size"] = df["Size (B)"].apply(parse_size)
# Add a filter column for size
max_model_size_value = 1000
df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
# Sort by "Average Accuracy (Ar)" as an example
df = df.sort_values(by="Average Accuracy (Ar)", ascending=False)
df = df.reset_index(drop=True)
df.insert(0, "Rank", range(1, len(df) + 1))
return df
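# submit_model() validates a submission (duplicate checks against published results and against
# pending/finished requests, plus a Hub existence check) and, if everything passes, uploads a
# JSON request file to the requests dataset under
# pending/<org>/<model>_eval_request_<revision>_<precision>.json.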
def submit_model(model_name, revision, precision, params, license, modality):
df_3c3h, df_tasks, _ = load_results()
existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
if precision == 'Missing':
precision = None
else:
precision = precision.strip().lower()
df_pending = load_requests('pending')
df_finished = load_requests('finished')
model_exists_in_results = (
(existing_models_results['Model Name'] == model_name) &
(existing_models_results['Revision'] == revision) &
(existing_models_results['Precision'] == precision)
).any()
if model_exists_in_results:
return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
if not df_pending.empty:
existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
model_exists_in_pending = (
(existing_models_pending['model_name'] == model_name) &
(existing_models_pending['revision'] == revision) &
(existing_models_pending['precision'] == precision)
).any()
if model_exists_in_pending:
return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
if not df_finished.empty:
existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
model_exists_in_finished = (
(existing_models_finished['model_name'] == model_name) &
(existing_models_finished['revision'] == revision) &
(existing_models_finished['precision'] == precision)
).any()
if model_exists_in_finished:
return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
api = HfApi()
try:
_ = api.model_info(model_name)
except Exception:
return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
status = "PENDING"
submission = {
"model_name": model_name,
"license": license,
"revision": revision,
"precision": precision,
"params": params,
"status": status,
"modality": modality
}
submission_json = json.dumps(submission, indent=2)
org_model = model_name.split('/')
if len(org_model) != 2:
return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
org, model_id = org_model
precision_str = precision if precision else 'Missing'
file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
try:
hf_api_token = os.environ.get('HF_API_TOKEN', None)
api.upload_file(
path_or_fileobj=submission_json.encode('utf-8'),
path_in_repo=file_path_in_repo,
repo_id=DATASET_REPO_ID,
repo_type="dataset",
token=hf_api_token
)
except Exception as e:
return f"**Error: Could not submit the model. {str(e)}**"
return f"**Model '{model_name}' has been submitted for evaluation.**"
def load_requests(status_folder):
api = HfApi()
requests_data = []
folder_path_in_repo = status_folder
hf_api_token = os.environ.get('HF_API_TOKEN', None)
try:
files_info = api.list_repo_files(
repo_id=DATASET_REPO_ID,
repo_type="dataset",
token=hf_api_token
)
except Exception as e:
print(f"Error accessing dataset repository: {e}")
return pd.DataFrame()
files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
for file_path in files_in_folder:
try:
local_file_path = hf_hub_download(
repo_id=DATASET_REPO_ID,
filename=file_path,
repo_type="dataset",
token=hf_api_token
)
with open(local_file_path, 'r') as f:
request = json.load(f)
requests_data.append(request)
except Exception as e:
print(f"Error loading file {file_path}: {e}")
continue
df = pd.DataFrame(requests_data)
return df
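# filter_df_3c3h() re-applies the search, precision, license, and size filters on a fresh copy
# of the 3C3H leaderboard, recomputes the Rank column, and returns only the selected columns in
# a fixed display order. The "Missing" filter option maps to rows whose value is 'UNK' or NaN.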
def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
df_ = load_results()[0].copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
if search_query:
df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
if precision_filters:
include_missing = 'Missing' in precision_filters
selected_precisions = [p for p in precision_filters if p != 'Missing']
if include_missing:
df_ = df_[
(df_['Precision'].isin(selected_precisions)) |
(df_['Precision'] == 'UNK') |
(df_['Precision'].isna())
]
else:
df_ = df_[df_['Precision'].isin(selected_precisions)]
if license_filters:
include_missing = 'Missing' in license_filters
selected_licenses = [l for l in license_filters if l != 'Missing']
if include_missing:
df_ = df_[
(df_['License'].isin(selected_licenses)) |
(df_['License'] == 'UNK') |
(df_['License'].isna())
]
else:
df_ = df_[df_['License'].isin(selected_licenses)]
df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
if 'Rank' in df_.columns:
df_ = df_.drop(columns=['Rank'])
df_ = df_.reset_index(drop=True)
df_.insert(0, 'Rank', range(1, len(df_)+1))
fixed_column_order = [
"Rank",
"Model Name",
"3C3H Score",
"Correctness",
"Completeness",
"Conciseness",
"Helpfulness",
"Honesty",
"Harmlessness",
"Revision",
"License",
"Precision",
"Model Size"
]
selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
return df_[selected_cols]
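# filter_df_tasks() mirrors filter_df_3c3h() for the per-task leaderboard, except that rows are
# re-sorted by the first task column (falling back to Model Name) before ranks are reassigned.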
def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size, task_columns):
df_ = load_results()[1].copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
if search_query:
df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
if precision_filters:
include_missing = 'Missing' in precision_filters
selected_precisions = [p for p in precision_filters if p != 'Missing']
if include_missing:
df_ = df_[
(df_['Precision'].isin(selected_precisions)) |
(df_['Precision'] == 'UNK') |
(df_['Precision'].isna())
]
else:
df_ = df_[df_['Precision'].isin(selected_precisions)]
if license_filters:
include_missing = 'Missing' in license_filters
selected_licenses = [l for l in license_filters if l != 'Missing']
if include_missing:
df_ = df_[
(df_['License'].isin(selected_licenses)) |
(df_['License'] == 'UNK') |
(df_['License'].isna())
]
else:
df_ = df_[df_['License'].isin(selected_licenses)]
df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
if 'Rank' in df_.columns:
df_ = df_.drop(columns=['Rank'])
if task_columns:
first_task = task_columns[0]
df_ = df_.sort_values(by=first_task, ascending=False)
else:
df_ = df_.sort_values(by='Model Name', ascending=True)
df_ = df_.reset_index(drop=True)
df_.insert(0, 'Rank', range(1, len(df_)+1))
fixed_column_order = [
"Rank",
"Model Name",
"Question Answering (QA)",
"Orthographic and Grammatical Analysis",
"Safety",
"Reasoning",
"Revision",
"License",
"Precision",
"Model Size"
]
selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
return df_[selected_cols]
def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size):
"""
Filters the instruction-following dataframe based on various criteria.
We have removed 'Filter by Type' and 'Filter by Creator'.
"""
df_ = load_if_data().copy()
if min_size > max_size:
min_size, max_size = max_size, min_size
# Search by model name
if search_query:
df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
# Filter by Family only (Creator and Type filters removed)
if family_filters:
df_ = df_[df_['Family'].isin(family_filters)]
# Filter by Model Size
df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
# Re-rank
if 'Rank' in df_.columns:
df_ = df_.drop(columns=['Rank'])
df_ = df_.reset_index(drop=True)
df_.insert(0, 'Rank', range(1, len(df_)+1))
fixed_column_order = [
"Rank",
"Model Name",
"Creator",
"Family",
"Type",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)",
"En Prompt-lvl",
"En Instruction-lvl",
"Size (B)",
"Base Model",
"Context Window",
"Lang."
]
selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
return df_[selected_cols]
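# main() wires everything into the Gradio UI: the 3C3H, per-task, and instruction-following
# leaderboards with their filter controls, the submission form backed by submit_model(), the
# evaluation-status accordions backed by load_requests(), and the citation box.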
def main():
df_3c3h, df_tasks, task_columns = load_results()
df_if = load_if_data() # Instruction Following DF
# Setup precision/license options for the 3C3H scoreboard
precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
precision_options_3c3h.append('Missing')
license_options_3c3h = sorted(df_3c3h['License'].dropna().unique().tolist())
license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
license_options_3c3h.append('Missing')
# Setup precision/license options for tasks scoreboard
precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
precision_options_tasks.append('Missing')
license_options_tasks = sorted(df_tasks['License'].dropna().unique().tolist())
license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
license_options_tasks.append('Missing')
# Model size range for 3C3H scoreboard
min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
# Model size range for tasks scoreboard
min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
# Column choices for 3C3H
column_choices_3c3h = [col for col in df_3c3h.columns.tolist() if col != 'Model Size Filter']
# Column choices for tasks
column_choices_tasks = [col for col in df_tasks.columns.tolist() if col != 'Model Size Filter']
# Now for instruction-following
family_options_if = sorted(df_if['Family'].dropna().unique().tolist())
min_model_size_if = int(df_if['Model Size Filter'].min())
max_model_size_if = int(df_if['Model Size Filter'].max())
#
# IMPORTANT: Reorder the columns for the Instruction-Following leaderboard
# Define the full order and the default visible columns separately.
#
all_if_columns = [
"Rank",
"Model Name",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)",
"En Prompt-lvl",
"En Instruction-lvl",
"Type",
"Creator",
"Family",
"Size (B)",
"Base Model",
"Context Window",
"Lang."
]
default_if_columns = [
"Rank",
"Model Name",
"Average Accuracy (Ar)",
"Ar Prompt-lvl",
"Ar Instruction-lvl",
"Average Accuracy (En)"
]
with gr.Blocks() as demo:
gr.HTML(HEADER)
with gr.Tabs():
#
# AL Leaderboards Tab
#
with gr.Tab("AL Leaderboards 🏅"):
# -------------------------
# Sub-Tab: AraGen Leaderboards
# -------------------------
with gr.Tab("🐪 AraGen Leaderboards"):
with gr.Tabs():
# 3C3H Scores
with gr.Tab("3C3H Scores"):
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_3c3h = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True
)
with gr.Row():
column_selector_3c3h = gr.CheckboxGroup(
choices=column_choices_3c3h,
value=[
'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
],
label="Select columns to display"
)
with gr.Row():
license_filter_3c3h = gr.CheckboxGroup(
choices=license_options_3c3h,
value=license_options_3c3h.copy(),
label="Filter by License"
)
precision_filter_3c3h = gr.CheckboxGroup(
choices=precision_options_3c3h,
value=precision_options_3c3h.copy(),
label="Filter by Precision"
)
with gr.Row():
model_size_min_filter_3c3h = gr.Slider(
minimum=min_model_size_3c3h,
maximum=max_model_size_3c3h,
value=min_model_size_3c3h,
step=1,
label="Minimum Model Size",
interactive=True
)
model_size_max_filter_3c3h = gr.Slider(
minimum=min_model_size_3c3h,
maximum=max_model_size_3c3h,
value=max_model_size_3c3h,
step=1,
label="Maximum Model Size",
interactive=True
)
leaderboard_3c3h = gr.Dataframe(
df_3c3h[[
'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
]],
interactive=False
)
filter_inputs_3c3h = [
search_box_3c3h, column_selector_3c3h,
precision_filter_3c3h, license_filter_3c3h,
model_size_min_filter_3c3h, model_size_max_filter_3c3h
]
search_box_3c3h.submit(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
for component in filter_inputs_3c3h:
component.change(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
# Tasks Scores
with gr.Tab("Tasks Scores"):
gr.Markdown("This Table is sorted based on the First Task (Question Answering)")
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_tasks = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True
)
with gr.Row():
column_selector_tasks = gr.CheckboxGroup(
choices=column_choices_tasks,
value=['Rank', 'Model Name'] + task_columns,
label="Select columns to display"
)
with gr.Row():
license_filter_tasks = gr.CheckboxGroup(
choices=license_options_tasks,
value=license_options_tasks.copy(),
label="Filter by License"
)
precision_filter_tasks = gr.CheckboxGroup(
choices=precision_options_tasks,
value=precision_options_tasks.copy(),
label="Filter by Precision"
)
with gr.Row():
model_size_min_filter_tasks = gr.Slider(
minimum=min_model_size_tasks,
maximum=max_model_size_tasks,
value=min_model_size_tasks,
step=1,
label="Minimum Model Size",
interactive=True
)
model_size_max_filter_tasks = gr.Slider(
minimum=min_model_size_tasks,
maximum=max_model_size_tasks,
value=max_model_size_tasks,
step=1,
label="Maximum Model Size",
interactive=True
)
leaderboard_tasks = gr.Dataframe(
df_tasks[['Rank', 'Model Name'] + task_columns],
interactive=False
)
filter_inputs_tasks = [
search_box_tasks, column_selector_tasks,
precision_filter_tasks, license_filter_tasks,
model_size_min_filter_tasks, model_size_max_filter_tasks
]
search_box_tasks.submit(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
inputs=filter_inputs_tasks,
outputs=leaderboard_tasks
)
for component in filter_inputs_tasks:
component.change(
lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
inputs=filter_inputs_tasks,
outputs=leaderboard_tasks
)
# -------------------------
# Sub-Tab: Instruction Following Leaderboard
# -------------------------
with gr.Tab("🗡️ Instruction Following Leaderboard"):
with gr.Accordion("⚙️ Filters", open=False):
with gr.Row():
search_box_if = gr.Textbox(
placeholder="Search for models...",
label="Search",
interactive=True
)
with gr.Row():
column_selector_if = gr.CheckboxGroup(
choices=all_if_columns,
value=default_if_columns,
label="Select columns to display"
)
with gr.Row():
family_filter_if = gr.CheckboxGroup(
choices=family_options_if,
value=family_options_if.copy(),
label="Filter by Family"
)
with gr.Row():
model_size_min_filter_if = gr.Slider(
minimum=min_model_size_if,
maximum=max_model_size_if,
value=min_model_size_if,
step=1,
label="Minimum Model Size",
interactive=True
)
model_size_max_filter_if = gr.Slider(
minimum=min_model_size_if,
maximum=max_model_size_if,
value=max_model_size_if,
step=1,
label="Maximum Model Size",
interactive=True
)
leaderboard_if = gr.Dataframe(
df_if[default_if_columns],
interactive=False
)
filter_inputs_if = [
search_box_if, column_selector_if,
family_filter_if,
model_size_min_filter_if, model_size_max_filter_if
]
search_box_if.submit(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
for component in filter_inputs_if:
component.change(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
#
# Submit Tab
#
with gr.Tab("Submit Here 📝"):
df_pending = load_requests('pending')
df_finished = load_requests('finished')
df_failed = load_requests('failed')
gr.Markdown(ABOUT_SECTION)
gr.Markdown("## Submit Your Model for Evaluation")
with gr.Column():
model_name_input = gr.Textbox(
label="Model Name",
placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
)
revision_input = gr.Textbox(label="Revision", placeholder="main", value="main")
precision_input = gr.Dropdown(
choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
label="Precision",
value="float16"
)
params_input = gr.Textbox(
label="Params",
placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
)
license_input = gr.Textbox(
label="License",
placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
value="Open"
)
modality_input = gr.Radio(
choices=["Text"],
label="Modality",
value="Text"
)
submit_button = gr.Button("Submit Model")
submission_result = gr.Markdown()
submit_button.click(
submit_model,
inputs=[
model_name_input, revision_input, precision_input,
params_input, license_input, modality_input
],
outputs=submission_result
)
gr.Markdown("## Evaluation Status")
with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
if not df_pending.empty:
gr.Dataframe(df_pending)
else:
gr.Markdown("No pending evaluations.")
with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
if not df_finished.empty:
gr.Dataframe(df_finished)
else:
gr.Markdown("No finished evaluations.")
with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
if not df_failed.empty:
gr.Dataframe(df_failed)
else:
gr.Markdown("No failed evaluations.")
# Citation Section
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=8,
elem_id="citation-button",
show_copy_button=True
)
gr.HTML(BOTTOM_LOGO)
demo.launch()
if __name__ == "__main__":
main()