alielfilali01 committed
Commit dfa1e52 · verified · 1 Parent(s): ad45142

Update app.py

Files changed (1)
  1. app.py +571 -406
app.py CHANGED
@@ -9,105 +9,107 @@ from huggingface_hub import HfApi, hf_hub_download
9
  OWNER = "inceptionai"
10
  DATASET_REPO_ID = f"{OWNER}/requests-dataset"
11
 
 
12
  HEADER = """
13
  <center>
14
- <h1>AraGen Leaderboard: Generative Tasks Evaluation of Arabic LLMs</h1>
15
- </center>
16
-
17
  <br></br>
18
-
19
- <p>This leaderboard introduces generative tasks evaluation for Arabic Large Language Models (LLMs). Powered by the new <strong>3C3H</strong> evaluation measure, this framework delivers a transparent, robust, and holistic evaluation system that balances factual accuracy and usability assessment for a production ready setting.</p>
20
-
21
- <p>For more details, please consider going through the technical blogpost <a href="https://huggingface.co/blog/leaderboard-3c3h-aragen">here</a>.</p>
 
22
  """
23
 
24
  ABOUT_SECTION = """
25
  ## About
26
 
27
- The AraGen Leaderboard is designed to evaluate and compare the performance of Chat Arabic Large Language Models (LLMs) on a set of generative tasks. By leveraging the new **3C3H** evaluation measure which evaluate the model's output across six dimensions —Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness— the leaderboard provides a comprehensive and holistic evaluation of a model's performance in generating human-like and ethically responsible content.
28
 
29
- ### Why Focus on Chat Models?
 
 
30
 
31
- AraGen Leaderboard —And 3C3H in general— is specifically designed to assess **chat models**, which interact in conversational settings, intended for end user interaction and require a blend of factual accuracy and user-centric dialogue capabilities. While it is technically possible to submit foundational models, we kindly ask users to refrain from doing so. For evaluations of foundational models using likelihood accuracy based benchmarks, please refer to the [Open Arabic LLM Leaderboard (OALL)](https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard).
32
 
33
- ### How to Submit Your Model?
34
 
35
- Navigate to the submission section below to submit your open chat model from the HuggingFace Hub for evaluation. Ensure that your model is public and the submmited metadata (precision, revision, #params) is accurate.
36
 
37
  ### Contact
38
 
39
- For any inquiries or assistance, feel free to reach out through the community tab at [Inception AraGen Community](https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard/discussions) or via [email](mailto:[email protected]).
40
  """
41
 
42
- CITATION_BUTTON_LABEL = """
43
- Copy the following snippet to cite these results
44
- """
45
 
46
  CITATION_BUTTON_TEXT = """
47
- @misc{AraGen,
48
- author = {El Filali, Ali and Sengupta, Neha and Abouelseoud, Arwa and Nakov, Preslav and Fourrier, Clémentine},
49
- title = {Rethinking LLM Evaluation with 3C3H: AraGen Benchmark and Leaderboard},
50
- year = {2024},
51
  publisher = {Inception},
52
- howpublished = "url{https://huggingface.co/spaces/inceptionai/AraGen-Leaderboard}"
53
  }
54
  """
55
 
56
 
57
  def load_results():
58
- # Get the current directory of the script and construct the path to results.json
59
  current_dir = os.path.dirname(os.path.abspath(__file__))
60
- results_file = os.path.join(current_dir, "assets", "results", "results.json")
61
 
62
- # Load the JSON data from the specified file
63
  with open(results_file, 'r') as f:
64
  data = json.load(f)
65
 
66
  # Filter out any entries that only contain '_last_sync_timestamp'
67
  filtered_data = []
68
  for entry in data:
69
- # If '_last_sync_timestamp' is the only key, skip it
70
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
71
  continue
72
  filtered_data.append(entry)
73
 
74
  data = filtered_data
75
 
76
- # Lists to collect data
77
  data_3c3h = []
78
  data_tasks = []
79
 
80
  for model_data in data:
81
- # Extract model meta data
82
  meta = model_data.get('Meta', {})
83
  model_name = meta.get('Model Name', 'UNK')
84
  revision = meta.get('Revision', 'UNK')
85
  precision = meta.get('Precision', 'UNK')
86
  params = meta.get('Params', 'UNK')
87
- license = meta.get('License', 'UNK')
88
 
89
- # Convert "Model Size" to numeric, treating "UNK" as infinity
90
  try:
91
  model_size_numeric = float(params)
92
  except (ValueError, TypeError):
93
  model_size_numeric = np.inf
94
 
95
- # 3C3H Scores
96
  scores_data = model_data.get('claude-3.5-sonnet Scores', {})
97
  scores_3c3h = scores_data.get('3C3H Scores', {})
98
  scores_tasks = scores_data.get('Tasks Scores', {})
99
 
100
- # Multiply scores by 100 to get percentages (keep them as numeric values)
101
  formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
102
  formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
103
 
104
- # For 3C3H Scores DataFrame
105
  data_entry_3c3h = {
106
  'Model Name': model_name,
107
  'Revision': revision,
108
- 'License': license,
109
  'Precision': precision,
110
- 'Model Size': model_size_numeric, # Numeric value for sorting
111
  '3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
112
  'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
113
  'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
@@ -118,13 +120,12 @@ def load_results():
118
  }
119
  data_3c3h.append(data_entry_3c3h)
120
 
121
- # For Tasks Scores DataFrame
122
  data_entry_tasks = {
123
  'Model Name': model_name,
124
  'Revision': revision,
125
- 'License': license,
126
  'Precision': precision,
127
- 'Model Size': model_size_numeric, # Numeric value for sorting
128
  **formatted_scores_tasks
129
  }
130
  data_tasks.append(data_entry_tasks)
@@ -132,147 +133,148 @@ def load_results():
132
  df_3c3h = pd.DataFrame(data_3c3h)
133
  df_tasks = pd.DataFrame(data_tasks)
134
 
135
- # Round the numeric score columns to 4 decimal places
136
  score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
137
  df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
138
 
139
- # Replace np.inf with a large number in 'Model Size Filter' for filtering
140
- max_model_size_value = 1000 # Define a maximum value
141
  df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
142
 
143
- # Sort df_3c3h by '3C3H Score' descending if column exists
144
  if '3C3H Score' in df_3c3h.columns:
145
  df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
146
- df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1)) # Add Rank column starting from 1
147
  else:
148
  df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
149
 
150
- # Extract task columns
151
  task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
152
-
153
- # Round the task score columns to 4 decimal places
154
  if task_columns:
155
  df_tasks[task_columns] = df_tasks[task_columns].round(4)
156
 
157
- # Replace np.inf with a large number in 'Model Size Filter' for filtering
158
  df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
159
 
160
- # Sort df_tasks by the first task column if it exists
161
  if task_columns:
162
  first_task = task_columns[0]
163
  df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
164
- df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1)) # Add Rank column starting from 1
165
  else:
166
  df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
167
  df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
168
 
169
  return df_3c3h, df_tasks, task_columns
170
 
171
- def load_requests(status_folder):
172
- api = HfApi()
173
- requests_data = []
174
- folder_path_in_repo = status_folder # 'pending', 'finished', or 'failed'
175
-
176
- hf_api_token = os.environ.get('HF_API_TOKEN', None)
177
 
178
- try:
179
- # List files in the dataset repository
180
- files_info = api.list_repo_files(
181
- repo_id=DATASET_REPO_ID,
182
- repo_type="dataset",
183
- token=hf_api_token
184
- )
185
- except Exception as e:
186
- print(f"Error accessing dataset repository: {e}")
187
- return pd.DataFrame() # Return empty DataFrame if repository not found or inaccessible
188
-
189
- # Filter files in the desired folder
190
- files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
191
-
192
- for file_path in files_in_folder:
 
193
  try:
194
- # Download the JSON file
195
- local_file_path = hf_hub_download(
196
- repo_id=DATASET_REPO_ID,
197
- filename=file_path,
198
- repo_type="dataset",
199
- token=hf_api_token
200
- )
201
- # Load JSON data
202
- with open(local_file_path, 'r') as f:
203
- request = json.load(f)
204
- requests_data.append(request)
205
- except Exception as e:
206
- print(f"Error loading file {file_path}: {e}")
207
- continue # Skip files that can't be loaded
208
-
209
- df = pd.DataFrame(requests_data)
210
  return df
211
 
212
- def submit_model(model_name, revision, precision, params, license):
213
- # Load existing evaluations
214
  df_3c3h, df_tasks, _ = load_results()
215
  existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
216
 
217
- # Handle 'Missing' precision
218
  if precision == 'Missing':
219
  precision = None
220
  else:
221
  precision = precision.strip().lower()
222
 
223
- # Load pending and finished requests from the dataset repository
224
  df_pending = load_requests('pending')
225
  df_finished = load_requests('finished')
226
 
227
- # Check if model is already evaluated
228
- model_exists_in_results = ((existing_models_results['Model Name'] == model_name) &
229
- (existing_models_results['Revision'] == revision) &
230
- (existing_models_results['Precision'] == precision)).any()
 
231
  if model_exists_in_results:
232
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
233
 
234
- # Check if model is in pending requests
235
  if not df_pending.empty:
236
  existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
237
- model_exists_in_pending = ((existing_models_pending['model_name'] == model_name) &
238
- (existing_models_pending['revision'] == revision) &
239
- (existing_models_pending['precision'] == precision)).any()
 
 
240
  if model_exists_in_pending:
241
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
242
 
243
- # Check if model is in finished requests
244
  if not df_finished.empty:
245
  existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
246
- model_exists_in_finished = ((existing_models_finished['model_name'] == model_name) &
247
- (existing_models_finished['revision'] == revision) &
248
- (existing_models_finished['precision'] == precision)).any()
 
 
249
  if model_exists_in_finished:
250
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
251
 
252
- # Check if model exists on HuggingFace Hub
253
  api = HfApi()
254
  try:
255
- model_info = api.model_info(model_name)
256
- except Exception as e:
257
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
258
 
259
- # Proceed with submission
260
  status = "PENDING"
261
-
262
- # Prepare the submission data
263
  submission = {
264
  "model_name": model_name,
265
  "license": license,
266
  "revision": revision,
267
  "precision": precision,
 
268
  "status": status,
269
- "params": params
270
  }
271
-
272
- # Serialize the submission to JSON
273
  submission_json = json.dumps(submission, indent=2)
274
 
275
- # Define the file path in the repository
276
  org_model = model_name.split('/')
277
  if len(org_model) != 2:
278
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
@@ -280,7 +282,6 @@ def submit_model(model_name, revision, precision, params, license):
280
  precision_str = precision if precision else 'Missing'
281
  file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
282
 
283
- # Upload the submission to the dataset repository
284
  try:
285
  hf_api_token = os.environ.get('HF_API_TOKEN', None)
286
  api.upload_file(
@@ -295,10 +296,207 @@ def submit_model(model_name, revision, precision, params, license):
295
 
296
  return f"**Model '{model_name}' has been submitted for evaluation.**"
297
298
  def main():
299
  df_3c3h, df_tasks, task_columns = load_results()
 
300
 
301
- # Extract unique Precision and License values for filters
302
  precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
303
  precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
304
  precision_options_3c3h.append('Missing')
@@ -307,6 +505,7 @@ def main():
307
  license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
308
  license_options_3c3h.append('Missing')
309
 
 
310
  precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
311
  precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
312
  precision_options_tasks.append('Missing')
@@ -315,361 +514,327 @@ def main():
315
  license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
316
  license_options_tasks.append('Missing')
317
 
318
- # Get min and max model sizes for sliders, handling 'inf' values
319
  min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
320
  max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
321
 
 
322
  min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
323
  max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
324
 
325
- # Exclude 'Model Size Filter' from column selectors
326
- column_choices_3c3h = [col for col in df_3c3h.columns if col != 'Model Size Filter']
327
- column_choices_tasks = [col for col in df_tasks.columns if col != 'Model Size Filter']
328
-
329
  with gr.Blocks() as demo:
330
  gr.HTML(HEADER)
331
-
332
  with gr.Tabs():
333
- with gr.Tab("Leaderboard"):
334
- with gr.Tabs():
335
- with gr.Tab("3C3H Scores"):
336
- with gr.Row():
337
- search_box_3c3h = gr.Textbox(
338
- placeholder="Search for models...",
339
- label="Search",
340
- interactive=True
341
- )
342
- with gr.Row():
343
- column_selector_3c3h = gr.CheckboxGroup(
344
- choices=column_choices_3c3h,
345
- value=[
346
  'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
347
  'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
348
- ],
349
- label="Select columns to display",
350
- )
351
- with gr.Row():
352
- license_filter_3c3h = gr.CheckboxGroup(
353
- choices=license_options_3c3h,
354
- value=license_options_3c3h.copy(), # Default all selected
355
- label="Filter by License",
356
- )
357
- precision_filter_3c3h = gr.CheckboxGroup(
358
- choices=precision_options_3c3h,
359
- value=precision_options_3c3h.copy(), # Default all selected
360
- label="Filter by Precision",
361
- )
362
- with gr.Row():
363
- model_size_min_filter_3c3h = gr.Slider(
364
- minimum=min_model_size_3c3h,
365
- maximum=max_model_size_3c3h,
366
- value=min_model_size_3c3h,
367
- step=1,
368
- label="Minimum Model Size",
369
- interactive=True
370
  )
371
- model_size_max_filter_3c3h = gr.Slider(
372
- minimum=min_model_size_3c3h,
373
- maximum=max_model_size_3c3h,
374
- value=max_model_size_3c3h,
375
- step=1,
376
- label="Maximum Model Size",
377
- interactive=True
378
  )
379
-
380
- leaderboard_3c3h = gr.Dataframe(
381
- df_3c3h[['Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
382
- 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']],
383
- interactive=False
384
- )
385
-
386
- def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
387
- filtered_df = df_3c3h.copy()
388
-
389
- # Ensure min_size <= max_size
390
- if min_size > max_size:
391
- min_size, max_size = max_size, min_size
392
-
393
- # Apply search filter
394
- if search_query:
395
- filtered_df = filtered_df[filtered_df['Model Name'].str.contains(search_query, case=False, na=False)]
396
-
397
- # Apply Precision filter
398
- if precision_filters:
399
- include_missing = 'Missing' in precision_filters
400
- selected_precisions = [p for p in precision_filters if p != 'Missing']
401
- if include_missing:
402
- filtered_df = filtered_df[
403
- (filtered_df['Precision'].isin(selected_precisions)) |
404
- (filtered_df['Precision'] == 'UNK') |
405
- (filtered_df['Precision'].isna())
406
- ]
407
- else:
408
- filtered_df = filtered_df[filtered_df['Precision'].isin(selected_precisions)]
409
-
410
- # Apply License filter
411
- if license_filters:
412
- include_missing = 'Missing' in license_filters
413
- selected_licenses = [l for l in license_filters if l != 'Missing']
414
- if include_missing:
415
- filtered_df = filtered_df[
416
- (filtered_df['License'].isin(selected_licenses)) |
417
- (filtered_df['License'] == 'UNK') |
418
- (filtered_df['License'].isna())
419
- ]
420
- else:
421
- filtered_df = filtered_df[filtered_df['License'].isin(selected_licenses)]
422
-
423
- # Apply Model Size filter
424
- filtered_df = filtered_df[
425
- (filtered_df['Model Size Filter'] >= min_size) &
426
- (filtered_df['Model Size Filter'] <= max_size)
427
  ]
428
-
429
- # Remove existing 'Rank' column if present
430
- if 'Rank' in filtered_df.columns:
431
- filtered_df = filtered_df.drop(columns=['Rank'])
432
-
433
- # Recalculate Rank after filtering
434
- filtered_df = filtered_df.reset_index(drop=True)
435
- filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1))
436
-
437
- # Ensure selected columns are present
438
- selected_cols = [col for col in selected_cols if col in filtered_df.columns]
439
-
440
- return filtered_df[selected_cols]
441
-
442
- # Bind the filter function to the appropriate events
443
- filter_inputs_3c3h = [
444
- search_box_3c3h,
445
- column_selector_3c3h,
446
- precision_filter_3c3h,
447
- license_filter_3c3h,
448
- model_size_min_filter_3c3h,
449
- model_size_max_filter_3c3h
450
- ]
451
- search_box_3c3h.submit(
452
- filter_df_3c3h,
453
- inputs=filter_inputs_3c3h,
454
- outputs=leaderboard_3c3h
455
- )
456
-
457
- # Bind change events for CheckboxGroups and sliders
458
- for component in filter_inputs_3c3h:
459
- component.change(
460
- filter_df_3c3h,
461
- inputs=filter_inputs_3c3h,
462
- outputs=leaderboard_3c3h
463
  )
464
-
465
- with gr.Tab("Tasks Scores"):
466
- gr.Markdown("""
467
- Note: This Table is sorted based on the First Task (Question Answering)
468
- """)
469
-
470
  with gr.Row():
471
- search_box_tasks = gr.Textbox(
472
  placeholder="Search for models...",
473
  label="Search",
474
  interactive=True
475
  )
476
  with gr.Row():
477
- column_selector_tasks = gr.CheckboxGroup(
478
- choices=column_choices_tasks,
479
- value=['Rank', 'Model Name'] + task_columns,
480
- label="Select columns to display",
481
  )
482
  with gr.Row():
483
- license_filter_tasks = gr.CheckboxGroup(
484
- choices=license_options_tasks,
485
- value=license_options_tasks.copy(), # Default all selected
486
- label="Filter by License",
487
- )
488
- precision_filter_tasks = gr.CheckboxGroup(
489
- choices=precision_options_tasks,
490
- value=precision_options_tasks.copy(), # Default all selected
491
- label="Filter by Precision",
492
  )
493
  with gr.Row():
494
- model_size_min_filter_tasks = gr.Slider(
495
- minimum=min_model_size_tasks,
496
- maximum=max_model_size_tasks,
497
- value=min_model_size_tasks,
498
  step=1,
499
  label="Minimum Model Size",
500
  interactive=True
501
  )
502
- model_size_max_filter_tasks = gr.Slider(
503
- minimum=min_model_size_tasks,
504
- maximum=max_model_size_tasks,
505
- value=max_model_size_tasks,
506
  step=1,
507
  label="Maximum Model Size",
508
  interactive=True
509
  )
510
-
511
- leaderboard_tasks = gr.Dataframe(
512
- df_tasks[['Rank', 'Model Name'] + task_columns],
513
- interactive=False
514
- )
515
-
516
- def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
517
- filtered_df = df_tasks.copy()
518
-
519
- # Ensure min_size <= max_size
520
- if min_size > max_size:
521
- min_size, max_size = max_size, min_size
522
-
523
- # Apply search filter
524
- if search_query:
525
- filtered_df = filtered_df[filtered_df['Model Name'].str.contains(search_query, case=False, na=False)]
526
-
527
- # Apply Precision filter
528
- if precision_filters:
529
- include_missing = 'Missing' in precision_filters
530
- selected_precisions = [p for p in precision_filters if p != 'Missing']
531
- if include_missing:
532
- filtered_df = filtered_df[
533
- (filtered_df['Precision'].isin(selected_precisions)) |
534
- (filtered_df['Precision'] == 'UNK') |
535
- (filtered_df['Precision'].isna())
536
- ]
537
- else:
538
- filtered_df = filtered_df[filtered_df['Precision'].isin(selected_precisions)]
539
-
540
- # Apply License filter
541
- if license_filters:
542
- include_missing = 'Missing' in license_filters
543
- selected_licenses = [l for l in license_filters if l != 'Missing']
544
- if include_missing:
545
- filtered_df = filtered_df[
546
- (filtered_df['License'].isin(selected_licenses)) |
547
- (filtered_df['License'] == 'UNK') |
548
- (filtered_df['License'].isna())
549
- ]
550
- else:
551
- filtered_df = filtered_df[filtered_df['License'].isin(selected_licenses)]
552
-
553
- # Apply Model Size filter
554
- filtered_df = filtered_df[
555
- (filtered_df['Model Size Filter'] >= min_size) &
556
- (filtered_df['Model Size Filter'] <= max_size)
557
- ]
558
-
559
- # Remove existing 'Rank' column if present
560
- if 'Rank' in filtered_df.columns:
561
- filtered_df = filtered_df.drop(columns=['Rank'])
562
-
563
- # Sort by the first task column if it exists
564
- if task_columns:
565
- first_task = task_columns[0]
566
- filtered_df = filtered_df.sort_values(by=first_task, ascending=False)
567
- else:
568
- filtered_df = filtered_df.sort_values(by='Model Name', ascending=True)
569
-
570
- # Recalculate Rank after filtering
571
- filtered_df = filtered_df.reset_index(drop=True)
572
- filtered_df.insert(0, 'Rank', range(1, len(filtered_df) + 1))
573
-
574
- # Ensure selected columns are present
575
- selected_cols = [col for col in selected_cols if col in filtered_df.columns]
576
-
577
- return filtered_df[selected_cols]
578
-
579
- # Bind the filter function to the appropriate events
580
- filter_inputs_tasks = [
581
- search_box_tasks,
582
- column_selector_tasks,
583
- precision_filter_tasks,
584
- license_filter_tasks,
585
- model_size_min_filter_tasks,
586
- model_size_max_filter_tasks
587
- ]
588
- search_box_tasks.submit(
589
- filter_df_tasks,
590
- inputs=filter_inputs_tasks,
591
- outputs=leaderboard_tasks
592
- )
593
-
594
- # Bind change events for CheckboxGroups and sliders
595
- for component in filter_inputs_tasks:
596
- component.change(
597
- filter_df_tasks,
598
- inputs=filter_inputs_tasks,
599
- outputs=leaderboard_tasks
600
- )
601
-
602
- with gr.Tab("Submit Here"):
603
  gr.Markdown(ABOUT_SECTION)
604
- gr.Markdown("---")
605
- gr.Markdown("# Submit Your Model for Evaluation")
606
  with gr.Column():
607
  model_name_input = gr.Textbox(
608
- label="Model Name",
609
  placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
610
  )
611
- revision_input = gr.Textbox(
612
- label="Revision",
613
- placeholder="main",
614
- value="main"
615
- )
616
  precision_input = gr.Dropdown(
617
- choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
618
  label="Precision",
619
  value="float16"
620
  )
621
  params_input = gr.Textbox(
622
- label="Params",
623
  placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
624
  )
625
- # Changed from Dropdown to Textbox with default value "Open"
626
  license_input = gr.Textbox(
627
- label="License",
628
- placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
629
  value="Open"
630
  )
631
  submit_button = gr.Button("Submit Model")
632
  submission_result = gr.Markdown()
633
-
634
  submit_button.click(
635
  submit_model,
636
- inputs=[model_name_input, revision_input, precision_input, params_input, license_input],
637
  outputs=submission_result
638
  )
639
-
640
- # Load pending, finished, and failed requests
641
- df_pending = load_requests('pending')
642
- df_finished = load_requests('finished')
643
- df_failed = load_requests('failed')
644
-
645
- # Display the tables
646
- gr.Markdown("## Evaluation Status of Open Models from the 🤗 Hub")
647
- with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
648
- if not df_pending.empty:
649
- gr.Dataframe(df_pending)
650
- else:
651
- gr.Markdown("No pending evaluations.")
652
- with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
653
- if not df_finished.empty:
654
- gr.Dataframe(df_finished)
655
- else:
656
- gr.Markdown("No finished evaluations.")
657
- with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
658
- if not df_failed.empty:
659
- gr.Dataframe(df_failed)
660
- else:
661
- gr.Markdown("No failed evaluations.")
662
  with gr.Row():
663
  with gr.Accordion("📙 Citation", open=False):
664
  citation_button = gr.Textbox(
665
  value=CITATION_BUTTON_TEXT,
666
  label=CITATION_BUTTON_LABEL,
667
- lines=20,
668
  elem_id="citation-button",
669
- show_copy_button=True,
670
  )
671
 
672
- demo.launch()
673
 
674
  if __name__ == "__main__":
675
  main()
 
9
  OWNER = "inceptionai"
10
  DATASET_REPO_ID = f"{OWNER}/requests-dataset"
11
 
12
+
13
  HEADER = """
14
  <center>
 
 
 
15
  <br></br>
16
+ <h1>Arabic Leaderboards</h1>
17
+ <h2>Comprehensive Evaluation of Arabic Large Language Models</h2>
18
+ <br></br>
19
+ <br></br>
20
+ </center>
21
  """
22
 
23
  ABOUT_SECTION = """
24
  ## About
25
 
26
+ In our `12-24` release, we introduced the `AraGen Benchmark` along with the `3C3H` evaluation measure (aka the 3C3H Score). You can find more details about AraGen and 3C3H [here](https://huggingface.co/blog/leaderboard-3c3h-aragen), and the first version of the benchmark, `AraGen-12-24`, [here](https://huggingface.co/datasets/inceptionai/AraGen). Building on that foundation, this new release expands the space to incorporate additional tasks and evaluation metrics.
27
 
28
+ In this release, we present two leaderboards:
29
+
30
+ **AraGen-03-25 (v2):**
31
 
32
+ - The AraGen Benchmark is designed to evaluate and compare the performance of Chat/Instruct Arabic Large Language Models on a suite of generative tasks that are culturally relevant to the Arab region (history, politics, cuisine, etc.). By leveraging **3C3H** as an evaluation metric, which assesses a model's output across six dimensions (Correctness, Completeness, Conciseness, Helpfulness, Honesty, and Harmlessness), the leaderboard offers a comprehensive and holistic evaluation of a model's chat capabilities and its ability to generate human-like and ethically responsible content.
33
 
34
+ **Instruction Following:**
35
 
36
+ - We have established a robust leaderboard that benchmarks models on Arabic and English instruction following, offering an open and comparative performance landscape for the research community. Concurrently, we released the first publicly available Arabic [dataset](https://huggingface.co/datasets/inceptionai/Arabic_IFEval) aimed at evaluating LLMs' ability to follow instructions. The Arabic IFEval samples are meticulously curated to capture the language’s unique nuances—such as diacritization and distinctive phonetic features—often overlooked in generic datasets. Our dedicated linguistic team generated original samples and adapted selections from the IFEval English dataset, ensuring that the material resonates with Arabic cultural contexts and meets the highest standards of authenticity and quality.
37
+
38
+ ### Why Focus on Chat Models?
39
+
40
+ Our evaluations are conducted in a generative mode, meaning that we expect models to produce complete, context-rich responses rather than simply predicting the next token as base models do. This approach not only yields results that are more explainable and nuanced compared to logit-based measurements, but it also captures elements like creativity, coherence, and ethical considerations—providing deeper insights into overall model performance.
41
 
42
  ### Contact
43
 
44
+ For inquiries or assistance, please join the conversation on our [Discussions Tab](https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/discussions) or reach out via [email](mailto:[email protected]).
45
  """
46
 
47
+ BOTTOM_LOGO = """<img src="https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards/resolve/main/assets/pictures/03-25/arabic-leaderboards-colab-march-preview-free-3.png" style="width:50%;display:block;margin-left:auto;margin-right:auto;border-radius:15px;">"""
 
 
48
 
49
  CITATION_BUTTON_TEXT = """
50
+ @misc{Arabic-Leaderboards,
51
+ author = {El Filali, Ali and Albarri, Sarah and Abouelseoud, Arwa and Kamboj, Samta and Sengupta, Neha and Nakov, Preslav},
52
+ title = {Arabic-Leaderboards: Comprehensive Evaluation of Arabic Large Language Models},
53
+ year = {2025},
54
  publisher = {Inception},
55
+ howpublished = "url{https://huggingface.co/spaces/inceptionai/Arabic-Leaderboards}"
56
  }
57
  """
58
 
59
+ CITATION_BUTTON_LABEL = """
60
+ Copy the following snippet to cite the results from all Arabic Leaderboards in this Space.
61
+ """
62
+
63
 
64
  def load_results():
65
+ """
66
+ Loads the AraGen v2 results from aragen_v2_results.json and returns two dataframes:
67
+ 1) df_3c3h with columns for 3C3H scores
68
+ 2) df_tasks with columns for tasks scores
69
+ """
70
  current_dir = os.path.dirname(os.path.abspath(__file__))
71
+ results_file = os.path.join(current_dir, "assets", "results", "aragen_v2_results.json")
72
 
 
73
  with open(results_file, 'r') as f:
74
  data = json.load(f)
75
 
76
  # Filter out any entries that only contain '_last_sync_timestamp'
77
  filtered_data = []
78
  for entry in data:
 
79
  if len(entry.keys()) == 1 and "_last_sync_timestamp" in entry:
80
  continue
81
  filtered_data.append(entry)
82
 
83
  data = filtered_data
84
 
 
85
  data_3c3h = []
86
  data_tasks = []
87
 
88
  for model_data in data:
 
89
  meta = model_data.get('Meta', {})
90
  model_name = meta.get('Model Name', 'UNK')
91
  revision = meta.get('Revision', 'UNK')
92
  precision = meta.get('Precision', 'UNK')
93
  params = meta.get('Params', 'UNK')
 
94
 
 
95
  try:
96
  model_size_numeric = float(params)
97
  except (ValueError, TypeError):
98
  model_size_numeric = np.inf
99
 
 
100
  scores_data = model_data.get('claude-3.5-sonnet Scores', {})
101
  scores_3c3h = scores_data.get('3C3H Scores', {})
102
  scores_tasks = scores_data.get('Tasks Scores', {})
103
 
 
104
  formatted_scores_3c3h = {k: v*100 for k, v in scores_3c3h.items()}
105
  formatted_scores_tasks = {k: v*100 for k, v in scores_tasks.items()}
106
 
 
107
  data_entry_3c3h = {
108
  'Model Name': model_name,
109
  'Revision': revision,
110
+ 'License': meta.get('License', 'UNK'),
111
  'Precision': precision,
112
+ 'Model Size': model_size_numeric,
113
  '3C3H Score': formatted_scores_3c3h.get("3C3H Score", np.nan),
114
  'Correctness': formatted_scores_3c3h.get("Correctness", np.nan),
115
  'Completeness': formatted_scores_3c3h.get("Completeness", np.nan),
 
120
  }
121
  data_3c3h.append(data_entry_3c3h)
122
 
 
123
  data_entry_tasks = {
124
  'Model Name': model_name,
125
  'Revision': revision,
126
+ 'License': meta.get('License', 'UNK'),
127
  'Precision': precision,
128
+ 'Model Size': model_size_numeric,
129
  **formatted_scores_tasks
130
  }
131
  data_tasks.append(data_entry_tasks)
 
133
  df_3c3h = pd.DataFrame(data_3c3h)
134
  df_tasks = pd.DataFrame(data_tasks)
135
 
 
136
  score_columns_3c3h = ['3C3H Score', 'Correctness', 'Completeness', 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness']
137
  df_3c3h[score_columns_3c3h] = df_3c3h[score_columns_3c3h].round(4)
138
 
139
+ max_model_size_value = 1000
 
140
  df_3c3h['Model Size Filter'] = df_3c3h['Model Size'].replace(np.inf, max_model_size_value)
141
 
 
142
  if '3C3H Score' in df_3c3h.columns:
143
  df_3c3h = df_3c3h.sort_values(by='3C3H Score', ascending=False)
144
+ df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
145
  else:
146
  df_3c3h.insert(0, 'Rank', range(1, len(df_3c3h) + 1))
147
 
 
148
  task_columns = [col for col in df_tasks.columns if col not in ['Model Name', 'Revision', 'License', 'Precision', 'Model Size', 'Model Size Filter']]
 
 
149
  if task_columns:
150
  df_tasks[task_columns] = df_tasks[task_columns].round(4)
151
 
 
152
  df_tasks['Model Size Filter'] = df_tasks['Model Size'].replace(np.inf, max_model_size_value)
153
 
 
154
  if task_columns:
155
  first_task = task_columns[0]
156
  df_tasks = df_tasks.sort_values(by=first_task, ascending=False)
157
+ df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
158
  else:
159
  df_tasks = df_tasks.sort_values(by='Model Name', ascending=True)
160
  df_tasks.insert(0, 'Rank', range(1, len(df_tasks) + 1))
161
 
162
  return df_3c3h, df_tasks, task_columns
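
load_results() only relies on the JSON keys shown above. A hypothetical entry of assets/results/aragen_v2_results.json illustrating that assumed shape (the model name and all score values are placeholders; the task names match the fixed column order used later in filter_df_tasks):

```python
# Hypothetical record shape assumed by load_results(); values are placeholders.
sample_entry = {
    "Meta": {
        "Model Name": "org-name/model-name",
        "Revision": "main",
        "Precision": "bfloat16",
        "Params": "7",
        "License": "Apache-2.0",
    },
    "claude-3.5-sonnet Scores": {
        "3C3H Scores": {
            "3C3H Score": 0.55, "Correctness": 0.60, "Completeness": 0.58,
            "Conciseness": 0.40, "Helpfulness": 0.52, "Honesty": 0.65,
            "Harmlessness": 0.95,
        },
        "Tasks Scores": {
            "Question Answering (QA)": 0.57,
            "Orthographic and Grammatical Analysis": 0.41,
            "Safety": 0.90,
            "Reasoning": 0.38,
        },
    },
}
```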
163
164
 
165
+ def load_if_data():
166
+ """
167
+ Loads the instruction-following data from ifeval_results.jsonl
168
+ and returns a dataframe with relevant columns,
169
+ converting decimal values to percentage format.
170
+ """
171
+ current_dir = os.path.dirname(os.path.abspath(__file__))
172
+ results_file = os.path.join(current_dir, "assets", "results", "ifeval_results.jsonl")
173
+
174
+ data = []
175
+ with open(results_file, "r", encoding="utf-8") as f:
176
+ for line in f:
177
+ line = line.strip()
178
+ if not line:
179
+ continue
180
+ data.append(json.loads(line))
181
+
182
+ df = pd.DataFrame(data)
183
+
184
+ # Convert numeric columns
185
+ numeric_cols = ["En Prompt-lvl", "En Instruction-lvl", "Ar Prompt-lvl", "Ar Instruction-lvl"]
186
+ for col in numeric_cols:
187
+ df[col] = pd.to_numeric(df[col], errors="coerce")
188
+
189
+ # Compute average accuracy for En and Ar
190
+ df["Average Accuracy (En)"] = (df["En Prompt-lvl"] + df["En Instruction-lvl"]) / 2
191
+ df["Average Accuracy (Ar)"] = (df["Ar Prompt-lvl"] + df["Ar Instruction-lvl"]) / 2
192
+
193
+ # Convert them to percentage format (e.g., 0.871 -> 87.1)
194
+ for col in numeric_cols:
195
+ df[col] = (df[col] * 100).round(1)
196
+ df["Average Accuracy (En)"] = (df["Average Accuracy (En)"] * 100).round(1)
197
+ df["Average Accuracy (Ar)"] = (df["Average Accuracy (Ar)"] * 100).round(1)
198
+
199
+ # Handle size as numeric
200
+ def parse_size(x):
201
  try:
202
+ return float(x)
203
+ except:
204
+ return np.inf
205
+
206
+ df["Model Size"] = df["Size (B)"].apply(parse_size)
207
+
208
+ # Add a filter column for size
209
+ max_model_size_value = 1000
210
+ df["Model Size Filter"] = df["Model Size"].replace(np.inf, max_model_size_value)
211
+
212
+ # Sort by "Average Accuracy (Ar)" as an example
213
+ df = df.sort_values(by="Average Accuracy (Ar)", ascending=False)
214
+ df = df.reset_index(drop=True)
215
+ df.insert(0, "Rank", range(1, len(df) + 1))
216
+
 
217
  return df
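
load_if_data() expects one JSON object per line in assets/results/ifeval_results.jsonl. A hypothetical record using the field names the function reads (the model and all values are placeholders), with the conversion it then applies:

```python
import json

# Hypothetical single line of ifeval_results.jsonl; field names follow the code
# above, the model and numbers are placeholders.
sample_line = json.dumps({
    "Model Name": "org-name/model-name",
    "Creator": "Org Name",
    "Family": "Example-Family",
    "Type": "Instruct",
    "Size (B)": "7",
    "Base Model": "org-name/base-model",
    "Context Window": "8k",
    "Lang.": "ar/en",
    "En Prompt-lvl": 0.86, "En Instruction-lvl": 0.90,
    "Ar Prompt-lvl": 0.64, "Ar Instruction-lvl": 0.72,
})
# load_if_data() averages prompt- and instruction-level accuracy per language
# and renders percentages, e.g. (0.64 + 0.72) / 2 -> 68.0 for "Average Accuracy (Ar)".
```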
218
 
219
+
220
+ def submit_model(model_name, revision, precision, params, license, modality):
221
  df_3c3h, df_tasks, _ = load_results()
222
  existing_models_results = df_3c3h[['Model Name', 'Revision', 'Precision']]
223
 
 
224
  if precision == 'Missing':
225
  precision = None
226
  else:
227
  precision = precision.strip().lower()
228
 
 
229
  df_pending = load_requests('pending')
230
  df_finished = load_requests('finished')
231
 
232
+ model_exists_in_results = (
233
+ (existing_models_results['Model Name'] == model_name) &
234
+ (existing_models_results['Revision'] == revision) &
235
+ (existing_models_results['Precision'] == precision)
236
+ ).any()
237
  if model_exists_in_results:
238
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
239
 
 
240
  if not df_pending.empty:
241
  existing_models_pending = df_pending[['model_name', 'revision', 'precision']]
242
+ model_exists_in_pending = (
243
+ (existing_models_pending['model_name'] == model_name) &
244
+ (existing_models_pending['revision'] == revision) &
245
+ (existing_models_pending['precision'] == precision)
246
+ ).any()
247
  if model_exists_in_pending:
248
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' is already in the pending evaluations.**"
249
 
 
250
  if not df_finished.empty:
251
  existing_models_finished = df_finished[['model_name', 'revision', 'precision']]
252
+ model_exists_in_finished = (
253
+ (existing_models_finished['model_name'] == model_name) &
254
+ (existing_models_finished['revision'] == revision) &
255
+ (existing_models_finished['precision'] == precision)
256
+ ).any()
257
  if model_exists_in_finished:
258
  return f"**Model '{model_name}' with revision '{revision}' and precision '{precision}' has already been evaluated.**"
259
 
 
260
  api = HfApi()
261
  try:
262
+ _ = api.model_info(model_name)
263
+ except Exception:
264
  return f"**Error: Could not find model '{model_name}' on HuggingFace Hub. Please ensure the model name is correct and the model is public.**"
265
 
 
266
  status = "PENDING"
 
 
267
  submission = {
268
  "model_name": model_name,
269
  "license": license,
270
  "revision": revision,
271
  "precision": precision,
272
+ "params": params,
273
  "status": status,
274
+ "modality": modality
275
  }
 
 
276
  submission_json = json.dumps(submission, indent=2)
277
 
 
278
  org_model = model_name.split('/')
279
  if len(org_model) != 2:
280
  return "**Please enter the full model name including the organization or username, e.g., 'inceptionai/jais-family-30b-8k'**"
 
282
  precision_str = precision if precision else 'Missing'
283
  file_path_in_repo = f"pending/{org}/{model_id}_eval_request_{revision}_{precision_str}.json"
284
 
 
285
  try:
286
  hf_api_token = os.environ.get('HF_API_TOKEN', None)
287
  api.upload_file(
 
296
 
297
  return f"**Model '{model_name}' has been submitted for evaluation.**"
298
 
299
+
300
+ def load_requests(status_folder):
301
+ api = HfApi()
302
+ requests_data = []
303
+ folder_path_in_repo = status_folder
304
+
305
+ hf_api_token = os.environ.get('HF_API_TOKEN', None)
306
+
307
+ try:
308
+ files_info = api.list_repo_files(
309
+ repo_id=DATASET_REPO_ID,
310
+ repo_type="dataset",
311
+ token=hf_api_token
312
+ )
313
+ except Exception as e:
314
+ print(f"Error accessing dataset repository: {e}")
315
+ return pd.DataFrame()
316
+
317
+ files_in_folder = [f for f in files_info if f.startswith(f"{folder_path_in_repo}/") and f.endswith('.json')]
318
+
319
+ for file_path in files_in_folder:
320
+ try:
321
+ local_file_path = hf_hub_download(
322
+ repo_id=DATASET_REPO_ID,
323
+ filename=file_path,
324
+ repo_type="dataset",
325
+ token=hf_api_token
326
+ )
327
+ with open(local_file_path, 'r') as f:
328
+ request = json.load(f)
329
+ requests_data.append(request)
330
+ except Exception as e:
331
+ print(f"Error loading file {file_path}: {e}")
332
+ continue
333
+
334
+ df = pd.DataFrame(requests_data)
335
+ return df
336
+
337
+
338
+ def filter_df_3c3h(search_query, selected_cols, precision_filters, license_filters, min_size, max_size):
339
+ df_ = load_results()[0].copy()
340
+ if min_size > max_size:
341
+ min_size, max_size = max_size, min_size
342
+ if search_query:
343
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
344
+ if precision_filters:
345
+ include_missing = 'Missing' in precision_filters
346
+ selected_precisions = [p for p in precision_filters if p != 'Missing']
347
+ if include_missing:
348
+ df_ = df_[
349
+ (df_['Precision'].isin(selected_precisions)) |
350
+ (df_['Precision'] == 'UNK') |
351
+ (df_['Precision'].isna())
352
+ ]
353
+ else:
354
+ df_ = df_[df_['Precision'].isin(selected_precisions)]
355
+ if license_filters:
356
+ include_missing = 'Missing' in license_filters
357
+ selected_licenses = [l for l in license_filters if l != 'Missing']
358
+ if include_missing:
359
+ df_ = df_[
360
+ (df_['License'].isin(selected_licenses)) |
361
+ (df_['License'] == 'UNK') |
362
+ (df_['License'].isna())
363
+ ]
364
+ else:
365
+ df_ = df_[df_['License'].isin(selected_licenses)]
366
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
367
+ if 'Rank' in df_.columns:
368
+ df_ = df_.drop(columns=['Rank'])
369
+ df_ = df_.reset_index(drop=True)
370
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
371
+ fixed_column_order = [
372
+ "Rank",
373
+ "Model Name",
374
+ "3C3H Score",
375
+ "Correctness",
376
+ "Completeness",
377
+ "Conciseness",
378
+ "Helpfulness",
379
+ "Honesty",
380
+ "Harmlessness",
381
+ "Revision",
382
+ "License",
383
+ "Precision",
384
+ "Model Size"
385
+ ]
386
+
387
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
388
+
389
+ return df_[selected_cols]
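
For reference, a hypothetical direct call with the same argument order the Gradio event handlers use (the column and filter values below are examples):

```python
# Hypothetical call mirroring how the UI events invoke filter_df_3c3h().
view = filter_df_3c3h(
    search_query="jais",
    selected_cols=["Rank", "Model Name", "3C3H Score"],
    precision_filters=["float16", "Missing"],
    license_filters=["Open", "Missing"],
    min_size=0,
    max_size=1000,
)
# `view` is re-ranked after filtering and restored to the fixed display order
# defined inside the function.
```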
390
+
391
+
392
+ def filter_df_tasks(search_query, selected_cols, precision_filters, license_filters, min_size, max_size, task_columns):
393
+ df_ = load_results()[1].copy()
394
+ if min_size > max_size:
395
+ min_size, max_size = max_size, min_size
396
+ if search_query:
397
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
398
+ if precision_filters:
399
+ include_missing = 'Missing' in precision_filters
400
+ selected_precisions = [p for p in precision_filters if p != 'Missing']
401
+ if include_missing:
402
+ df_ = df_[
403
+ (df_['Precision'].isin(selected_precisions)) |
404
+ (df_['Precision'] == 'UNK') |
405
+ (df_['Precision'].isna())
406
+ ]
407
+ else:
408
+ df_ = df_[df_['Precision'].isin(selected_precisions)]
409
+ if license_filters:
410
+ include_missing = 'Missing' in license_filters
411
+ selected_licenses = [l for l in license_filters if l != 'Missing']
412
+ if include_missing:
413
+ df_ = df_[
414
+ (df_['License'].isin(selected_licenses)) |
415
+ (df_['License'] == 'UNK') |
416
+ (df_['License'].isna())
417
+ ]
418
+ else:
419
+ df_ = df_[df_['License'].isin(selected_licenses)]
420
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
421
+ if 'Rank' in df_.columns:
422
+ df_ = df_.drop(columns=['Rank'])
423
+ if task_columns:
424
+ first_task = task_columns[0]
425
+ df_ = df_.sort_values(by=first_task, ascending=False)
426
+ else:
427
+ df_ = df_.sort_values(by='Model Name', ascending=True)
428
+ df_ = df_.reset_index(drop=True)
429
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
430
+ fixed_column_order = [
431
+ "Rank",
432
+ "Model Name",
433
+ "Question Answering (QA)",
434
+ "Orthographic and Grammatical Analysis",
435
+ "Safety",
436
+ "Reasoning",
437
+ "Revision",
438
+ "License",
439
+ "Precision",
440
+ "Model Size"
441
+ ]
442
+
443
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
444
+ return df_[selected_cols]
445
+
446
+
447
+ def filter_if_df(search_query, selected_cols, family_filters, min_size, max_size):
448
+ """
449
+ Filters the instruction-following dataframe based on various criteria.
450
+ We have removed 'Filter by Type' and 'Filter by Creator'.
451
+ """
452
+ df_ = load_if_data().copy()
453
+ if min_size > max_size:
454
+ min_size, max_size = max_size, min_size
455
+
456
+ # Search by model name
457
+ if search_query:
458
+ df_ = df_[df_['Model Name'].str.contains(search_query, case=False, na=False)]
459
+
460
+ # Filter by Family only (Creator and Type filters removed)
461
+ if family_filters:
462
+ df_ = df_[df_['Family'].isin(family_filters)]
463
+
464
+ # Filter by Model Size
465
+ df_ = df_[(df_['Model Size Filter'] >= min_size) & (df_['Model Size Filter'] <= max_size)]
466
+
467
+ # Re-rank
468
+ if 'Rank' in df_.columns:
469
+ df_ = df_.drop(columns=['Rank'])
470
+ df_ = df_.reset_index(drop=True)
471
+ df_.insert(0, 'Rank', range(1, len(df_)+1))
472
+
473
+ fixed_column_order = [
474
+ "Rank",
475
+ "Model Name",
476
+ "Creator",
477
+ "Family",
478
+ "Type",
479
+ "Average Accuracy (Ar)",
480
+ "Ar Prompt-lvl",
481
+ "Ar Instruction-lvl",
482
+ "Average Accuracy (En)",
483
+ "En Prompt-lvl",
484
+ "En Instruction-lvl",
485
+ "Size (B)",
486
+ "Base Model",
487
+ "Context Window",
488
+ "Lang."
489
+ ]
490
+
491
+ selected_cols = [col for col in fixed_column_order if col in selected_cols and col in df_.columns]
492
+ return df_[selected_cols]
493
+
494
+
495
  def main():
496
  df_3c3h, df_tasks, task_columns = load_results()
497
+ df_if = load_if_data() # Instruction Following DF
498
 
499
+ # Setup precision/license options for the 3C3H scoreboard
500
  precision_options_3c3h = sorted(df_3c3h['Precision'].dropna().unique().tolist())
501
  precision_options_3c3h = [p for p in precision_options_3c3h if p != 'UNK']
502
  precision_options_3c3h.append('Missing')
 
505
  license_options_3c3h = [l for l in license_options_3c3h if l != 'UNK']
506
  license_options_3c3h.append('Missing')
507
 
508
+ # Setup precision/license options for tasks scoreboard
509
  precision_options_tasks = sorted(df_tasks['Precision'].dropna().unique().tolist())
510
  precision_options_tasks = [p for p in precision_options_tasks if p != 'UNK']
511
  precision_options_tasks.append('Missing')
 
514
  license_options_tasks = [l for l in license_options_tasks if l != 'UNK']
515
  license_options_tasks.append('Missing')
516
 
517
+ # Model size range for 3C3H scoreboard
518
  min_model_size_3c3h = int(df_3c3h['Model Size Filter'].min())
519
  max_model_size_3c3h = int(df_3c3h['Model Size Filter'].max())
520
 
521
+ # Model size range for tasks scoreboard
522
  min_model_size_tasks = int(df_tasks['Model Size Filter'].min())
523
  max_model_size_tasks = int(df_tasks['Model Size Filter'].max())
524
 
525
+ # Column choices for 3C3H
526
+ column_choices_3c3h = [col for col in df_3c3h.columns.tolist() if col != 'Model Size Filter']
527
+
528
+ # Column choices for tasks
529
+ column_choices_tasks = [col for col in df_tasks.columns.tolist() if col != 'Model Size Filter']
530
+
531
+ # Now for instruction-following
532
+ family_options_if = sorted(df_if['Family'].dropna().unique().tolist())
533
+ min_model_size_if = int(df_if['Model Size Filter'].min())
534
+ max_model_size_if = int(df_if['Model Size Filter'].max())
535
+
536
+ #
537
+ # IMPORTANT: Reorder the columns for the Instruction-Following leaderboard
538
+ # Define the full order and the default visible columns separately.
539
+ #
540
+ all_if_columns = [
541
+ "Rank",
542
+ "Model Name",
543
+ "Average Accuracy (Ar)",
544
+ "Ar Prompt-lvl",
545
+ "Ar Instruction-lvl",
546
+ "Average Accuracy (En)",
547
+ "En Prompt-lvl",
548
+ "En Instruction-lvl",
549
+ "Type",
550
+ "Creator",
551
+ "Family",
552
+ "Size (B)",
553
+ "Base Model",
554
+ "Context Window",
555
+ "Lang."
556
+ ]
557
+ default_if_columns = [
558
+ "Rank",
559
+ "Model Name",
560
+ "Average Accuracy (Ar)",
561
+ #"Ar Prompt-lvl",
562
+ #"Ar Instruction-lvl",
563
+ "Average Accuracy (En)"
564
+ ]
565
+
566
  with gr.Blocks() as demo:
567
  gr.HTML(HEADER)
568
+
569
  with gr.Tabs():
570
+ #
571
+ # AL Leaderboards Tab
572
+ #
573
+ with gr.Tab("AL Leaderboards 🏅"):
574
+ # -------------------------
575
+ # Sub-Tab: AraGen Leaderboards
576
+ # -------------------------
577
+ with gr.Tab("🐪 AraGen Leaderboards"):
578
+ with gr.Tabs():
579
+ # 3C3H Scores
580
+ with gr.Tab("3C3H Scores"):
581
+ with gr.Accordion("⚙️ Filters", open=False):
582
+ with gr.Row():
583
+ search_box_3c3h = gr.Textbox(
584
+ placeholder="Search for models...",
585
+ label="Search",
586
+ interactive=True
587
+ )
588
+ with gr.Row():
589
+ column_selector_3c3h = gr.CheckboxGroup(
590
+ choices=column_choices_3c3h,
591
+ value=[
592
+ 'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
593
+ 'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
594
+ ],
595
+ label="Select columns to display"
596
+ )
597
+ with gr.Row():
598
+ license_filter_3c3h = gr.CheckboxGroup(
599
+ choices=license_options_3c3h,
600
+ value=license_options_3c3h.copy(),
601
+ label="Filter by License"
602
+ )
603
+ precision_filter_3c3h = gr.CheckboxGroup(
604
+ choices=precision_options_3c3h,
605
+ value=precision_options_3c3h.copy(),
606
+ label="Filter by Precision"
607
+ )
608
+ with gr.Row():
609
+ model_size_min_filter_3c3h = gr.Slider(
610
+ minimum=min_model_size_3c3h,
611
+ maximum=max_model_size_3c3h,
612
+ value=min_model_size_3c3h,
613
+ step=1,
614
+ label="Minimum Model Size",
615
+ interactive=True
616
+ )
617
+ model_size_max_filter_3c3h = gr.Slider(
618
+ minimum=min_model_size_3c3h,
619
+ maximum=max_model_size_3c3h,
620
+ value=max_model_size_3c3h,
621
+ step=1,
622
+ label="Maximum Model Size",
623
+ interactive=True
624
+ )
625
+ leaderboard_3c3h = gr.Dataframe(
626
+ df_3c3h[[
627
  'Rank', 'Model Name', '3C3H Score', 'Correctness', 'Completeness',
628
  'Conciseness', 'Helpfulness', 'Honesty', 'Harmlessness'
629
+ ]],
630
+ interactive=False
631
  )
632
+ filter_inputs_3c3h = [
633
+ search_box_3c3h, column_selector_3c3h,
634
+ precision_filter_3c3h, license_filter_3c3h,
635
+ model_size_min_filter_3c3h, model_size_max_filter_3c3h
636
+ ]
637
+ search_box_3c3h.submit(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
638
+ for component in filter_inputs_3c3h:
639
+ component.change(filter_df_3c3h, inputs=filter_inputs_3c3h, outputs=leaderboard_3c3h)
640
+
641
+ # Tasks Scores
642
+ with gr.Tab("Tasks Scores"):
643
+ gr.Markdown("This Table is sorted based on the First Task (Question Answering)")
644
+ with gr.Accordion("⚙️ Filters", open=False):
645
+ with gr.Row():
646
+ search_box_tasks = gr.Textbox(
647
+ placeholder="Search for models...",
648
+ label="Search",
649
+ interactive=True
650
+ )
651
+ with gr.Row():
652
+ column_selector_tasks = gr.CheckboxGroup(
653
+ choices=column_choices_tasks,
654
+ value=['Rank', 'Model Name'] + task_columns,
655
+ label="Select columns to display"
656
+ )
657
+ with gr.Row():
658
+ license_filter_tasks = gr.CheckboxGroup(
659
+ choices=license_options_tasks,
660
+ value=license_options_tasks.copy(),
661
+ label="Filter by License"
662
+ )
663
+ precision_filter_tasks = gr.CheckboxGroup(
664
+ choices=precision_options_tasks,
665
+ value=precision_options_tasks.copy(),
666
+ label="Filter by Precision"
667
+ )
668
+ with gr.Row():
669
+ model_size_min_filter_tasks = gr.Slider(
670
+ minimum=min_model_size_tasks,
671
+ maximum=max_model_size_tasks,
672
+ value=min_model_size_tasks,
673
+ step=1,
674
+ label="Minimum Model Size",
675
+ interactive=True
676
+ )
677
+ model_size_max_filter_tasks = gr.Slider(
678
+ minimum=min_model_size_tasks,
679
+ maximum=max_model_size_tasks,
680
+ value=max_model_size_tasks,
681
+ step=1,
682
+ label="Maximum Model Size",
683
+ interactive=True
684
+ )
685
+ leaderboard_tasks = gr.Dataframe(
686
+ df_tasks[['Rank', 'Model Name'] + task_columns],
687
+ interactive=False
688
  )
689
+ filter_inputs_tasks = [
690
+ search_box_tasks, column_selector_tasks,
691
+ precision_filter_tasks, license_filter_tasks,
692
+ model_size_min_filter_tasks, model_size_max_filter_tasks
693
  ]
694
+ search_box_tasks.submit(
695
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
696
+ inputs=filter_inputs_tasks,
697
+ outputs=leaderboard_tasks
698
  )
699
+ for component in filter_inputs_tasks:
700
+ component.change(
701
+ lambda sq, cols, pf, lf, min_val, max_val: filter_df_tasks(sq, cols, pf, lf, min_val, max_val, task_columns),
702
+ inputs=filter_inputs_tasks,
703
+ outputs=leaderboard_tasks
704
+ )
705
+
706
+ # -------------------------
707
+ # Sub-Tab: Instruction Following Leaderboard
708
+ # -------------------------
709
+ with gr.Tab("🗡️ Instruction Following Leaderboard"):
710
+ with gr.Accordion("⚙️ Filters", open=False):
711
  with gr.Row():
712
+ search_box_if = gr.Textbox(
713
  placeholder="Search for models...",
714
  label="Search",
715
  interactive=True
716
  )
717
  with gr.Row():
718
+ column_selector_if = gr.CheckboxGroup(
719
+ choices=all_if_columns,
720
+ value=default_if_columns,
721
+ label="Select columns to display"
722
  )
723
  with gr.Row():
724
+ family_filter_if = gr.CheckboxGroup(
725
+ choices=family_options_if,
726
+ value=family_options_if.copy(),
727
+ label="Filter by Family"
728
  )
729
  with gr.Row():
730
+ model_size_min_filter_if = gr.Slider(
731
+ minimum=min_model_size_if,
732
+ maximum=max_model_size_if,
733
+ value=min_model_size_if,
734
  step=1,
735
  label="Minimum Model Size",
736
  interactive=True
737
  )
738
+ model_size_max_filter_if = gr.Slider(
739
+ minimum=min_model_size_if,
740
+ maximum=max_model_size_if,
741
+ value=max_model_size_if,
742
  step=1,
743
  label="Maximum Model Size",
744
  interactive=True
745
  )
746
+ leaderboard_if = gr.Dataframe(
747
+ df_if[default_if_columns],
748
+ interactive=False
749
+ )
750
+ filter_inputs_if = [
751
+ search_box_if, column_selector_if,
752
+ family_filter_if,
753
+ model_size_min_filter_if, model_size_max_filter_if
754
+ ]
755
+ search_box_if.submit(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
756
+ for component in filter_inputs_if:
757
+ component.change(filter_if_df, inputs=filter_inputs_if, outputs=leaderboard_if)
758
+
759
+ #
760
+ # Submit Tab
761
+ #
762
+ with gr.Tab("Submit Here 📝"):
763
+ df_pending = load_requests('pending')
764
+ df_finished = load_requests('finished')
765
+ df_failed = load_requests('failed')
767
  gr.Markdown(ABOUT_SECTION)
768
+
769
+ gr.Markdown("## Submit Your Model for Evaluation")
770
  with gr.Column():
771
  model_name_input = gr.Textbox(
772
+ label="Model Name",
773
  placeholder="Enter the full model name from HuggingFace Hub (e.g., inceptionai/jais-family-30b-8k)"
774
  )
775
+ revision_input = gr.Textbox(label="Revision", placeholder="main", value="main")
776
  precision_input = gr.Dropdown(
777
+ choices=["float16", "float32", "bfloat16", "8bit", "4bit"],
778
  label="Precision",
779
  value="float16"
780
  )
781
  params_input = gr.Textbox(
782
+ label="Params",
783
  placeholder="Enter the approximate number of parameters as Integer (e.g., 7, 13, 30, 70 ...)"
784
  )
 
785
  license_input = gr.Textbox(
786
+ label="License",
787
+ placeholder="Enter the license type (Generic one is 'Open' in case no License is provided)",
788
  value="Open"
789
  )
790
+ modality_input = gr.Radio(
791
+ choices=["Text"],
792
+ label="Modality",
793
+ value="Text"
794
+ )
795
  submit_button = gr.Button("Submit Model")
796
  submission_result = gr.Markdown()
 
797
  submit_button.click(
798
  submit_model,
799
+ inputs=[
800
+ model_name_input, revision_input, precision_input,
801
+ params_input, license_input, modality_input
802
+ ],
803
  outputs=submission_result
804
  )
805
+
806
+ gr.Markdown("## Evaluation Status")
807
+ with gr.Accordion(f"Pending Evaluations ({len(df_pending)})", open=False):
808
+ if not df_pending.empty:
809
+ gr.Dataframe(df_pending)
810
+ else:
811
+ gr.Markdown("No pending evaluations.")
812
+ with gr.Accordion(f"Finished Evaluations ({len(df_finished)})", open=False):
813
+ if not df_finished.empty:
814
+ gr.Dataframe(df_finished)
815
+ else:
816
+ gr.Markdown("No finished evaluations.")
817
+ with gr.Accordion(f"Failed Evaluations ({len(df_failed)})", open=False):
818
+ if not df_failed.empty:
819
+ gr.Dataframe(df_failed)
820
+ else:
821
+ gr.Markdown("No failed evaluations.")
822
+
823
+ # Citation Section
824
  with gr.Row():
825
  with gr.Accordion("📙 Citation", open=False):
826
  citation_button = gr.Textbox(
827
  value=CITATION_BUTTON_TEXT,
828
  label=CITATION_BUTTON_LABEL,
829
+ lines=8,
830
  elem_id="citation-button",
831
+ show_copy_button=True
832
  )
833
+
834
+ gr.HTML(BOTTOM_LOGO)
835
+
836
+ demo.launch()
837
 
 
838
 
839
  if __name__ == "__main__":
840
  main()