Commit d5b71b9
Parent(s): dfc075f

Added information for second table

Files changed:
- .gitignore +2 -1
- app.py +75 -34
- src/datasets.json +130 -0
- src/display/about.py +16 -20
- src/display/utils.py +47 -3
- src/envs.py +2 -0
.gitignore  CHANGED

@@ -17,4 +17,5 @@ src/assets/model_counts.html
 
 test
 env
-a.py
+a.py
+testing.py
app.py  CHANGED

@@ -20,9 +20,15 @@ from src.display.utils import (
     EVAL_TYPES,
     TYPES,
     AutoEvalColumn,
-    fields
+    fields,
+    BENCHMARK_COLS_GROUP,
+    COLS_GROUP,
+    EVAL_COLS_GROUP,
+    EVAL_TYPES_GROUP,
+    TYPES_GROUP,
+    AutoEvalColumnGroup,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -59,6 +65,9 @@ except Exception:
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data_grouped, original_df_grouped = get_leaderboard_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)
+
+leaderboard_grouped_df = original_df_grouped.copy()
 leaderboard_df = original_df.copy()
 
 (
@@ -68,6 +77,12 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
+(
+    finished_eval_queue_g_df,
+    running_eval_queue_g_df,
+    pending_eval_queue_g_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)
+
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -193,38 +208,54 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        with gr.TabItem("🏅 LLM Benchmark FineGrained", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Row():
+                    shown_columns = gr.CheckboxGroup(
+                        choices=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if not c.hidden and not c.never_hidden and not c.dummy
+                        ],
+                        value=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if c.displayed_by_default and not c.hidden and not c.never_hidden
+                        ],
+                        label="Select columns to show",
+                        elem_id="column-select",
+                        interactive=True,
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name]
+                ],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name],
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                column_widths=["15%", "30%"]
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
@@ -232,8 +263,18 @@ with demo:
                     search_bar,
                 ],
                 leaderboard_table,
-                queue=True,
             )
+            for selector in [shown_columns]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
 
     with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
         with gr.Column():
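The rebuilt tab wires search_bar.submit and shown_columns.change to the same update_table callback, feeding it the hidden full-width dataframe so a narrowed view can always be recomputed from scratch. Note that, as committed, the FineGrained tab still renders leaderboard_df with AutoEvalColumn and TYPES; the grouped objects loaded above (leaderboard_grouped_df, COLS_GROUP, TYPES_GROUP, AutoEvalColumnGroup) are imported and populated but not yet displayed. The body of update_table is outside this diff; the sketch below is an assumed reconstruction of the usual template behaviour (filter on the hidden model_name_for_query column, then project onto the selected columns), not the repository's exact code.

    import pandas as pd

    def update_table(hidden_df: pd.DataFrame, columns: list, query: str) -> pd.DataFrame:
        # Keep rows whose hidden query column matches any `;`-separated term.
        if query:
            terms = [t.strip() for t in query.split(";") if t.strip()]
            mask = pd.Series(False, index=hidden_df.index)
            for term in terms:
                mask |= hidden_df["model_name_for_query"].str.contains(term, case=False, regex=False)
            hidden_df = hidden_df[mask]
        # Never-hidden columns first, then the user's current selection.
        keep = ["Model"] + [c for c in columns if c in hidden_df.columns]
        return hidden_df[keep]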
src/datasets.json  ADDED

@@ -0,0 +1,130 @@
+[
+    {
+        "task_type": "mmlu",
+        "dstype": "mc",
+        "group": "Banking",
+        "subtext": "You are an AI that selects the most accurate answer in Azerbaijani based on a given question. You will be provided with a question in Azerbaijani and multiple options in Azerbaijani. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Banking_Exam_MCQ",
+        "name": "Banking_Exam_MCQ"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_azerbaycan_dili",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on grammatical concepts and linguistics. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Azerbaijani_Lang_MC",
+        "name": "Azerbaijani_Lang_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_edebiyyat",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on literary and historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Literature_MC",
+        "name": "Azerbaijani_Lit_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_biologiya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on biology. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Biology_MC",
+        "name": "Biology_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_cografiya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on geographical and environmental knowledge. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Geography_MC",
+        "name": "Geography_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_mentiq",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on logical reasoning and problem-solving. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Logic_MC",
+        "name": "Logic_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_tarix",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical and cultural facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/History_MC",
+        "name": "History_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_informatika",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on technology and computer science. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Informatics_MC",
+        "name": "Informatics_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_fizika",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on physics concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Physics_MC",
+        "name": "Physics_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_kimya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on chemistry and scientific concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Chemistry_MC",
+        "name": "Chemistry_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_azerbaycan_tarixi",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Azerbaijani_Hist_MC",
+        "name": "Azerbaijani_Hist_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "tc",
+        "group": "Banking",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani. Your task is to select the correct option from the given question and answer choices. You are given a statement along with multiple options that represent different topics. Choose the option that best categorizes the statement based on its topic. Choose the single letter (A, B, C, D, E, F, G, H, I, J) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Banking_Call_Classification_MC",
+        "name": "Banking_Call_Classification_MC"
+    },
+    {
+        "task_type": "arc",
+        "dstype": "arc",
+        "group": "ARC",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on reasoning and knowledge. Your task is to select the correct option from the given question and answer choices. You are given a question along with multiple options. Choose the correct option. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/ARC",
+        "name": "ARC"
+    },
+    {
+        "task_type": "gsm8k",
+        "dstype": "mmc",
+        "group": "GSM8K",
+        "subtext": "You are an AI designed to solve mathematical word problems in Azerbaijani. Your task is to analyze the given question and select the correct option from the provided choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/GSM8K",
+        "name": "GSM8K"
+    },
+    {
+        "task_type": "qa",
+        "dstype": "qa",
+        "group": "Banking",
+        "subtext": "",
+        "data": "LLM-Beetle/Banking_QA",
+        "name": "Banking_QA"
+    },
+    {
+        "task_type": "rag",
+        "dstype": "cqa",
+        "group": "CQA",
+        "subtext": "",
+        "data": "LLM-Beetle/Wiki_CQA",
+        "name": "Wiki_CQA"
+    }
+]
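Every entry in the new file carries the same six keys (task_type, dstype, group, subtext, data, name); the group values are what the second, grouped table aggregates over. A quick sanity check against the file as committed (the script itself is illustrative, not part of the repo):

    import json

    REQUIRED_KEYS = {"task_type", "dstype", "group", "subtext", "data", "name"}

    with open("src/datasets.json") as f:
        entries = json.load(f)

    for entry in entries:
        missing = REQUIRED_KEYS - entry.keys()
        assert not missing, f"{entry.get('name', '<unnamed>')}: missing {missing}"

    print(len(entries))                           # 16
    print(sorted({e["group"] for e in entries}))  # ['ARC', 'Banking', 'CQA', 'GSM8K', 'MMLU']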
src/display/about.py  CHANGED

@@ -1,5 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
+import json
+
 
 @dataclass
 class Task:
@@ -9,30 +11,24 @@ class Task:
 
 
 # Init: to update with your specific keys
-class Tasks(Enum):
+def create_task_list():
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-    task1 = Task("Synthetic_QA", "metric_name", "Synthetic_QA")
-    task2 = Task("Support_MC", "metric_name", "Support_MC")
-    task3 = Task("Context_QA", "metric_name", "Context_QA")
-    task4 = Task("Banking_MC", "metric_name", "Banking_MC")
-    task5 = Task("ARC", "metric_name", "ARC")
-    task6 = Task("Binary_QA", "metric_name", "Binary_QA")
-    task7 = Task("ANL_Quad", "metric_name", "ANL_Quad")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with open("src/datasets.json") as f:
+        data = json.load(f)
+
+    groups = []
+    names = []
+    for d in data:
+        groups.append(d['group'])
+        names.append(d['name'])
+
+    tasks = []
+    for name in names:
+        tasks.append(Task(name, "metric_name", name))
+
+    return tasks, list(set(groups))
+
+
 
 
 # Your leaderboard name
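create_task_list() swaps the hard-coded Tasks enum for tasks generated from src/datasets.json. A self-contained re-run of the same logic, assuming the template's Task fields (benchmark, metric, col_name):

    import json
    from dataclasses import dataclass

    @dataclass
    class Task:
        benchmark: str
        metric: str
        col_name: str

    with open("src/datasets.json") as f:
        data = json.load(f)

    # Mirrors create_task_list(): one Task per dataset, plus the distinct groups.
    tasks = [Task(d["name"], "metric_name", d["name"]) for d in data]
    groups = list({d["group"] for d in data})

    print(len(tasks))      # 16
    print(sorted(groups))  # ['ARC', 'Banking', 'CQA', 'GSM8K', 'MMLU']

One side effect of list(set(groups)): set iteration order over strings varies between interpreter runs, so the grouped table's column order is not deterministic; returning sorted(set(groups)) would pin it down.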
src/display/utils.py  CHANGED

@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 
-from src.display.about import Tasks
+from src.display.about import create_task_list
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -18,6 +18,8 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+Tasks, Groups = create_task_list()
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
@@ -25,8 +27,10 @@ auto_eval_column_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+
+
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
@@ -47,4 +51,44 @@ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [t.col_name for t in Tasks]
+
+
+
+
+
+# for grouping
+
+
+## Leaderboard columns
+auto_eval_group_dict = []
+# Init
+auto_eval_group_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
+auto_eval_group_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_group_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+
+
+for group in Groups:
+    auto_eval_group_dict.append([group, ColumnContent, ColumnContent(group, "number", True)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_group_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Groups
+AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", auto_eval_group_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumnGroup:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    submitted_time = ColumnContent("submitted_time", "str", True)
+    status = ColumnContent("status", "str", True)
+
+# Column selection
+COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
+TYPES_GROUP = [c.type for c in fields(AutoEvalColumnGroup) if not c.hidden]
+
+EVAL_COLS_GROUP = [c.name for c in fields(EvalQueueColumnGroup)]
+EVAL_TYPES_GROUP = [c.type for c in fields(EvalQueueColumnGroup)]
+
+BENCHMARK_COLS_GROUP = list(Groups)
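The grouped column machinery repeats the make_dataclass pattern used for the per-task columns: a list of [attribute_name, type, default] triples becomes a frozen dataclass whose class attributes are ColumnContent instances, which fields() then enumerates. A toy, self-contained version (the ColumnContent here is assumed frozen, since its instances serve as hashable field defaults; the real definition lives earlier in this file):

    from dataclasses import dataclass, make_dataclass

    @dataclass(frozen=True)
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False
        dummy: bool = False

    def fields(raw_class):
        # Same helper as above: collect the non-dunder class attributes.
        return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

    group_dict = [
        ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
        ["Banking", ColumnContent, ColumnContent("Banking", "number", True)],
    ]
    # make_dataclass builds the class dynamically; the defaults double as metadata.
    AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", group_dict, frozen=True)

    COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
    print(COLS_GROUP)  # ['Model', 'Average ⬆️', 'Banking']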
src/envs.py  CHANGED

@@ -8,11 +8,13 @@ OWNER = "LLM-Beetle"
 REPO_ID = f"{OWNER}/frontend"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
+RESULTS_GROUP_REPO = "Emirrv/results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_GROUP_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 API = HfApi(token=TOKEN)
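The new constants pair a second results repository (RESULTS_GROUP_REPO) with a local cache path; note that EVAL_RESULTS_GROUP_PATH points at the same eval-results directory as EVAL_RESULTS_PATH, so both repos would share one folder. The download call site is not part of this diff; presumably it mirrors the template's snapshot_download usage, roughly:

    from huggingface_hub import snapshot_download

    from src.envs import EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO, TOKEN

    # Assumed mirror of the existing results download, pointed at the new repo.
    snapshot_download(
        repo_id=RESULTS_GROUP_REPO,
        local_dir=EVAL_RESULTS_GROUP_PATH,
        repo_type="dataset",
        token=TOKEN,
    )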