lukecq committed on
Commit 4ecf403 · 1 Parent(s): 0a7c2fd

update results to include SeaBench and private dataset

Files changed (3):
  1. app.py +28 -16
  2. src/display/about.py +22 -18
  3. src/leaderboard/load_results.py +57 -38
app.py CHANGED
@@ -34,12 +34,22 @@ snapshot_download(
 def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
-all_columns = ['R','type', 'Model','open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
-show_columns = ['R', 'Model','type','open?','params(B)', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', ]
-TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+all_columns = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv ⬇️', 'id-pub',
+               'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv', '#P(B)']
+show_columns = ['R', 'Model', 'type', 'open?', '#P(B)', 'avg-pub', 'avg-prv ⬇️',
+                'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
+TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+show_columns_overall = ['R', 'Model', 'type', 'open?', '#P(B)', 'SeaExam-pub', 'SeaExam-prv ⬇️',
+                        'SeaBench-pub', 'SeaBench-prv']
+TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']
+
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240808.csv'
-df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241030.csv'
+# csv_path = f'eval-results/SeaExam_results_20241030.csv'
+df = pd.read_csv(csv_path, skiprows=1, header=0)
+# df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+df_seaexam, df_seabench, df_overall = load_data(csv_path)
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -48,11 +58,12 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+
        with gr.Tab("🏅 Overall"):
            Leaderboard(
-               value=df_avg[show_columns],
+               value=df_overall[show_columns_overall],
                select_columns=SelectColumns(
-                   default_selection=show_columns,
+                   default_selection=show_columns_overall,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
@@ -63,15 +74,15 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9], label="Parameters(B)"),
                ],
-               datatype=TYPES,
-               # column_widths=["2%", "33%"],
+               datatype=TYPES_overall,
+               # column_widths=["3%", "20%", "6%", "4%"]
            )
 
-       with gr.Tab("M3Exam"):
+       with gr.Tab("SeaExam"):
            Leaderboard(
-               value=df_m3exam[show_columns],
+               value=df_seaexam[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
@@ -84,15 +95,16 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )
+
 
-       with gr.Tab("MMLU"):
+       with gr.Tab("SeaBench"):
            Leaderboard(
-               value=df_mmlu[show_columns],
+               value=df_seabench[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
@@ -105,7 +117,7 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
src/display/about.py CHANGED
@@ -16,7 +16,7 @@ class Tasks(Enum):
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">📃 SeaExam Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">📃 SeaExam and SeaBench Leaderboard</h1>"""
 
 # subtitle
 SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h1>"""
@@ -26,8 +26,12 @@ SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Sout
 # This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
 # """
 
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks - SeaExam and open-ended benchmark - SeaBench. SeaExam reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
+# """
+
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
+This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench. SeaExam assesses world knowledge and reasoning capabilities through exam-style questions, while SeaBench evaluates instruction-following abilities and multi-turn conversational skills. For detailed methodology and results, please refer to the "📝 About" tab.
 """
 
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
@@ -38,31 +42,31 @@ This leaderboard is specifically designed to evaluate large language models (LLM
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 # About
-Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for English, Chinese and SEA languages, focusing on world knowledge and reasoning abilities. The five languages for evaluation are English (en), Chinese (zh), Indonesian (id), Thai (th), and Vietnamese (vi).
+Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard includes two benchmarks, SeaExam and SeaBench, each with a public (denoted "pub") and a private (denoted "prv") dataset. SeaExam evaluates LLMs on exam-type benchmarks for SEA languages, focusing on world knowledge and reasoning abilities. SeaBench evaluates LLMs on instruction-following and multi-turn conversation skills. The three languages for evaluation are Indonesian (id), Thai (th), and Vietnamese (vi).
 
-Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
 
 ## Datasets
-The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam). The dataset consists of two tasks:
-- [**M3Exam**](https://arxiv.org/abs/2306.05179): a benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. We post-process the data for the 5 languages.
-- [**MMLU**](https://arxiv.org/abs/2009.03300): a test to measure a text model's multitask accuracy in English. The test covers 57 tasks. We sample 50 questions from each task and translate the data into the other 4 languages with google translate.
+The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and the SeaBench dataset (will be publicly available soon).
+- **SeaExam**: a benchmark sourced from real and official human exam questions, in multiple-choice format.
+- **SeaBench**: a manually created benchmark for evaluating the model's ability to follow instructions and engage in multi-turn conversations. The questions are in open-ended format.
 
-## Evalation Criteria
-We evaluate the models with accuracy score.
-
-We have the following settings for evaluation:
-- **few-shot**: the default setting is few-shot (3-shot). All open-source models are evaluated with 3-shot.
-- **zero-shot**: the zero-shot setting is also available. As closed-source models has format issues with few-shot, they are evaluated with zero-shot.
-
+## Evaluation Criteria
+- **SeaExam**:
+We evaluate the models with accuracy scores. We have the following settings for evaluation:
+  - **few-shot**: the default setting is few-shot (3-shot). All open-source models are evaluated with 3-shot.
+  - **zero-shot**: the zero-shot setting is also available. As closed-source models have format issues with few-shot, they are evaluated with zero-shot.
+
+- **SeaBench**:
+We evaluate the responses of the models with GPT-4o-2024-08-06. Each response is scored on a scale of 1-10.
 
 ## Results
 How to interpret the leaderboard?
-* Each numerical value represet the accuracy (%).
-* The "M3Exam" and "MMLU" pages show the performance of each model for that dataset.
-* The "🏅 Overall" shows the average results of "M3Exam" and "MMLU".
-* The leaderboard is ranked by avg_sea, the average score across SEA languages (id, th, and vi).
+* Each numerical value represents the accuracy (%) for SeaExam and the judge score for SeaBench.
+* The "🏅 Overall" tab shows the average results across the three languages for the SeaExam public dataset (SeaExam-pub), the SeaExam private dataset (SeaExam-prv), the SeaBench public dataset (SeaBench-pub), and the SeaBench private dataset (SeaBench-prv). This tab is ranked by SeaExam-prv.
+* The "SeaExam" and "SeaBench" tabs show the results for each language on both the public and private datasets. These tabs are ranked by avg-prv, the average score across the SEA languages (id, th, and vi) on the private set.
 * The rank is in "R" column.
-* The "params(B)" column shows the number of parameters of the model in billions.
+* The "#P(B)" column shows the number of parameters of the model in billions.
+* The "open?" column indicates whether the model is open-source or proprietary.
 
 ## Reproducibility
 To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
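The averaging described in the "Results" bullets above is simple to verify by hand; here is a tiny illustration with hypothetical numbers (the real avg-prv and SeaExam-prv values are read precomputed from the results CSV rather than recomputed in the app):

```python
# Hypothetical per-language SeaExam accuracies (%) for one model on the private set.
scores_prv = {"id-prv": 61.2, "th-prv": 55.4, "vi-prv": 63.9}

# avg-prv on the "SeaExam" tab, and SeaExam-prv on the "🏅 Overall" tab,
# are the mean of the three language scores, rounded to two decimals.
avg_prv = round(sum(scores_prv.values()) / len(scores_prv), 2)
print(avg_prv)  # 60.17
```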
src/leaderboard/load_results.py CHANGED
@@ -34,59 +34,78 @@ def make_clickable_model(model_name, link=None):
     return model_name
 
 def load_data(data_path):
-    df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
+    df = pd.read_csv(data_path, skiprows=1, header=0)
 
-    columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['R','type', 'Model','open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns = ['Model', 'type', 'open?', 'shot', 'id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']
+    columns_sorted = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv', 'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
+    columns_overall = ['Model', 'type', 'open?', 'shot', 'SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']
+    columns_overall_sorted = ['R', 'Model', 'type', 'open?', 'shot', 'SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
-    df_m3exam = df.iloc[:, :11] # M3Exam columns
-    df_mmlu = df.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]] # MMLU columns
-    df_avg = df.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]] # Average columns
-    df_mmlu.columns = columns
-    df_avg.columns = columns
+    df_seaexam = df.iloc[:, :12] # SeaExam columns
+    df_seabench = df.iloc[:, [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19]] # SeaBench columns
+    df_overall = df.iloc[:, [0, 1, 2, 3, 7, 11, 15, 19]]
 
-    # # multiply the values in the ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea'] by 100 and display as 1 decimal
-    for df_tmp in [df_m3exam, df_mmlu, df_avg]:
-        df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] *= 100
-        df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] = df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']].round(2)
+    df_seaexam.columns = columns
+    df_seabench.columns = columns
+    df_overall.columns = columns_overall
 
-    # rank the DataFrames by the 'avg_sea' column
-    df_m3exam['R'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
-    df_mmlu['R'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
-    df_avg['R'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
+    # drop rows where any of the score columns is NaN
+    df_seaexam = df_seaexam.dropna(subset=['id-pub','th-pub','vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv'])
+    df_seabench = df_seabench.dropna(subset=['id-pub','th-pub','vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv'])
+    df_overall = df_overall.dropna(subset=['SeaExam-pub', 'SeaExam-prv'])
 
-    # reorder the columns
-    df_m3exam = df_m3exam[columns_sorted]
-    df_mmlu = df_mmlu[columns_sorted]
-    df_avg = df_avg[columns_sorted]
+    # multiply the SeaExam accuracy columns by 100 and display with 2 decimals
+    for df_tmp in [df_seaexam]:
+        df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] *= 100
+        df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] = df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']].round(2)
 
-    # sort the DataFrames by the 'avg_sea' column in descending order
-    df_m3exam = df_m3exam.sort_values(by='avg_sea', ascending=False)
-    df_mmlu = df_mmlu.sort_values(by='avg_sea', ascending=False)
-    df_avg = df_avg.sort_values(by='avg_sea', ascending=False)
+    df_seabench[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] = df_seabench[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']].round(2)
 
-    # change the column name from 'avg_sea' to 'avg_sea ⬇️'
-    df_m3exam = df_m3exam.rename(columns={'avg_sea': 'avg_sea ⬇️'})
-    df_mmlu = df_mmlu.rename(columns={'avg_sea': 'avg_sea ⬇️'})
-    df_avg = df_avg.rename(columns={'avg_sea': 'avg_sea ⬇️'})
+    df_overall[['SeaExam-pub', 'SeaExam-prv', ]] *= 100
+    df_overall[['SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']] = df_overall[['SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']].round(2)
 
+    # rank the DataFrames by the 'avg-prv' column
+    df_seaexam['R'] = df_seaexam['avg-prv'].rank(ascending=False).astype(int)
+    df_seabench['R'] = df_seabench['avg-prv'].rank(ascending=False).astype(int)
+    df_overall['R'] = df_overall['SeaExam-prv'].rank(ascending=False).astype(int)
+
+    # reorder the columns
+    df_seaexam = df_seaexam[columns_sorted]
+    df_seabench = df_seabench[columns_sorted]
+    df_overall = df_overall[columns_overall_sorted]
+
+    # sort the DataFrames by the 'avg-prv' column in descending order
+    df_seaexam = df_seaexam.sort_values(by='avg-prv', ascending=False)
+    df_seabench = df_seabench.sort_values(by='avg-prv', ascending=False)
+    df_overall = df_overall.sort_values(by='SeaExam-prv', ascending=False)
+
+    # change the column name from 'avg-prv' to 'avg-prv ⬇️'
+    df_seaexam = df_seaexam.rename(columns={'avg-prv': 'avg-prv ⬇️'})
+    df_seabench = df_seabench.rename(columns={'avg-prv': 'avg-prv ⬇️'})
+    df_overall = df_overall.rename(columns={'SeaExam-prv': 'SeaExam-prv ⬇️'})
+
     # map the values in the 'type' column to the following values: {'base': 'Base', 'chat': 'Chat'}
-    df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
-    df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
-    df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_seaexam['type'] = df_seaexam['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_seabench['type'] = df_seabench['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_overall['type'] = df_overall['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
 
     # get the parameters of the models
-    df_m3exam['params(B)'] = df_m3exam['Model'].apply(get_model_size)
-    df_mmlu['params(B)'] = df_mmlu['Model'].apply(get_model_size)
-    df_avg['params(B)'] = df_avg['Model'].apply(get_model_size)
+    # df_seaexam['params(B)'] = df_seaexam['Model'].apply(get_model_size)
+    # df_seabench['params(B)'] = df_seabench['Model'].apply(get_model_size)
+    # df_overall['params(B)'] = df_overall['Model'].apply(get_model_size)
+
+    df_seaexam['#P(B)'] = df_seaexam['Model'].apply(get_model_size)
+    df_seabench['#P(B)'] = df_seabench['Model'].apply(get_model_size)
+    df_overall['#P(B)'] = df_overall['Model'].apply(get_model_size)
 
     # make the 'Model' column clickable
-    df_m3exam['Model'] = df_m3exam['Model'].apply(make_clickable_model)
-    df_mmlu['Model'] = df_mmlu['Model'].apply(make_clickable_model)
-    df_avg['Model'] = df_avg['Model'].apply(make_clickable_model)
+    df_seaexam['Model'] = df_seaexam['Model'].apply(make_clickable_model)
+    df_seabench['Model'] = df_seabench['Model'].apply(make_clickable_model)
+    df_overall['Model'] = df_overall['Model'].apply(make_clickable_model)
 
-    return df_m3exam, df_mmlu, df_avg
+    # return df_m3exam, df_mmlu, df_avg
+    return df_seaexam, df_seabench, df_overall
 
 
 if __name__ == "__main__":
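For a quick local check of the updated loader, a minimal sketch (not part of the commit; it assumes a checkout of the Space with the results CSV present, and reuses the commented-out local path from app.py above):

```python
from src.leaderboard.load_results import load_data

# Local path mirrors the commented-out alternative in app.py; adjust to your checkout.
csv_path = "eval-results/SeaExam_results_20241030.csv"

# load_data now returns three ranked DataFrames: per-language SeaExam and SeaBench
# results (public + private splits) plus the cross-benchmark overview table.
df_seaexam, df_seabench, df_overall = load_data(csv_path)

# The overview table is sorted by the private SeaExam average, matching the
# default ranking on the "🏅 Overall" tab.
print(df_overall[["R", "Model", "SeaExam-prv ⬇️", "SeaBench-prv"]].head())
```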