lukecq committed on
Commit 4ecf403 · 1 Parent(s): 0a7c2fd

update results to include SeaBench and private dataset

Files changed (3):
  1. app.py +28 -16
  2. src/display/about.py +22 -18
  3. src/leaderboard/load_results.py +57 -38
app.py CHANGED
@@ -34,12 +34,22 @@ snapshot_download(
 def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
-all_columns = ['R','type', 'Model','open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
-show_columns = ['R', 'Model','type','open?','params(B)', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', ]
-TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+all_columns = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv ⬇️', 'id-pub',
+               'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv', '#P(B)']
+show_columns = ['R', 'Model', 'type', 'open?', '#P(B)', 'avg-pub', 'avg-prv ⬇️',
+                'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
+TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+show_columns_overall = ['R', 'Model', 'type', 'open?', '#P(B)', 'SeaExam-pub', 'SeaExam-prv ⬇️',
+                        'SeaBench-pub', 'SeaBench-prv']
+TYPES_overall = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number']
+
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240808.csv'
-df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20241030.csv'
+# csv_path = f'eval-results/SeaExam_results_20241030.csv'
+df = pd.read_csv(csv_path, skiprows=1, header=0)
+# df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+df_seaexam, df_seabench, df_overall = load_data(csv_path)
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -48,11 +58,12 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+
        with gr.Tab("🏅 Overall"):
            Leaderboard(
-               value=df_avg[show_columns],
+               value=df_overall[show_columns_overall],
                select_columns=SelectColumns(
-                   default_selection=show_columns,
+                   default_selection=show_columns_overall,
                    cant_deselect=["R", "Model"],
                    label="Select Columns to Display:",
                ),
@@ -63,15 +74,15 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9], label="Parameters(B)"),
                ],
-               datatype=TYPES,
-               # column_widths=["2%", "33%"],
+               datatype=TYPES_overall,
+               # column_widths=["3%", "20%", "6%", "4%"]
            )
 
-       with gr.Tab("M3Exam"):
+       with gr.Tab("SeaExam"):
            Leaderboard(
-               value=df_m3exam[show_columns],
+               value=df_seaexam[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
@@ -84,15 +95,16 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
            )
+
 
-       with gr.Tab("MMLU"):
+       with gr.Tab("SeaBench"):
            Leaderboard(
-               value=df_mmlu[show_columns],
+               value=df_seabench[show_columns],
                select_columns=SelectColumns(
                    default_selection=show_columns,
                    cant_deselect=["R", "Model"],
@@ -105,7 +117,7 @@ with demo:
                    "open?",
                    # ColumnFilter("MOE", type="boolean", default=False, label="MoE"),
                    # ColumnFilter("Flagged", type="boolean", default=False),
-                   ColumnFilter("params(B)", default=[7, 9]),
+                   ColumnFilter("#P(B)", default=[7, 9]),
                ],
                datatype=TYPES,
                # column_widths=["2%", "33%"],
src/display/about.py CHANGED
@@ -16,7 +16,7 @@ class Tasks(Enum):
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">📃 SeaExam Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">📃 SeaExam and SeaBench Leaderboard</h1>"""
 
 # subtitle
 SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h1>"""
@@ -26,8 +26,12 @@ SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Sout
 # This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
 # """
 
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks - SeaExam and open-ended benchmark - SeaBench. SeaExam reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
+# """
+
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
+This leaderboard evaluates Large Language Models (LLMs) on Southeast Asian (SEA) languages through two comprehensive benchmarks: SeaExam and SeaBench. SeaExam assesses world knowledge and reasoning capabilities through exam-style questions, while SeaBench evaluates instruction-following abilities and multi-turn conversational skills. For detailed methodology and results, please refer to the "📝 About" tab.
 """
 
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
@@ -38,31 +42,31 @@ This leaderboard is specifically designed to evaluate large language models (LLM
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 # About
-Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for English, Chinese and SEA languages, focusing on world knowledge and reasoning abilities. The five languages for evaluation are English (en), Chinese (zh), Indonesian (id), Thai (th), and Vietnamese (vi).
+Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard includes two benchmarks, SeaExam and SeaBench, each with a public (denoted "pub") and a private (denoted "prv") dataset. SeaExam evaluates LLMs on exam-type benchmarks for SEA languages, focusing on world knowledge and reasoning abilities. SeaBench evaluates LLMs on instruction-following and multi-turn conversation skills. The three languages for evaluation are Indonesian (id), Thai (th), and Vietnamese (vi).
 
-Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
 
 ## Datasets
-The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam). The dataset consists of two tasks:
-- [**M3Exam**](https://arxiv.org/abs/2306.05179): a benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. We post-process the data for the 5 languages.
-- [**MMLU**](https://arxiv.org/abs/2009.03300): a test to measure a text model's multitask accuracy in English. The test covers 57 tasks. We sample 50 questions from each task and translate the data into the other 4 languages with google translate.
+The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam) and the SeaBench dataset (will be publicly available soon).
+- **SeaExam**: a benchmark sourced from real and official human exam questions, in multiple-choice format.
+- **SeaBench**: a manually created benchmark for evaluating the model's ability to follow instructions and engage in multi-turn conversations. The questions are in open-ended format.
 
-## Evalation Criteria
-We evaluate the models with accuracy score.
-
-We have the following settings for evaluation:
-- **few-shot**: the default setting is few-shot (3-shot). All open-source models are evaluated with 3-shot.
-- **zero-shot**: the zero-shot setting is also available. As closed-source models has format issues with few-shot, they are evaluated with zero-shot.
-
+## Evaluation Criteria
+- **SeaExam**:
+We evaluate the models with accuracy scores. We have the following settings for evaluation:
+  - **few-shot**: the default setting is few-shot (3-shot). All open-source models are evaluated with 3-shot.
+  - **zero-shot**: the zero-shot setting is also available. As closed-source models have format issues with few-shot, they are evaluated with zero-shot.
+
+- **SeaBench**:
+We evaluate the responses of the models with GPT-4o-2024-08-06. Each response is scored on a scale of 1-10.
 
 ## Results
 How to interpret the leaderboard?
-* Each numerical value represet the accuracy (%).
-* The "M3Exam" and "MMLU" pages show the performance of each model for that dataset.
-* The "🏅 Overall" shows the average results of "M3Exam" and "MMLU".
-* The leaderboard is ranked by avg_sea, the average score across SEA languages (id, th, and vi).
+* Each numerical value represents the accuracy (%) for SeaExam and the judge score for SeaBench.
+* The "🏅 Overall" tab shows the average results across the three languages for the SeaExam public dataset (SeaExam-pub), the SeaExam private dataset (SeaExam-prv), the SeaBench public dataset (SeaBench-pub), and the SeaBench private dataset (SeaBench-prv). This tab is ranked by SeaExam-prv.
+* The "SeaExam" and "SeaBench" tabs show the results for each language on both the public and private datasets. These tabs are ranked by avg-prv, the average score across the SEA languages (id, th, and vi) on the private set.
 * The rank is in "R" column.
-* The "params(B)" column shows the number of parameters of the model in billions.
+* The "#P(B)" column shows the number of parameters of the model in billions.
+* The "open?" column indicates whether the model is open-source or proprietary.
 
 ## Reproducibility
 To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
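The averaging described in the "Results" bullets above is simple to verify by hand; here is a tiny illustration with hypothetical numbers (the real avg-prv and SeaExam-prv values are read precomputed from the results CSV rather than recomputed in the app):

```python
# Hypothetical per-language SeaExam accuracies (%) for one model on the private set.
scores_prv = {"id-prv": 61.2, "th-prv": 55.4, "vi-prv": 63.9}

# avg-prv on the "SeaExam" tab, and SeaExam-prv on the "🏅 Overall" tab,
# are the mean of the three language scores, rounded to two decimals.
avg_prv = round(sum(scores_prv.values()) / len(scores_prv), 2)
print(avg_prv)  # 60.17
```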
src/leaderboard/load_results.py CHANGED
@@ -34,59 +34,78 @@ def make_clickable_model(model_name, link=None):
     return model_name
 
 def load_data(data_path):
-    df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
+    df = pd.read_csv(data_path, skiprows=1, header=0)
 
-    columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['R','type', 'Model','open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns = ['Model', 'type', 'open?', 'shot', 'id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']
+    columns_sorted = ['R', 'Model', 'type', 'open?', 'avg-pub', 'avg-prv', 'id-pub', 'th-pub', 'vi-pub', 'id-prv', 'th-prv', 'vi-prv']
+    columns_overall = ['Model', 'type', 'open?', 'shot', 'SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']
+    columns_overall_sorted = ['R', 'Model', 'type', 'open?', 'shot', 'SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
-    df_m3exam = df.iloc[:, :11] # M3Exam columns
-    df_mmlu = df.iloc[:, [0, 1, 2, 3, 11, 12, 13, 14, 15, 16, 17]] # MMLU columns
-    df_avg = df.iloc[:, [0, 1, 2, 3, 18, 19, 20, 21, 22, 23, 24]] # Average columns
-    df_mmlu.columns = columns
-    df_avg.columns = columns
+    df_seaexam = df.iloc[:, :12] # SeaExam columns
+    df_seabench = df.iloc[:, [0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19]] # SeaBench columns
+    df_overall = df.iloc[:, [0, 1, 2, 3, 7, 11, 15, 19]]
 
-    # # multiply the values in the ['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea'] by 100 and display as 1 decimal
-    for df_tmp in [df_m3exam, df_mmlu, df_avg]:
-        df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] *= 100
-        df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] = df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']].round(2)
+    df_seaexam.columns = columns
+    df_seabench.columns = columns
+    df_overall.columns = columns_overall
 
-    # rank the DataFrames by the 'avg_sea' column
-    df_m3exam['R'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
-    df_mmlu['R'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
-    df_avg['R'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
+    # drop rows where any of the score columns is NaN
+    df_seaexam = df_seaexam.dropna(subset=['id-pub','th-pub','vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv'])
+    df_seabench = df_seabench.dropna(subset=['id-pub','th-pub','vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv'])
+    df_overall = df_overall.dropna(subset=['SeaExam-pub', 'SeaExam-prv'])
 
-    # reorder the columns
-    df_m3exam = df_m3exam[columns_sorted]
-    df_mmlu = df_mmlu[columns_sorted]
-    df_avg = df_avg[columns_sorted]
+    # multiply the SeaExam accuracy columns by 100 and display with 2 decimals
+    for df_tmp in [df_seaexam]:
+        df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] *= 100
+        df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] = df_tmp[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']].round(2)
 
-    # sort the DataFrames by the 'avg_sea' column in descending order
-    df_m3exam = df_m3exam.sort_values(by='avg_sea', ascending=False)
-    df_mmlu = df_mmlu.sort_values(by='avg_sea', ascending=False)
-    df_avg = df_avg.sort_values(by='avg_sea', ascending=False)
+    df_seabench[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']] = df_seabench[['id-pub', 'th-pub', 'vi-pub', 'avg-pub', 'id-prv', 'th-prv', 'vi-prv', 'avg-prv']].round(2)
 
-    # change the column name from 'avg_sea' to 'avg_sea ⬇️'
-    df_m3exam = df_m3exam.rename(columns={'avg_sea': 'avg_sea ⬇️'})
-    df_mmlu = df_mmlu.rename(columns={'avg_sea': 'avg_sea ⬇️'})
-    df_avg = df_avg.rename(columns={'avg_sea': 'avg_sea ⬇️'})
+    df_overall[['SeaExam-pub', 'SeaExam-prv', ]] *= 100
+    df_overall[['SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']] = df_overall[['SeaExam-pub', 'SeaExam-prv', 'SeaBench-pub', 'SeaBench-prv']].round(2)
 
+    # rank the DataFrames by the 'avg-prv' column
+    df_seaexam['R'] = df_seaexam['avg-prv'].rank(ascending=False).astype(int)
+    df_seabench['R'] = df_seabench['avg-prv'].rank(ascending=False).astype(int)
+    df_overall['R'] = df_overall['SeaExam-prv'].rank(ascending=False).astype(int)
+
+    # reorder the columns
+    df_seaexam = df_seaexam[columns_sorted]
+    df_seabench = df_seabench[columns_sorted]
+    df_overall = df_overall[columns_overall_sorted]
+
+    # sort the DataFrames by the 'avg-prv' column in descending order
+    df_seaexam = df_seaexam.sort_values(by='avg-prv', ascending=False)
+    df_seabench = df_seabench.sort_values(by='avg-prv', ascending=False)
+    df_overall = df_overall.sort_values(by='SeaExam-prv', ascending=False)
+
+    # change the column name from 'avg-prv' to 'avg-prv ⬇️'
+    df_seaexam = df_seaexam.rename(columns={'avg-prv': 'avg-prv ⬇️'})
+    df_seabench = df_seabench.rename(columns={'avg-prv': 'avg-prv ⬇️'})
+    df_overall = df_overall.rename(columns={'SeaExam-prv': 'SeaExam-prv ⬇️'})
+
     # map the values in the 'type' column to the following values: {'base': 'Base', 'chat': 'Chat'}
-    df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
-    df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
-    df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_seaexam['type'] = df_seaexam['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_seabench['type'] = df_seabench['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+    df_overall['type'] = df_overall['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
 
     # get the parameters of the models
-    df_m3exam['params(B)'] = df_m3exam['Model'].apply(get_model_size)
-    df_mmlu['params(B)'] = df_mmlu['Model'].apply(get_model_size)
-    df_avg['params(B)'] = df_avg['Model'].apply(get_model_size)
+    # df_seaexam['params(B)'] = df_seaexam['Model'].apply(get_model_size)
+    # df_seabench['params(B)'] = df_seabench['Model'].apply(get_model_size)
+    # df_overall['params(B)'] = df_overall['Model'].apply(get_model_size)
+
+    df_seaexam['#P(B)'] = df_seaexam['Model'].apply(get_model_size)
+    df_seabench['#P(B)'] = df_seabench['Model'].apply(get_model_size)
+    df_overall['#P(B)'] = df_overall['Model'].apply(get_model_size)
 
     # make the 'Model' column clickable
-    df_m3exam['Model'] = df_m3exam['Model'].apply(make_clickable_model)
-    df_mmlu['Model'] = df_mmlu['Model'].apply(make_clickable_model)
-    df_avg['Model'] = df_avg['Model'].apply(make_clickable_model)
+    df_seaexam['Model'] = df_seaexam['Model'].apply(make_clickable_model)
+    df_seabench['Model'] = df_seabench['Model'].apply(make_clickable_model)
+    df_overall['Model'] = df_overall['Model'].apply(make_clickable_model)
 
-    return df_m3exam, df_mmlu, df_avg
+    # return df_m3exam, df_mmlu, df_avg
+    return df_seaexam, df_seabench, df_overall
 
 
 if __name__ == "__main__":
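For a quick local check of the updated loader, a minimal sketch (not part of the commit; it assumes a checkout of the Space with the results CSV present, and reuses the commented-out local path from app.py above):

```python
from src.leaderboard.load_results import load_data

# Local path mirrors the commented-out alternative in app.py; adjust to your checkout.
csv_path = "eval-results/SeaExam_results_20241030.csv"

# load_data now returns three ranked DataFrames: per-language SeaExam and SeaBench
# results (public + private splits) plus the cross-benchmark overview table.
df_seaexam, df_seabench, df_overall = load_data(csv_path)

# The overview table is sorted by the private SeaExam average, matching the
# default ranking on the "🏅 Overall" tab.
print(df_overall[["R", "Model", "SeaExam-prv ⬇️", "SeaBench-prv"]].head())
```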